diff options
Diffstat (limited to 'intern')
481 files changed, 33197 insertions, 42361 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt index 381248e9bf1..17096d441f0 100644 --- a/intern/cycles/CMakeLists.txt +++ b/intern/cycles/CMakeLists.txt @@ -247,7 +247,7 @@ if(WITH_CYCLES_OSL) endif() if(WITH_CYCLES_DEVICE_OPTIX) - find_package(OptiX) + find_package(OptiX 7.3.0) if(OPTIX_FOUND) add_definitions(-DWITH_OPTIX) @@ -286,11 +286,17 @@ if(WITH_OPENSUBDIV) ) endif() +if(WITH_OPENIMAGEDENOISE) + add_definitions(-DWITH_OPENIMAGEDENOISE) + add_definitions(-DOIDN_STATIC_LIB) + include_directories( + SYSTEM + ${OPENIMAGEDENOISE_INCLUDE_DIRS} + ) +endif() + if(WITH_CYCLES_STANDALONE) - set(WITH_CYCLES_DEVICE_OPENCL TRUE) set(WITH_CYCLES_DEVICE_CUDA TRUE) - # Experimental and unfinished. - set(WITH_CYCLES_NETWORK FALSE) endif() # TODO(sergey): Consider removing it, only causes confusion in interface. set(WITH_CYCLES_DEVICE_MULTI TRUE) @@ -386,18 +392,12 @@ if(WITH_CYCLES_BLENDER) add_subdirectory(blender) endif() -if(WITH_CYCLES_NETWORK) - add_definitions(-DWITH_NETWORK) -endif() - -if(WITH_CYCLES_STANDALONE OR WITH_CYCLES_NETWORK OR WITH_CYCLES_CUBIN_COMPILER) - add_subdirectory(app) -endif() - +add_subdirectory(app) add_subdirectory(bvh) add_subdirectory(device) add_subdirectory(doc) add_subdirectory(graph) +add_subdirectory(integrator) add_subdirectory(kernel) add_subdirectory(render) add_subdirectory(subd) diff --git a/intern/cycles/app/CMakeLists.txt b/intern/cycles/app/CMakeLists.txt index 7a1e5d62dd2..f9dc5f00802 100644 --- a/intern/cycles/app/CMakeLists.txt +++ b/intern/cycles/app/CMakeLists.txt @@ -91,24 +91,6 @@ if(WITH_CYCLES_STANDALONE) endif() ##################################################################### -# Cycles network server executable -##################################################################### - -if(WITH_CYCLES_NETWORK) - set(SRC - cycles_server.cpp - ) - add_executable(cycles_server ${SRC}) - target_link_libraries(cycles_server ${LIBRARIES}) - cycles_target_link_libraries(cycles_server) - - if(UNIX 
AND NOT APPLE) - set_target_properties(cycles_server PROPERTIES INSTALL_RPATH $ORIGIN/lib) - endif() - unset(SRC) -endif() - -##################################################################### # Cycles cubin compiler executable ##################################################################### diff --git a/intern/cycles/app/cycles_standalone.cpp b/intern/cycles/app/cycles_standalone.cpp index 6b3513b065a..270096d70b0 100644 --- a/intern/cycles/app/cycles_standalone.cpp +++ b/intern/cycles/app/cycles_standalone.cpp @@ -126,7 +126,7 @@ static BufferParams &session_buffer_params() static void scene_init() { - options.scene = new Scene(options.scene_params, options.session->device); + options.scene = options.session->scene; /* Read XML */ xml_read_file(options.scene, options.filepath.c_str()); @@ -148,7 +148,7 @@ static void scene_init() static void session_init() { options.session_params.write_render_cb = write_render; - options.session = new Session(options.session_params); + options.session = new Session(options.session_params, options.scene_params); if (options.session_params.background && !options.quiet) options.session->progress.set_update_callback(function_bind(&session_print_status)); @@ -159,7 +159,6 @@ static void session_init() /* load scene */ scene_init(); - options.session->scene = options.scene; options.session->reset(session_buffer_params(), options.session_params.samples); options.session->start(); @@ -527,9 +526,6 @@ static void options_parse(int argc, const char **argv) fprintf(stderr, "No file path specified\n"); exit(EXIT_FAILURE); } - - /* For smoother Viewport */ - options.session_params.start_resolution = 64; } CCL_NAMESPACE_END diff --git a/intern/cycles/app/cycles_xml.cpp b/intern/cycles/app/cycles_xml.cpp index 276d850f1b3..54f97fddbd9 100644 --- a/intern/cycles/app/cycles_xml.cpp +++ b/intern/cycles/app/cycles_xml.cpp @@ -703,7 +703,7 @@ void xml_read_file(Scene *scene, const char *filepath) xml_read_include(state, 
path_filename(filepath)); - scene->params.bvh_type = SceneParams::BVH_STATIC; + scene->params.bvh_type = BVH_TYPE_STATIC; } CCL_NAMESPACE_END diff --git a/intern/cycles/blender/CMakeLists.txt b/intern/cycles/blender/CMakeLists.txt index ee5c6157338..5bdcfd56a4d 100644 --- a/intern/cycles/blender/CMakeLists.txt +++ b/intern/cycles/blender/CMakeLists.txt @@ -33,6 +33,7 @@ set(SRC blender_device.cpp blender_image.cpp blender_geometry.cpp + blender_gpu_display.cpp blender_light.cpp blender_mesh.cpp blender_object.cpp @@ -50,6 +51,7 @@ set(SRC CCL_api.h blender_device.h + blender_gpu_display.h blender_id_map.h blender_image.h blender_object_cull.h @@ -93,14 +95,6 @@ set(ADDON_FILES add_definitions(${GL_DEFINITIONS}) -if(WITH_CYCLES_DEVICE_OPENCL) - add_definitions(-DWITH_OPENCL) -endif() - -if(WITH_CYCLES_NETWORK) - add_definitions(-DWITH_NETWORK) -endif() - if(WITH_MOD_FLUID) add_definitions(-DWITH_FLUID) endif() diff --git a/intern/cycles/blender/addon/__init__.py b/intern/cycles/blender/addon/__init__.py index f728050a3cf..1ce25a253f9 100644 --- a/intern/cycles/blender/addon/__init__.py +++ b/intern/cycles/blender/addon/__init__.py @@ -58,7 +58,6 @@ class CyclesRender(bpy.types.RenderEngine): bl_use_eevee_viewport = True bl_use_preview = True bl_use_exclude_layers = True - bl_use_save_buffers = True bl_use_spherical_stereo = True bl_use_custom_freestyle = True bl_use_alembic_procedural = True @@ -85,6 +84,12 @@ class CyclesRender(bpy.types.RenderEngine): def render(self, depsgraph): engine.render(self, depsgraph) + def render_frame_finish(self): + engine.render_frame_finish(self) + + def draw(self, context, depsgraph): + engine.draw(self, depsgraph, context.space_data) + def bake(self, depsgraph, obj, pass_type, pass_filter, width, height): engine.bake(self, depsgraph, obj, pass_type, pass_filter, width, height) @@ -98,7 +103,7 @@ class CyclesRender(bpy.types.RenderEngine): engine.sync(self, depsgraph, context.blend_data) def view_draw(self, context, depsgraph): - 
engine.draw(self, depsgraph, context.region, context.space_data, context.region_data) + engine.view_draw(self, depsgraph, context.region, context.space_data, context.region_data) def update_script_node(self, node): if engine.with_osl(): diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py index 489a883f098..e0e8ca10bef 100644 --- a/intern/cycles/blender/addon/engine.py +++ b/intern/cycles/blender/addon/engine.py @@ -18,62 +18,17 @@ from __future__ import annotations -def _is_using_buggy_driver(): - import gpu - # We need to be conservative here because in multi-GPU systems display card - # might be quite old, but others one might be just good. - # - # So We shouldn't disable possible good dedicated cards just because display - # card seems weak. And instead we only blacklist configurations which are - # proven to cause problems. - if gpu.platform.vendor_get() == "ATI Technologies Inc.": - import re - version = gpu.platform.version_get() - if version.endswith("Compatibility Profile Context"): - # Old HD 4xxx and 5xxx series drivers did not have driver version - # in the version string, but those cards do not quite work and - # causing crashes. 
- return True - regex = re.compile(".*Compatibility Profile Context ([0-9]+(\\.[0-9]+)+)$") - if not regex.match(version): - # Skip cards like FireGL - return False - version = regex.sub("\\1", version).split('.') - return int(version[0]) == 8 - return False - - -def _workaround_buggy_drivers(): - if _is_using_buggy_driver(): - import _cycles - if hasattr(_cycles, "opencl_disable"): - print("Cycles: OpenGL driver known to be buggy, disabling OpenCL platform.") - _cycles.opencl_disable() - - def _configure_argument_parser(): import argparse # No help because it conflicts with general Python scripts argument parsing parser = argparse.ArgumentParser(description="Cycles Addon argument parser", add_help=False) - parser.add_argument("--cycles-resumable-num-chunks", - help="Number of chunks to split sample range into", - default=None) - parser.add_argument("--cycles-resumable-current-chunk", - help="Current chunk of samples range to render", - default=None) - parser.add_argument("--cycles-resumable-start-chunk", - help="Start chunk to render", - default=None) - parser.add_argument("--cycles-resumable-end-chunk", - help="End chunk to render", - default=None) parser.add_argument("--cycles-print-stats", help="Print rendering statistics to stderr", action='store_true') parser.add_argument("--cycles-device", help="Set the device to use for Cycles, overriding user preferences and the scene setting." - "Valid options are 'CPU', 'CUDA', 'OPTIX' or 'OPENCL'." + "Valid options are 'CPU', 'CUDA' or 'OPTIX'." 
"Additionally, you can append '+CPU' to any GPU type for hybrid rendering.", default=None) return parser @@ -89,21 +44,6 @@ def _parse_command_line(): parser = _configure_argument_parser() args, _ = parser.parse_known_args(argv[argv.index("--") + 1:]) - if args.cycles_resumable_num_chunks is not None: - if args.cycles_resumable_current_chunk is not None: - import _cycles - _cycles.set_resumable_chunk( - int(args.cycles_resumable_num_chunks), - int(args.cycles_resumable_current_chunk), - ) - elif args.cycles_resumable_start_chunk is not None and \ - args.cycles_resumable_end_chunk: - import _cycles - _cycles.set_resumable_chunk_range( - int(args.cycles_resumable_num_chunks), - int(args.cycles_resumable_start_chunk), - int(args.cycles_resumable_end_chunk), - ) if args.cycles_print_stats: import _cycles _cycles.enable_print_stats() @@ -118,23 +58,11 @@ def init(): import _cycles import os.path - # Workaround possibly buggy legacy drivers which crashes on the OpenCL - # device enumeration. - # - # This checks are not really correct because they might still fail - # in the case of multiple GPUs. However, currently buggy drivers - # are really old and likely to be used in single GPU systems only - # anyway. - # - # Can't do it in the background mode, so we hope OpenCL is no enabled - # in the user preferences. 
- if not bpy.app.background: - _workaround_buggy_drivers() - path = os.path.dirname(__file__) user_path = os.path.dirname(os.path.abspath(bpy.utils.user_resource('CONFIG', path=''))) + temp_path = bpy.app.tempdir - _cycles.init(path, user_path, bpy.app.background) + _cycles.init(path, user_path, temp_path, bpy.app.background) _parse_command_line() @@ -177,6 +105,25 @@ def render(engine, depsgraph): _cycles.render(engine.session, depsgraph.as_pointer()) +def render_frame_finish(engine): + if not engine.session: + return + + import _cycles + _cycles.render_frame_finish(engine.session) + +def draw(engine, depsgraph, space_image): + if not engine.session: + return + + depsgraph_ptr = depsgraph.as_pointer() + space_image_ptr = space_image.as_pointer() + screen_ptr = space_image.id_data.as_pointer() + + import _cycles + _cycles.draw(engine.session, depsgraph_ptr, screen_ptr, space_image_ptr) + + def bake(engine, depsgraph, obj, pass_type, pass_filter, width, height): import _cycles session = getattr(engine, "session", None) @@ -204,14 +151,14 @@ def sync(engine, depsgraph, data): _cycles.sync(engine.session, depsgraph.as_pointer()) -def draw(engine, depsgraph, region, v3d, rv3d): +def view_draw(engine, depsgraph, region, v3d, rv3d): import _cycles depsgraph = depsgraph.as_pointer() v3d = v3d.as_pointer() rv3d = rv3d.as_pointer() # draw render image - _cycles.draw(engine.session, depsgraph, v3d, rv3d) + _cycles.view_draw(engine.session, depsgraph, v3d, rv3d) def available_devices(): @@ -224,11 +171,6 @@ def with_osl(): return _cycles.with_osl -def with_network(): - import _cycles - return _cycles.with_network - - def system_info(): import _cycles return _cycles.system_info() @@ -243,6 +185,7 @@ def list_render_passes(scene, srl): # Data passes. 
if srl.use_pass_z: yield ("Depth", "Z", 'VALUE') if srl.use_pass_mist: yield ("Mist", "Z", 'VALUE') + if srl.use_pass_position: yield ("Position", "XYZ", 'VECTOR') if srl.use_pass_normal: yield ("Normal", "XYZ", 'VECTOR') if srl.use_pass_vector: yield ("Vector", "XYZW", 'VECTOR') if srl.use_pass_uv: yield ("UV", "UVA", 'VECTOR') @@ -265,6 +208,7 @@ def list_render_passes(scene, srl): if srl.use_pass_environment: yield ("Env", "RGB", 'COLOR') if srl.use_pass_shadow: yield ("Shadow", "RGB", 'COLOR') if srl.use_pass_ambient_occlusion: yield ("AO", "RGB", 'COLOR') + if crl.use_pass_shadow_catcher: yield ("Shadow Catcher", "RGB", 'COLOR') # Debug passes. if crl.pass_debug_render_time: yield ("Debug Render Time", "X", 'VALUE') @@ -283,30 +227,20 @@ def list_render_passes(scene, srl): yield ("CryptoAsset" + '{:02d}'.format(i), "RGBA", 'COLOR') # Denoising passes. - if (scene.cycles.use_denoising and crl.use_denoising) or crl.denoising_store_passes: + if scene.cycles.use_denoising and crl.use_denoising: yield ("Noisy Image", "RGBA", 'COLOR') - if crl.denoising_store_passes: - yield ("Denoising Normal", "XYZ", 'VECTOR') - yield ("Denoising Albedo", "RGB", 'COLOR') - yield ("Denoising Depth", "Z", 'VALUE') - - if scene.cycles.denoiser == 'NLM': - yield ("Denoising Shadowing", "X", 'VALUE') - yield ("Denoising Variance", "RGB", 'COLOR') - yield ("Denoising Intensity", "X", 'VALUE') - - clean_options = ("denoising_diffuse_direct", "denoising_diffuse_indirect", - "denoising_glossy_direct", "denoising_glossy_indirect", - "denoising_transmission_direct", "denoising_transmission_indirect") - if any(getattr(crl, option) for option in clean_options): - yield ("Denoising Clean", "RGB", 'COLOR') + if crl.use_pass_shadow_catcher: + yield ("Noisy Shadow Catcher", "RGBA", 'COLOR') + if crl.denoising_store_passes: + yield ("Denoising Normal", "XYZ", 'VECTOR') + yield ("Denoising Albedo", "RGB", 'COLOR') # Custom AOV passes. 
for aov in srl.aovs: if aov.type == 'VALUE': yield (aov.name, "X", 'VALUE') else: - yield (aov.name, "RGBA", 'COLOR') + yield (aov.name, "RGB", 'COLOR') def register_passes(engine, scene, view_layer): diff --git a/intern/cycles/blender/addon/presets.py b/intern/cycles/blender/addon/presets.py index bf33e5dc010..37c39904e30 100644 --- a/intern/cycles/blender/addon/presets.py +++ b/intern/cycles/blender/addon/presets.py @@ -60,32 +60,48 @@ class AddPresetSampling(AddPresetBase, Operator): ] preset_values = [ + "cycles.use_adaptive_sampling", "cycles.samples", - "cycles.preview_samples", - "cycles.aa_samples", - "cycles.preview_aa_samples", - "cycles.diffuse_samples", - "cycles.glossy_samples", - "cycles.transmission_samples", - "cycles.ao_samples", - "cycles.mesh_light_samples", - "cycles.subsurface_samples", - "cycles.volume_samples", - "cycles.use_square_samples", - "cycles.progressive", - "cycles.seed", - "cycles.sample_clamp_direct", - "cycles.sample_clamp_indirect", - "cycles.sample_all_lights_direct", - "cycles.sample_all_lights_indirect", + "cycles.adaptive_threshold", + "cycles.adaptive_min_samples", + "cycles.time_limit", + "cycles.use_denoising", + "cycles.denoiser", + "cycles.denoising_input_passes", + "cycles.denoising_prefilter", ] preset_subdir = "cycles/sampling" +class AddPresetViewportSampling(AddPresetBase, Operator): + '''Add a Viewport Sampling Preset''' + bl_idname = "render.cycles_viewport_sampling_preset_add" + bl_label = "Add Viewport Sampling Preset" + preset_menu = "CYCLES_PT_viewport_sampling_presets" + + preset_defines = [ + "cycles = bpy.context.scene.cycles" + ] + + preset_values = [ + "cycles.use_preview_adaptive_sampling", + "cycles.preview_samples", + "cycles.preview_adaptive_threshold", + "cycles.preview_adaptive_min_samples", + "cycles.use_preview_denoising", + "cycles.preview_denoiser", + "cycles.preview_denoising_input_passes", + "cycles.preview_denoising_prefilter", + "cycles.preview_denoising_start_sample", + ] + + preset_subdir 
= "cycles/viewport_sampling" + classes = ( AddPresetIntegrator, AddPresetSampling, + AddPresetViewportSampling, ) diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index 0c3af3fabeb..c2570e71efd 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -39,11 +39,6 @@ enum_devices = ( ('GPU', "GPU Compute", "Use GPU compute device for rendering, configured in the system tab in the user preferences"), ) -from _cycles import with_network -if with_network: - enum_devices += (('NETWORK', "Networked Device", "Use networked device for rendering"),) -del with_network - enum_feature_set = ( ('SUPPORTED', "Supported", "Only use finished and supported features"), ('EXPERIMENTAL', "Experimental", "Use experimental and incomplete features that might be broken or change in the future", 'ERROR', 1), @@ -84,15 +79,6 @@ enum_curve_shape = ( ('THICK', "3D Curves", "Render hair as 3D curve, for accurate results when viewing hair close up"), ) -enum_tile_order = ( - ('CENTER', "Center", "Render from center to the edges"), - ('RIGHT_TO_LEFT', "Right to Left", "Render from right to left"), - ('LEFT_TO_RIGHT', "Left to Right", "Render from left to right"), - ('TOP_TO_BOTTOM', "Top to Bottom", "Render from top to bottom"), - ('BOTTOM_TO_TOP', "Bottom to Top", "Render from bottom to top"), - ('HILBERT_SPIRAL', "Hilbert Spiral", "Render in a Hilbert Spiral"), -) - enum_use_layer_samples = ( ('USE', "Use", "Per render layer number of samples override scene samples"), ('BOUNDED', "Bounded", "Bound per render layer number of samples by global samples"), @@ -101,15 +87,9 @@ enum_use_layer_samples = ( enum_sampling_pattern = ( ('SOBOL', "Sobol", "Use Sobol random sampling pattern"), - ('CORRELATED_MUTI_JITTER', "Correlated Multi-Jitter", "Use Correlated Multi-Jitter random sampling pattern"), ('PROGRESSIVE_MUTI_JITTER', "Progressive Multi-Jitter", "Use Progressive Multi-Jitter random sampling 
pattern"), ) -enum_integrator = ( - ('BRANCHED_PATH', "Branched Path Tracing", "Path tracing integrator that branches on the first bounce, giving more control over the number of light and material samples"), - ('PATH', "Path Tracing", "Pure path tracing integrator"), -) - enum_volume_sampling = ( ('DISTANCE', "Distance", "Use distance sampling, best for dense volumes with lights far away"), ('EQUIANGULAR', "Equiangular", "Use equiangular sampling, best for volumes with low density with light inside or near the volume"), @@ -131,7 +111,6 @@ enum_device_type = ( ('CPU', "CPU", "CPU", 0), ('CUDA', "CUDA", "CUDA", 1), ('OPTIX', "OptiX", "OptiX", 3), - ('OPENCL', "OpenCL", "OpenCL", 2) ) enum_texture_limit = ( @@ -144,39 +123,46 @@ enum_texture_limit = ( ('4096', "4096", "Limit texture size to 4096 pixels", 6), ('8192', "8192", "Limit texture size to 8192 pixels", 7), ) - + +# NOTE: Identifiers are expected to be an upper case version of identifiers from `Pass::get_type_enum()` enum_view3d_shading_render_pass = ( ('', "General", ""), - ('COMBINED', "Combined", "Show the Combined Render pass", 1), - ('EMISSION', "Emission", "Show the Emission render pass", 33), - ('BACKGROUND', "Background", "Show the Background render pass", 34), - ('AO', "Ambient Occlusion", "Show the Ambient Occlusion render pass", 35), + ('COMBINED', "Combined", "Show the Combined Render pass"), + ('EMISSION', "Emission", "Show the Emission render pass"), + ('BACKGROUND', "Background", "Show the Background render pass"), + ('AO', "Ambient Occlusion", "Show the Ambient Occlusion render pass"), + ('SHADOW', "Shadow", "Show the Shadow render pass"), + ('SHADOW_CATCHER', "Shadow Catcher", "Show the Shadow Catcher render pass"), ('', "Light", ""), - ('DIFFUSE_DIRECT', "Diffuse Direct", "Show the Diffuse Direct render pass", 38), - ('DIFFUSE_INDIRECT', "Diffuse Indirect", "Show the Diffuse Indirect render pass", 39), - ('DIFFUSE_COLOR', "Diffuse Color", "Show the Diffuse Color render pass", 40), + 
('DIFFUSE_DIRECT', "Diffuse Direct", "Show the Diffuse Direct render pass"), + ('DIFFUSE_INDIRECT', "Diffuse Indirect", "Show the Diffuse Indirect render pass"), + ('DIFFUSE_COLOR', "Diffuse Color", "Show the Diffuse Color render pass"), - ('GLOSSY_DIRECT', "Glossy Direct", "Show the Glossy Direct render pass", 41), - ('GLOSSY_INDIRECT', "Glossy Indirect", "Show the Glossy Indirect render pass", 42), - ('GLOSSY_COLOR', "Glossy Color", "Show the Glossy Color render pass", 43), + ('GLOSSY_DIRECT', "Glossy Direct", "Show the Glossy Direct render pass"), + ('GLOSSY_INDIRECT', "Glossy Indirect", "Show the Glossy Indirect render pass"), + ('GLOSSY_COLOR', "Glossy Color", "Show the Glossy Color render pass"), ('', "", ""), - ('TRANSMISSION_DIRECT', "Transmission Direct", "Show the Transmission Direct render pass", 44), - ('TRANSMISSION_INDIRECT', "Transmission Indirect", "Show the Transmission Indirect render pass", 45), - ('TRANSMISSION_COLOR', "Transmission Color", "Show the Transmission Color render pass", 46), + ('TRANSMISSION_DIRECT', "Transmission Direct", "Show the Transmission Direct render pass"), + ('TRANSMISSION_INDIRECT', "Transmission Indirect", "Show the Transmission Indirect render pass"), + ('TRANSMISSION_COLOR', "Transmission Color", "Show the Transmission Color render pass"), - ('VOLUME_DIRECT', "Volume Direct", "Show the Volume Direct render pass", 50), - ('VOLUME_INDIRECT', "Volume Indirect", "Show the Volume Indirect render pass", 51), + ('VOLUME_DIRECT', "Volume Direct", "Show the Volume Direct render pass"), + ('VOLUME_INDIRECT', "Volume Indirect", "Show the Volume Indirect render pass"), ('', "Data", ""), - ('NORMAL', "Normal", "Show the Normal render pass", 3), - ('UV', "UV", "Show the UV render pass", 4), - ('MIST', "Mist", "Show the Mist render pass", 32), + ('POSITION', "Position", "Show the Position render pass"), + ('NORMAL', "Normal", "Show the Normal render pass"), + ('UV', "UV", "Show the UV render pass"), + ('MIST', "Mist", "Show the Mist 
render pass"), + ('DENOISING_ALBEDO', "Denoising Albedo", "Albedo pass used by denoiser"), + ('DENOISING_NORMAL', "Denoising Normal", "Normal pass used by denoiser"), + ('SAMPLE_COUNT', "Sample Count", "Per-pixel number of samples"), ) @@ -208,18 +194,23 @@ def enum_preview_denoiser(self, context): def enum_denoiser(self, context): - items = [('NLM', "NLM", "Cycles native non-local means denoiser, running on any compute device", 1)] + items = [] items += enum_optix_denoiser(self, context) items += enum_openimagedenoise_denoiser(self, context) return items enum_denoising_input_passes = ( - ('RGB', "Color", "Use only color as input", 1), - ('RGB_ALBEDO', "Color + Albedo", "Use color and albedo data as input", 2), - ('RGB_ALBEDO_NORMAL', "Color + Albedo + Normal", "Use color, albedo and normal data as input", 3), + ('RGB', "None", "Don't use utility passes for denoising", 1), + ('RGB_ALBEDO', "Albedo", "Use albedo pass for denoising", 2), + ('RGB_ALBEDO_NORMAL', "Albedo and Normal", "Use albedo and normal passes for denoising", 3), ) +enum_denoising_prefilter = ( + ('NONE', "None", "No prefiltering, use when guiding passes are noise-free", 1), + ('FAST', "Fast", "Denoise color and guiding passes together. Improves quality when guiding passes are noisy using least amount of extra processing time", 2), + ('ACCURATE', "Accurate", "Prefilter noisy guiding passes before denoising color. 
Improves quality when guiding passes are noisy using extra processing time", 3), +) def update_render_passes(self, context): scene = context.scene @@ -252,13 +243,6 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): description="Use Open Shading Language (CPU rendering only)", ) - progressive: EnumProperty( - name="Integrator", - description="Method to sample lights and materials", - items=enum_integrator, - default='PATH', - ) - preview_pause: BoolProperty( name="Pause Preview", description="Pause all viewport preview renders", @@ -268,110 +252,88 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): use_denoising: BoolProperty( name="Use Denoising", description="Denoise the rendered image", - default=False, + default=True, update=update_render_passes, ) - use_preview_denoising: BoolProperty( - name="Use Viewport Denoising", - description="Denoise the image in the 3D viewport", - default=False, - ) - denoiser: EnumProperty( name="Denoiser", description="Denoise the image with the selected denoiser. " - "For denoising the image after rendering, denoising data render passes " - "also adapt to the selected denoiser", + "For denoising the image after rendering", items=enum_denoiser, - default=1, + default=4, # Use integer to avoid error in builds without OpenImageDenoise. 
update=update_render_passes, ) + denoising_prefilter: EnumProperty( + name="Denoising Prefilter", + description="Prefilter noisy guiding (albedo and normal) passes to improve denoising quality when using OpenImageDenoiser", + items=enum_denoising_prefilter, + default='ACCURATE', + ) + denoising_input_passes: EnumProperty( + name="Denoising Input Passes", + description="Passes used by the denoiser to distinguish noise from shader and geometry detail", + items=enum_denoising_input_passes, + default='RGB_ALBEDO_NORMAL', + ) + + use_preview_denoising: BoolProperty( + name="Use Viewport Denoising", + description="Denoise the image in the 3D viewport", + default=False, + ) preview_denoiser: EnumProperty( name="Viewport Denoiser", description="Denoise the image after each preview update with the selected denoiser", items=enum_preview_denoiser, default=0, ) - - use_square_samples: BoolProperty( - name="Square Samples", - description="Square sampling values for easier artist control", - default=False, + preview_denoising_prefilter: EnumProperty( + name="Viewport Denoising Prefilter", + description="Prefilter noisy guiding (albedo and normal) passes to improve denoising quality when using OpenImageDenoiser", + items=enum_denoising_prefilter, + default='FAST', + ) + preview_denoising_input_passes: EnumProperty( + name="Viewport Denoising Input Passes", + description="Passes used by the denoiser to distinguish noise from shader and geometry detail", + items=enum_denoising_input_passes, + default='RGB_ALBEDO', + ) + preview_denoising_start_sample: IntProperty( + name="Start Denoising", + description="Sample to start denoising the preview at", + min=0, max=(1 << 24), + default=1, ) samples: IntProperty( name="Samples", description="Number of samples to render for each pixel", min=1, max=(1 << 24), - default=128, + default=4096, ) preview_samples: IntProperty( name="Viewport Samples", description="Number of samples to render in the viewport, unlimited if 0", min=0, max=(1 << 24), 
- default=32, - ) - aa_samples: IntProperty( - name="AA Samples", - description="Number of antialiasing samples to render for each pixel", - min=1, max=2097151, - default=128, - ) - preview_aa_samples: IntProperty( - name="AA Samples", - description="Number of antialiasing samples to render in the viewport, unlimited if 0", - min=0, max=2097151, - default=32, + default=1024, ) - diffuse_samples: IntProperty( - name="Diffuse Samples", - description="Number of diffuse bounce samples to render for each AA sample", - min=1, max=1024, - default=1, - ) - glossy_samples: IntProperty( - name="Glossy Samples", - description="Number of glossy bounce samples to render for each AA sample", - min=1, max=1024, - default=1, - ) - transmission_samples: IntProperty( - name="Transmission Samples", - description="Number of transmission bounce samples to render for each AA sample", - min=1, max=1024, - default=1, - ) - ao_samples: IntProperty( - name="Ambient Occlusion Samples", - description="Number of ambient occlusion samples to render for each AA sample", - min=1, max=1024, - default=1, - ) - mesh_light_samples: IntProperty( - name="Mesh Light Samples", - description="Number of mesh emission light samples to render for each AA sample", - min=1, max=1024, - default=1, - ) - subsurface_samples: IntProperty( - name="Subsurface Samples", - description="Number of subsurface scattering samples to render for each AA sample", - min=1, max=1024, - default=1, - ) - volume_samples: IntProperty( - name="Volume Samples", - description="Number of volume scattering samples to render for each AA sample", - min=1, max=1024, - default=1, + time_limit: FloatProperty( + name="Time Limit", + description="Limit the render time (excluding synchronization time)." 
+ "Zero disables the limit", + min=0.0, + default=0.0, + step=100.0, + unit='TIME_ABSOLUTE', ) sampling_pattern: EnumProperty( name="Sampling Pattern", description="Random sampling pattern used by the integrator", items=enum_sampling_pattern, - default='SOBOL', + default='PROGRESSIVE_MUTI_JITTER', ) use_layer_samples: EnumProperty( @@ -381,17 +343,6 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): default='USE', ) - sample_all_lights_direct: BoolProperty( - name="Sample All Direct Lights", - description="Sample all lights (for direct samples), rather than randomly picking one", - default=True, - ) - - sample_all_lights_indirect: BoolProperty( - name="Sample All Indirect Lights", - description="Sample all lights (for indirect samples), rather than randomly picking one", - default=True, - ) light_sampling_threshold: FloatProperty( name="Light Sampling Threshold", description="Probabilistically terminate light samples when the light contribution is below this threshold (more noise but faster rendering). " @@ -403,19 +354,39 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): use_adaptive_sampling: BoolProperty( name="Use Adaptive Sampling", description="Automatically reduce the number of samples per pixel based on estimated noise level", - default=False, + default=True, ) - adaptive_threshold: FloatProperty( name="Adaptive Sampling Threshold", description="Noise level step to stop sampling at, lower values reduce noise at the cost of render time. Zero for automatic setting based on number of AA samples", min=0.0, max=1.0, - default=0.0, + soft_min=0.001, + default=0.01, precision=4, ) adaptive_min_samples: IntProperty( name="Adaptive Min Samples", - description="Minimum AA samples for adaptive sampling, to discover noisy features before stopping sampling. Zero for automatic setting based on number of AA samples", + description="Minimum AA samples for adaptive sampling, to discover noisy features before stopping sampling. 
Zero for automatic setting based on noise threshold", + min=0, max=4096, + default=0, + ) + + use_preview_adaptive_sampling: BoolProperty( + name="Use Adaptive Sampling", + description="Automatically reduce the number of samples per pixel based on estimated noise level, for viewport renders", + default=True, + ) + preview_adaptive_threshold: FloatProperty( + name="Adaptive Sampling Threshold", + description="Noise level step to stop sampling at, lower values reduce noise at the cost of render time. Zero for automatic setting based on number of AA samples, for viewport renders", + min=0.0, max=1.0, + soft_min=0.001, + default=0.1, + precision=4, + ) + preview_adaptive_min_samples: IntProperty( + name="Adaptive Min Samples", + description="Minimum AA samples for adaptive sampling, to discover noisy features before stopping sampling. Zero for automatic setting based on noise threshold, for viewport renders", min=0, max=4096, default=0, ) @@ -632,53 +603,6 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): default=10.0, ) - debug_tile_size: IntProperty( - name="Tile Size", - description="", - min=1, max=4096, - default=1024, - ) - - preview_start_resolution: IntProperty( - name="Start Resolution", - description="Resolution to start rendering preview at, " - "progressively increasing it to the full viewport size", - min=8, max=16384, - default=64, - subtype='PIXEL' - ) - preview_denoising_start_sample: IntProperty( - name="Start Denoising", - description="Sample to start denoising the preview at", - min=0, max=(1 << 24), - default=1, - ) - preview_denoising_input_passes: EnumProperty( - name="Viewport Input Passes", - description="Passes used by the denoiser to distinguish noise from shader and geometry detail", - items=enum_denoising_input_passes, - default='RGB_ALBEDO', - ) - - debug_reset_timeout: FloatProperty( - name="Reset timeout", - description="", - min=0.01, max=10.0, - default=0.1, - ) - debug_cancel_timeout: FloatProperty( - name="Cancel timeout", - 
description="", - min=0.01, max=10.0, - default=0.1, - ) - debug_text_timeout: FloatProperty( - name="Text timeout", - description="", - min=0.01, max=10.0, - default=1.0, - ) - debug_bvh_type: EnumProperty( name="Viewport BVH Type", description="Choose between faster updates, or faster render", @@ -701,38 +625,24 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): default=0, min=0, max=16, ) - tile_order: EnumProperty( - name="Tile Order", - description="Tile order for rendering", - items=enum_tile_order, - default='HILBERT_SPIRAL', - options=set(), # Not animatable! - ) - use_progressive_refine: BoolProperty( - name="Progressive Refine", - description="Instead of rendering each tile until it is finished, " - "refine the whole image progressively " - "(this renders somewhat slower, " - "but time can be saved by manually stopping the render when the noise is low enough)", - default=False, - ) bake_type: EnumProperty( name="Bake Type", default='COMBINED', description="Type of pass to bake", items=( - ('COMBINED', "Combined", ""), - ('AO', "Ambient Occlusion", ""), - ('SHADOW', "Shadow", ""), - ('NORMAL', "Normal", ""), - ('UV', "UV", ""), - ('ROUGHNESS', "Roughness", ""), - ('EMIT', "Emit", ""), - ('ENVIRONMENT', "Environment", ""), - ('DIFFUSE', "Diffuse", ""), - ('GLOSSY', "Glossy", ""), - ('TRANSMISSION', "Transmission", ""), + ('COMBINED', "Combined", "", 0), + ('AO', "Ambient Occlusion", "", 1), + ('SHADOW', "Shadow", "", 2), + ('POSITION', "Position", "", 11), + ('NORMAL', "Normal", "", 3), + ('UV', "UV", "", 4), + ('ROUGHNESS', "Roughness", "", 5), + ('EMIT', "Emit", "", 6), + ('ENVIRONMENT', "Environment", "", 7), + ('DIFFUSE', "Diffuse", "", 8), + ('GLOSSY', "Glossy", "", 9), + ('TRANSMISSION', "Transmission", "", 10), ), ) @@ -827,6 +737,18 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): min=0, max=1024, ) + use_auto_tile: BoolProperty( + name="Auto Tiles", + description="Automatically split image into tiles", + default=True, + ) + tile_size: 
IntProperty( + name="Tile Size", + default=2048, + description="", + min=0, max=16384, + ) + # Various fine-tuning debug flags def _devices_update_callback(self, context): @@ -844,45 +766,13 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): items=enum_bvh_layouts, default='EMBREE', ) - debug_use_cpu_split_kernel: BoolProperty(name="Split Kernel", default=False) debug_use_cuda_adaptive_compile: BoolProperty(name="Adaptive Compile", default=False) - debug_use_cuda_split_kernel: BoolProperty(name="Split Kernel", default=False) - - debug_optix_cuda_streams: IntProperty(name="CUDA Streams", default=1, min=1) - debug_optix_curves_api: BoolProperty(name="Native OptiX Curve Primitive", default=False) - - debug_opencl_kernel_type: EnumProperty( - name="OpenCL Kernel Type", - default='DEFAULT', - items=( - ('DEFAULT', "Default", ""), - ('MEGA', "Mega", ""), - ('SPLIT', "Split", ""), - ), - update=CyclesRenderSettings._devices_update_callback - ) - debug_opencl_device_type: EnumProperty( - name="OpenCL Device Type", - default='ALL', - items=( - ('NONE', "None", ""), - ('ALL', "All", ""), - ('DEFAULT', "Default", ""), - ('CPU', "CPU", ""), - ('GPU', "GPU", ""), - ('ACCELERATOR', "Accelerator", ""), - ), - update=CyclesRenderSettings._devices_update_callback - ) - - debug_use_opencl_debug: BoolProperty(name="Debug OpenCL", default=False) - - debug_opencl_mem_limit: IntProperty( - name="Memory limit", - default=0, - description="Artificial limit on OpenCL memory usage in MB (0 to disable limit)" + debug_use_optix_debug: BoolProperty( + name="OptiX Module Debug", + description="Load OptiX module in debug mode: lower logging verbosity level, enable validations, and lower optimization level", + default=False ) @classmethod @@ -1031,12 +921,6 @@ class CyclesLightSettings(bpy.types.PropertyGroup): description="Light casts shadows", default=True, ) - samples: IntProperty( - name="Samples", - description="Number of light samples to render for each AA sample", - min=1, max=10000, 
- default=1, - ) max_bounces: IntProperty( name="Max Bounces", description="Maximum number of bounces the light will contribute to the render", @@ -1084,12 +968,6 @@ class CyclesWorldSettings(bpy.types.PropertyGroup): min=4, max=8192, default=1024, ) - samples: IntProperty( - name="Samples", - description="Number of light samples to render for each AA sample", - min=1, max=10000, - default=1, - ) max_bounces: IntProperty( name="Max Bounces", description="Maximum number of bounces the background light will contribute to the render", @@ -1343,91 +1221,25 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup): update=update_render_passes, ) + use_pass_shadow_catcher: BoolProperty( + name="Shadow Catcher", + description="Pass containing shadows and light which is to be multiplied into backdrop", + default=False, + update=update_render_passes, + ) + use_denoising: BoolProperty( name="Use Denoising", description="Denoise the rendered image", default=True, update=update_render_passes, ) - denoising_diffuse_direct: BoolProperty( - name="Diffuse Direct", - description="Denoise the direct diffuse lighting", - default=True, - ) - denoising_diffuse_indirect: BoolProperty( - name="Diffuse Indirect", - description="Denoise the indirect diffuse lighting", - default=True, - ) - denoising_glossy_direct: BoolProperty( - name="Glossy Direct", - description="Denoise the direct glossy lighting", - default=True, - ) - denoising_glossy_indirect: BoolProperty( - name="Glossy Indirect", - description="Denoise the indirect glossy lighting", - default=True, - ) - denoising_transmission_direct: BoolProperty( - name="Transmission Direct", - description="Denoise the direct transmission lighting", - default=True, - ) - denoising_transmission_indirect: BoolProperty( - name="Transmission Indirect", - description="Denoise the indirect transmission lighting", - default=True, - ) - denoising_strength: FloatProperty( - name="Denoising Strength", - description="Controls neighbor pixel weighting 
for the denoising filter (lower values preserve more detail, but aren't as smooth)", - min=0.0, max=1.0, - default=0.5, - ) - denoising_feature_strength: FloatProperty( - name="Denoising Feature Strength", - description="Controls removal of noisy image feature passes (lower values preserve more detail, but aren't as smooth)", - min=0.0, max=1.0, - default=0.5, - ) - denoising_radius: IntProperty( - name="Denoising Radius", - description="Size of the image area that's used to denoise a pixel (higher values are smoother, but might lose detail and are slower)", - min=1, max=25, - default=8, - subtype="PIXEL", - ) - denoising_relative_pca: BoolProperty( - name="Relative Filter", - description="When removing pixels that don't carry information, use a relative threshold instead of an absolute one (can help to reduce artifacts, but might cause detail loss around edges)", - default=False, - ) denoising_store_passes: BoolProperty( name="Store Denoising Passes", description="Store the denoising feature passes and the noisy image. 
The passes adapt to the denoiser selected for rendering", default=False, update=update_render_passes, ) - denoising_neighbor_frames: IntProperty( - name="Neighbor Frames", - description="Number of neighboring frames to use for denoising animations (more frames produce smoother results at the cost of performance)", - min=0, max=7, - default=0, - ) - - denoising_optix_input_passes: EnumProperty( - name="Input Passes", - description="Passes used by the denoiser to distinguish noise from shader and geometry detail", - items=enum_denoising_input_passes, - default='RGB_ALBEDO', - ) - denoising_openimagedenoise_input_passes: EnumProperty( - name="Input Passes", - description="Passes used by the denoiser to distinguish noise from shader and geometry detail", - items=enum_denoising_input_passes, - default='RGB_ALBEDO_NORMAL', - ) @classmethod def register(cls): @@ -1454,14 +1266,12 @@ class CyclesPreferences(bpy.types.AddonPreferences): def get_device_types(self, context): import _cycles - has_cuda, has_optix, has_opencl = _cycles.get_device_types() + has_cuda, has_optix = _cycles.get_device_types() list = [('NONE', "None", "Don't use compute device", 0)] if has_cuda: list.append(('CUDA', "CUDA", "Use CUDA for GPU acceleration", 1)) if has_optix: list.append(('OPTIX', "OptiX", "Use OptiX for GPU acceleration", 3)) - if has_opencl: - list.append(('OPENCL', "OpenCL", "Use OpenCL for GPU acceleration", 2)) return list compute_device_type: EnumProperty( @@ -1486,7 +1296,7 @@ class CyclesPreferences(bpy.types.AddonPreferences): def update_device_entries(self, device_list): for device in device_list: - if not device[1] in {'CUDA', 'OPTIX', 'OPENCL', 'CPU'}: + if not device[1] in {'CUDA', 'OPTIX', 'CPU'}: continue # Try to find existing Device entry entry = self.find_existing_device_entry(device) @@ -1520,22 +1330,23 @@ class CyclesPreferences(bpy.types.AddonPreferences): elif entry.type == 'CPU': cpu_devices.append(entry) # Extend all GPU devices with CPU. 
- if compute_device_type in {'CUDA', 'OPTIX', 'OPENCL'}: + if compute_device_type != 'CPU': devices.extend(cpu_devices) return devices - # For backwards compatibility, only returns CUDA and OpenCL but still - # refreshes all devices. - def get_devices(self, compute_device_type=''): + # Refresh device list. This does not happen automatically on Blender + # startup due to unstable OpenCL implementations that can cause crashes. + def refresh_devices(self): import _cycles # Ensure `self.devices` is not re-allocated when the second call to # get_devices_for_type is made, freeing items from the first list. for device_type in ('CUDA', 'OPTIX', 'OPENCL'): self.update_device_entries(_cycles.available_devices(device_type)) - cuda_devices = self.get_devices_for_type('CUDA') - opencl_devices = self.get_devices_for_type('OPENCL') - return cuda_devices, opencl_devices + # Deprecated: use refresh_devices instead. + def get_devices(self, compute_device_type=''): + self.refresh_devices() + return None def get_num_gpu_devices(self): import _cycles @@ -1601,6 +1412,10 @@ class CyclesView3DShadingSettings(bpy.types.PropertyGroup): items=enum_view3d_shading_render_pass, default='COMBINED', ) + show_active_pixels: BoolProperty( + name="Show Active Pixels", + description="When using adaptive sampling highlight pixels which are being sampled", + ) def register(): diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index 47f7b4c6d73..d02627b9936 100644 --- a/intern/cycles/blender/addon/ui.py +++ b/intern/cycles/blender/addon/ui.py @@ -34,6 +34,12 @@ class CYCLES_PT_sampling_presets(PresetPanel, Panel): preset_add_operator = "render.cycles_sampling_preset_add" COMPAT_ENGINES = {'CYCLES'} +class CYCLES_PT_viewport_sampling_presets(PresetPanel, Panel): + bl_label = "Viewport Sampling Presets" + preset_subdir = "cycles/viewport_sampling" + preset_operator = "script.execute_preset" + preset_add_operator = "render.cycles_viewport_sampling_preset_add" + 
COMPAT_ENGINES = {'CYCLES'} class CYCLES_PT_integrator_presets(PresetPanel, Panel): bl_label = "Integrator Presets" @@ -54,6 +60,15 @@ class CyclesButtonsPanel: return context.engine in cls.COMPAT_ENGINES +class CyclesDebugButtonsPanel(CyclesButtonsPanel): + @classmethod + def poll(cls, context): + prefs = bpy.context.preferences + return (CyclesButtonsPanel.poll(context) + and prefs.experimental.use_cycles_debug + and prefs.view.show_developer_ui) + + # Adapt properties editor panel to display in node editor. We have to # copy the class rather than inherit due to the way bpy registration works. def node_panel(cls): @@ -78,12 +93,6 @@ def use_cpu(context): return (get_device_type(context) == 'NONE' or cscene.device == 'CPU') -def use_opencl(context): - cscene = context.scene.cycles - - return (get_device_type(context) == 'OPENCL' and cscene.device == 'GPU') - - def use_cuda(context): cscene = context.scene.cycles @@ -96,12 +105,6 @@ def use_optix(context): return (get_device_type(context) == 'OPTIX' and cscene.device == 'GPU') -def use_branched_path(context): - cscene = context.scene.cycles - - return (cscene.progressive == 'BRANCHED_PATH' and not use_optix(context)) - - def use_sample_all_lights(context): cscene = context.scene.cycles @@ -115,57 +118,33 @@ def show_device_active(context): return context.preferences.addons[__package__].preferences.has_active_device() -def draw_samples_info(layout, context): - cscene = context.scene.cycles - integrator = cscene.progressive +def get_effective_preview_denoiser(context): + scene = context.scene + cscene = scene.cycles + + if cscene.preview_denoiser != "AUTO": + return cscene.preview_denoiser + + if context.preferences.addons[__package__].preferences.get_devices_for_type('OPTIX'): + return 'OPTIX' + + return 'OIDN' - # Calculate sample values - if integrator == 'PATH': - aa = cscene.samples - if cscene.use_square_samples: - aa = aa * aa - else: - aa = cscene.aa_samples - d = cscene.diffuse_samples - g = 
cscene.glossy_samples - t = cscene.transmission_samples - ao = cscene.ao_samples - ml = cscene.mesh_light_samples - sss = cscene.subsurface_samples - vol = cscene.volume_samples - - if cscene.use_square_samples: - aa = aa * aa - d = d * d - g = g * g - t = t * t - ao = ao * ao - ml = ml * ml - sss = sss * sss - vol = vol * vol - - # Draw interface - # Do not draw for progressive, when Square Samples are disabled - if use_branched_path(context) or (cscene.use_square_samples and integrator == 'PATH'): - col = layout.column(align=True) - col.scale_y = 0.6 - col.label(text="Total Samples:") - col.separator() - if integrator == 'PATH': - col.label(text="%s AA" % aa) - else: - col.label(text="%s AA, %s Diffuse, %s Glossy, %s Transmission" % - (aa, d * aa, g * aa, t * aa)) - col.separator() - col.label(text="%s AO, %s Mesh Light, %s Subsurface, %s Volume" % - (ao * aa, ml * aa, sss * aa, vol * aa)) class CYCLES_RENDER_PT_sampling(CyclesButtonsPanel, Panel): bl_label = "Sampling" + def draw(self, context): + pass + + +class CYCLES_RENDER_PT_sampling_viewport(CyclesButtonsPanel, Panel): + bl_label = "Viewport" + bl_parent_id = "CYCLES_RENDER_PT_sampling" + def draw_header_preset(self, context): - CYCLES_PT_sampling_presets.draw_panel_header(self.layout) + CYCLES_PT_viewport_sampling_presets.draw_panel_header(self.layout) def draw(self, context): layout = self.layout @@ -176,29 +155,31 @@ class CYCLES_RENDER_PT_sampling(CyclesButtonsPanel, Panel): layout.use_property_split = True layout.use_property_decorate = False - if not use_optix(context): - layout.prop(cscene, "progressive") + heading = layout.column(align=True, heading="Noise Threshold") + row = heading.row(align=True) + row.prop(cscene, "use_preview_adaptive_sampling", text="") + sub = row.row() + sub.active = cscene.use_preview_adaptive_sampling + sub.prop(cscene, "preview_adaptive_threshold", text="") - if not use_branched_path(context): + if cscene.use_preview_adaptive_sampling: col = layout.column(align=True) - 
col.prop(cscene, "samples", text="Render") - col.prop(cscene, "preview_samples", text="Viewport") + col.prop(cscene, "preview_samples", text=" Max Samples") + col.prop(cscene, "preview_adaptive_min_samples", text="Min Samples") else: - col = layout.column(align=True) - col.prop(cscene, "aa_samples", text="Render") - col.prop(cscene, "preview_aa_samples", text="Viewport") + layout.prop(cscene, "preview_samples", text="Samples") - if not use_branched_path(context): - draw_samples_info(layout, context) +class CYCLES_RENDER_PT_sampling_viewport_denoise(CyclesButtonsPanel, Panel): + bl_label = "Denoise" + bl_parent_id = 'CYCLES_RENDER_PT_sampling_viewport' + bl_options = {'DEFAULT_CLOSED'} -class CYCLES_RENDER_PT_sampling_sub_samples(CyclesButtonsPanel, Panel): - bl_label = "Sub Samples" - bl_parent_id = "CYCLES_RENDER_PT_sampling" + def draw_header(self, context): + scene = context.scene + cscene = scene.cycles - @classmethod - def poll(cls, context): - return use_branched_path(context) + self.layout.prop(context.scene.cycles, "use_preview_denoising", text="") def draw(self, context): layout = self.layout @@ -208,53 +189,61 @@ class CYCLES_RENDER_PT_sampling_sub_samples(CyclesButtonsPanel, Panel): scene = context.scene cscene = scene.cycles - col = layout.column(align=True) - col.prop(cscene, "diffuse_samples", text="Diffuse") - col.prop(cscene, "glossy_samples", text="Glossy") - col.prop(cscene, "transmission_samples", text="Transmission") - col.prop(cscene, "ao_samples", text="AO") + col = layout.column() + col.active = cscene.use_preview_denoising + col.prop(cscene, "preview_denoiser", text="Denoiser") + col.prop(cscene, "preview_denoising_input_passes", text="Passes") - sub = col.row(align=True) - sub.active = use_sample_all_lights(context) - sub.prop(cscene, "mesh_light_samples", text="Mesh Light") - col.prop(cscene, "subsurface_samples", text="Subsurface") - col.prop(cscene, "volume_samples", text="Volume") + effective_preview_denoiser = 
get_effective_preview_denoiser(context) + if effective_preview_denoiser == 'OPENIMAGEDENOISE': + col.prop(cscene, "preview_denoising_prefilter", text="Prefilter") - draw_samples_info(layout, context) + col.prop(cscene, "preview_denoising_start_sample", text="Start Sample") -class CYCLES_RENDER_PT_sampling_adaptive(CyclesButtonsPanel, Panel): - bl_label = "Adaptive Sampling" +class CYCLES_RENDER_PT_sampling_render(CyclesButtonsPanel, Panel): + bl_label = "Render" bl_parent_id = "CYCLES_RENDER_PT_sampling" - bl_options = {'DEFAULT_CLOSED'} - def draw_header(self, context): - layout = self.layout - scene = context.scene - cscene = scene.cycles - - layout.prop(cscene, "use_adaptive_sampling", text="") + def draw_header_preset(self, context): + CYCLES_PT_sampling_presets.draw_panel_header(self.layout) def draw(self, context): layout = self.layout - layout.use_property_split = True - layout.use_property_decorate = False scene = context.scene cscene = scene.cycles - layout.active = cscene.use_adaptive_sampling + layout.use_property_split = True + layout.use_property_decorate = False + + heading = layout.column(align=True, heading="Noise Threshold") + row = heading.row(align=True) + row.prop(cscene, "use_adaptive_sampling", text="") + sub = row.row() + sub.active = cscene.use_adaptive_sampling + sub.prop(cscene, "adaptive_threshold", text="") col = layout.column(align=True) - col.prop(cscene, "adaptive_threshold", text="Noise Threshold") - col.prop(cscene, "adaptive_min_samples", text="Min Samples") + if cscene.use_adaptive_sampling: + col.prop(cscene, "samples", text=" Max Samples") + col.prop(cscene, "adaptive_min_samples", text="Min Samples") + else: + col.prop(cscene, "samples", text="Samples") + col.prop(cscene, "time_limit") -class CYCLES_RENDER_PT_sampling_denoising(CyclesButtonsPanel, Panel): - bl_label = "Denoising" - bl_parent_id = "CYCLES_RENDER_PT_sampling" +class CYCLES_RENDER_PT_sampling_render_denoise(CyclesButtonsPanel, Panel): + bl_label = "Denoise" + 
bl_parent_id = 'CYCLES_RENDER_PT_sampling_render' bl_options = {'DEFAULT_CLOSED'} + def draw_header(self, context): + scene = context.scene + cscene = scene.cycles + + self.layout.prop(context.scene.cycles, "use_denoising", text="") + def draw(self, context): layout = self.layout layout.use_property_split = True @@ -263,33 +252,12 @@ class CYCLES_RENDER_PT_sampling_denoising(CyclesButtonsPanel, Panel): scene = context.scene cscene = scene.cycles - heading = layout.column(align=True, heading="Render") - row = heading.row(align=True) - row.prop(cscene, "use_denoising", text="") - sub = row.row() - - sub.active = cscene.use_denoising - for view_layer in scene.view_layers: - if view_layer.cycles.denoising_store_passes: - sub.active = True - - sub.prop(cscene, "denoiser", text="") - - layout.separator() - - heading = layout.column(align=False, heading="Viewport") - row = heading.row(align=True) - row.prop(cscene, "use_preview_denoising", text="") - sub = row.row() - sub.active = cscene.use_preview_denoising - sub.prop(cscene, "preview_denoiser", text="") - - sub = heading.row(align=True) - sub.active = cscene.use_preview_denoising - sub.prop(cscene, "preview_denoising_start_sample", text="Start Sample") - sub = heading.row(align=True) - sub.active = cscene.use_preview_denoising - sub.prop(cscene, "preview_denoising_input_passes", text="Input Passes") + col = layout.column() + col.active = cscene.use_denoising + col.prop(cscene, "denoiser", text="Denoiser") + col.prop(cscene, "denoising_input_passes", text="Passes") + if cscene.denoiser == 'OPENIMAGEDENOISE': + col.prop(cscene, "denoising_prefilter", text="Prefilter") class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel): @@ -313,8 +281,6 @@ class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel): col.active = not(cscene.use_adaptive_sampling) col.prop(cscene, "sampling_pattern", text="Pattern") - layout.prop(cscene, "use_square_samples") - layout.separator() col = layout.column(align=True) @@ 
-322,11 +288,6 @@ class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel): col.prop(cscene, "min_transparent_bounces") col.prop(cscene, "light_sampling_threshold", text="Light Threshold") - if cscene.progressive != 'PATH' and use_branched_path(context): - col = layout.column(align=True) - col.prop(cscene, "sample_all_lights_direct") - col.prop(cscene, "sample_all_lights_indirect") - for view_layer in scene.view_layers: if view_layer.samples > 0: layout.separator() @@ -334,62 +295,6 @@ class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel): break -class CYCLES_RENDER_PT_sampling_total(CyclesButtonsPanel, Panel): - bl_label = "Total Samples" - bl_parent_id = "CYCLES_RENDER_PT_sampling" - - @classmethod - def poll(cls, context): - scene = context.scene - cscene = scene.cycles - - if cscene.use_square_samples: - return True - - return cscene.progressive != 'PATH' and use_branched_path(context) - - def draw(self, context): - layout = self.layout - cscene = context.scene.cycles - integrator = cscene.progressive - - # Calculate sample values - if integrator == 'PATH': - aa = cscene.samples - if cscene.use_square_samples: - aa = aa * aa - else: - aa = cscene.aa_samples - d = cscene.diffuse_samples - g = cscene.glossy_samples - t = cscene.transmission_samples - ao = cscene.ao_samples - ml = cscene.mesh_light_samples - sss = cscene.subsurface_samples - vol = cscene.volume_samples - - if cscene.use_square_samples: - aa = aa * aa - d = d * d - g = g * g - t = t * t - ao = ao * ao - ml = ml * ml - sss = sss * sss - vol = vol * vol - - col = layout.column(align=True) - col.scale_y = 0.6 - if integrator == 'PATH': - col.label(text="%s AA" % aa) - else: - col.label(text="%s AA, %s Diffuse, %s Glossy, %s Transmission" % - (aa, d * aa, g * aa, t * aa)) - col.separator() - col.label(text="%s AO, %s Mesh Light, %s Subsurface, %s Volume" % - (ao * aa, ml * aa, sss * aa, vol * aa)) - - class CYCLES_RENDER_PT_subdivision(CyclesButtonsPanel, Panel): bl_label = 
"Subdivision" bl_options = {'DEFAULT_CLOSED'} @@ -548,6 +453,8 @@ class CYCLES_RENDER_PT_light_paths_fast_gi(CyclesButtonsPanel, Panel): layout.use_property_split = True layout.use_property_decorate = False + layout.active = cscene.use_fast_gi + col = layout.column(align=True) col.prop(cscene, "ao_bounces", text="Viewport Bounces") col.prop(cscene, "ao_bounces_render", text="Render Bounces") @@ -716,19 +623,13 @@ class CYCLES_RENDER_PT_performance_tiles(CyclesButtonsPanel, Panel): layout.use_property_decorate = False scene = context.scene - rd = scene.render cscene = scene.cycles col = layout.column() - - sub = col.column(align=True) - sub.prop(rd, "tile_x", text="Tiles X") - sub.prop(rd, "tile_y", text="Y") - col.prop(cscene, "tile_order", text="Order") - + col.prop(cscene, "use_auto_tile") sub = col.column() - sub.active = not rd.use_save_buffers and not cscene.use_adaptive_sampling - sub.prop(cscene, "use_progressive_refine") + sub.active = cscene.use_auto_tile + sub.prop(cscene, "tile_size") class CYCLES_RENDER_PT_performance_acceleration_structure(CyclesButtonsPanel, Panel): @@ -778,7 +679,6 @@ class CYCLES_RENDER_PT_performance_final_render(CyclesButtonsPanel, Panel): col = layout.column() - col.prop(rd, "use_save_buffers") col.prop(rd, "use_persistent_data", text="Persistent Data") @@ -797,7 +697,6 @@ class CYCLES_RENDER_PT_performance_viewport(CyclesButtonsPanel, Panel): col = layout.column() col.prop(rd, "preview_pixel_size", text="Pixel Size") - col.prop(cscene, "preview_start_resolution", text="Start Pixels") class CYCLES_RENDER_PT_filter(CyclesButtonsPanel, Panel): @@ -818,7 +717,6 @@ class CYCLES_RENDER_PT_filter(CyclesButtonsPanel, Panel): col = layout.column(heading="Include") col.prop(view_layer, "use_sky", text="Environment") - col.prop(view_layer, "use_ao", text="Ambient Occlusion") col.prop(view_layer, "use_solid", text="Surfaces") col.prop(view_layer, "use_strand", text="Hair") col.prop(view_layer, "use_volumes", text="Volumes") @@ -827,6 +725,9 
@@ class CYCLES_RENDER_PT_filter(CyclesButtonsPanel, Panel): sub = col.row() sub.prop(view_layer, "use_motion_blur", text="Motion Blur") sub.active = rd.use_motion_blur + sub = col.row() + sub.prop(view_layer.cycles, 'use_denoising', text='Denoising') + sub.active = scene.cycles.use_denoising class CYCLES_RENDER_PT_override(CyclesButtonsPanel, Panel): @@ -872,6 +773,7 @@ class CYCLES_RENDER_PT_passes_data(CyclesButtonsPanel, Panel): col.prop(view_layer, "use_pass_combined") col.prop(view_layer, "use_pass_z") col.prop(view_layer, "use_pass_mist") + col.prop(view_layer, "use_pass_position") col.prop(view_layer, "use_pass_normal") sub = col.column() sub.active = not rd.use_motion_blur @@ -928,6 +830,7 @@ class CYCLES_RENDER_PT_passes_light(CyclesButtonsPanel, Panel): col.prop(view_layer, "use_pass_environment") col.prop(view_layer, "use_pass_shadow") col.prop(view_layer, "use_pass_ambient_occlusion", text="Ambient Occlusion") + col.prop(cycles_view_layer, "use_pass_shadow_catcher") class CYCLES_RENDER_PT_passes_crypto(CyclesButtonsPanel, ViewLayerCryptomattePanel, Panel): @@ -942,70 +845,6 @@ class CYCLES_RENDER_PT_passes_aov(CyclesButtonsPanel, ViewLayerAOVPanel): bl_parent_id = "CYCLES_RENDER_PT_passes" -class CYCLES_RENDER_PT_denoising(CyclesButtonsPanel, Panel): - bl_label = "Denoising" - bl_context = "view_layer" - bl_options = {'DEFAULT_CLOSED'} - - @classmethod - def poll(cls, context): - cscene = context.scene.cycles - return CyclesButtonsPanel.poll(context) and cscene.use_denoising - - def draw_header(self, context): - scene = context.scene - view_layer = context.view_layer - cycles_view_layer = view_layer.cycles - - layout = self.layout - layout.prop(cycles_view_layer, "use_denoising", text="") - - def draw(self, context): - layout = self.layout - layout.use_property_split = True - layout.use_property_decorate = False - - scene = context.scene - view_layer = context.view_layer - cycles_view_layer = view_layer.cycles - denoiser = scene.cycles.denoiser - - 
layout.active = denoiser != 'NONE' and cycles_view_layer.use_denoising - - col = layout.column() - - if denoiser == 'OPTIX': - col.prop(cycles_view_layer, "denoising_optix_input_passes") - return - elif denoiser == 'OPENIMAGEDENOISE': - col.prop(cycles_view_layer, "denoising_openimagedenoise_input_passes") - return - - col.prop(cycles_view_layer, "denoising_radius", text="Radius") - - col = layout.column() - col.prop(cycles_view_layer, "denoising_strength", slider=True, text="Strength") - col.prop(cycles_view_layer, "denoising_feature_strength", slider=True, text="Feature Strength") - col.prop(cycles_view_layer, "denoising_relative_pca") - - layout.separator() - - col = layout.column() - col.active = cycles_view_layer.use_denoising or cycles_view_layer.denoising_store_passes - - row = col.row(heading="Diffuse", align=True) - row.prop(cycles_view_layer, "denoising_diffuse_direct", text="Direct", toggle=True) - row.prop(cycles_view_layer, "denoising_diffuse_indirect", text="Indirect", toggle=True) - - row = col.row(heading="Glossy", align=True) - row.prop(cycles_view_layer, "denoising_glossy_direct", text="Direct", toggle=True) - row.prop(cycles_view_layer, "denoising_glossy_indirect", text="Indirect", toggle=True) - - row = col.row(heading="Transmission", align=True) - row.prop(cycles_view_layer, "denoising_transmission_direct", text="Direct", toggle=True) - row.prop(cycles_view_layer, "denoising_transmission_indirect", text="Indirect", toggle=True) - - class CYCLES_PT_post_processing(CyclesButtonsPanel, Panel): bl_label = "Post Processing" bl_options = {'DEFAULT_CLOSED'} @@ -1417,10 +1256,6 @@ class CYCLES_LIGHT_PT_light(CyclesButtonsPanel, Panel): if not (light.type == 'AREA' and clamp.is_portal): sub = col.column() - if use_branched_path(context): - subsub = sub.row(align=True) - subsub.active = use_sample_all_lights(context) - subsub.prop(clamp, "samples") sub.prop(clamp, "max_bounces") sub = col.column(align=True) @@ -1526,34 +1361,6 @@ class 
CYCLES_WORLD_PT_volume(CyclesButtonsPanel, Panel): panel_node_draw(layout, world, 'OUTPUT_WORLD', 'Volume') -class CYCLES_WORLD_PT_ambient_occlusion(CyclesButtonsPanel, Panel): - bl_label = "Ambient Occlusion" - bl_context = "world" - bl_options = {'DEFAULT_CLOSED'} - - @classmethod - def poll(cls, context): - return context.world and CyclesButtonsPanel.poll(context) - - def draw_header(self, context): - light = context.world.light_settings - self.layout.prop(light, "use_ambient_occlusion", text="") - - def draw(self, context): - layout = self.layout - layout.use_property_split = True - layout.use_property_decorate = False - - light = context.world.light_settings - scene = context.scene - - col = layout.column() - sub = col.column() - sub.active = light.use_ambient_occlusion or scene.render.use_simplify - sub.prop(light, "ao_factor", text="Factor") - col.prop(light, "distance", text="Distance") - - class CYCLES_WORLD_PT_mist(CyclesButtonsPanel, Panel): bl_label = "Mist Pass" bl_context = "world" @@ -1650,10 +1457,6 @@ class CYCLES_WORLD_PT_settings_surface(CyclesButtonsPanel, Panel): subsub = sub.row(align=True) subsub.active = cworld.sampling_method == 'MANUAL' subsub.prop(cworld, "sample_map_resolution") - if use_branched_path(context): - subsub = sub.column(align=True) - subsub.active = use_sample_all_lights(context) - subsub.prop(cworld, "samples") sub.prop(cworld, "max_bounces") @@ -1677,8 +1480,7 @@ class CYCLES_WORLD_PT_settings_volume(CyclesButtonsPanel, Panel): col = layout.column() sub = col.column() - sub.active = use_cpu(context) - sub.prop(cworld, "volume_sampling", text="Sampling") + col.prop(cworld, "volume_sampling", text="Sampling") col.prop(cworld, "volume_interpolation", text="Interpolation") col.prop(cworld, "homogeneous_volume", text="Homogeneous") sub = col.column() @@ -1817,8 +1619,7 @@ class CYCLES_MATERIAL_PT_settings_volume(CyclesButtonsPanel, Panel): col = layout.column() sub = col.column() - sub.active = use_cpu(context) - sub.prop(cmat, 
"volume_sampling", text="Sampling") + col.prop(cmat, "volume_sampling", text="Sampling") col.prop(cmat, "volume_interpolation", text="Interpolation") col.prop(cmat, "homogeneous_volume", text="Homogeneous") sub = col.column() @@ -1845,9 +1646,6 @@ class CYCLES_RENDER_PT_bake(CyclesButtonsPanel, Panel): cbk = scene.render.bake rd = scene.render - if use_optix(context): - layout.label(text="Baking is performed using CUDA instead of OptiX", icon='INFO') - if rd.use_bake_multires: layout.operator("object.bake_image", icon='RENDER_STILL') layout.prop(rd, "use_bake_multires") @@ -1905,7 +1703,6 @@ class CYCLES_RENDER_PT_bake_influence(CyclesButtonsPanel, Panel): col.prop(cbk, "use_pass_diffuse") col.prop(cbk, "use_pass_glossy") col.prop(cbk, "use_pass_transmission") - col.prop(cbk, "use_pass_ambient_occlusion") col.prop(cbk, "use_pass_emit") elif cscene.bake_type in {'DIFFUSE', 'GLOSSY', 'TRANSMISSION'}: @@ -1989,19 +1786,12 @@ class CYCLES_RENDER_PT_bake_output(CyclesButtonsPanel, Panel): layout.prop(cbk, "use_clear", text="Clear Image") -class CYCLES_RENDER_PT_debug(CyclesButtonsPanel, Panel): +class CYCLES_RENDER_PT_debug(CyclesDebugButtonsPanel, Panel): bl_label = "Debug" bl_context = "render" bl_options = {'DEFAULT_CLOSED'} COMPAT_ENGINES = {'CYCLES'} - @classmethod - def poll(cls, context): - prefs = bpy.context.preferences - return (CyclesButtonsPanel.poll(context) - and prefs.experimental.use_cycles_debug - and prefs.view.show_developer_ui) - def draw(self, context): layout = self.layout @@ -2018,29 +1808,18 @@ class CYCLES_RENDER_PT_debug(CyclesButtonsPanel, Panel): row.prop(cscene, "debug_use_cpu_avx", toggle=True) row.prop(cscene, "debug_use_cpu_avx2", toggle=True) col.prop(cscene, "debug_bvh_layout") - col.prop(cscene, "debug_use_cpu_split_kernel") col.separator() col = layout.column() col.label(text="CUDA Flags:") col.prop(cscene, "debug_use_cuda_adaptive_compile") - col.prop(cscene, "debug_use_cuda_split_kernel") col.separator() col = layout.column() 
col.label(text="OptiX Flags:") - col.prop(cscene, "debug_optix_cuda_streams") - col.prop(cscene, "debug_optix_curves_api") - - col.separator() - - col = layout.column() - col.label(text="OpenCL Flags:") - col.prop(cscene, "debug_opencl_device_type", text="Device") - col.prop(cscene, "debug_use_opencl_debug", text="Debug") - col.prop(cscene, "debug_opencl_mem_limit") + col.prop(cscene, "debug_use_optix_debug") col.separator() @@ -2141,20 +1920,22 @@ class CYCLES_RENDER_PT_simplify_culling(CyclesButtonsPanel, Panel): sub.prop(cscene, "distance_cull_margin", text="") -class CYCLES_VIEW3D_PT_shading_render_pass(Panel): +class CyclesShadingButtonsPanel(CyclesButtonsPanel): bl_space_type = 'VIEW_3D' bl_region_type = 'HEADER' - bl_label = "Render Pass" bl_parent_id = 'VIEW3D_PT_shading' - COMPAT_ENGINES = {'CYCLES'} @classmethod def poll(cls, context): return ( - context.engine in cls.COMPAT_ENGINES and + CyclesButtonsPanel.poll(context) and context.space_data.shading.type == 'RENDERED' ) + +class CYCLES_VIEW3D_PT_shading_render_pass(CyclesShadingButtonsPanel, Panel): + bl_label = "Render Pass" + def draw(self, context): shading = context.space_data.shading @@ -2162,6 +1943,26 @@ class CYCLES_VIEW3D_PT_shading_render_pass(Panel): layout.prop(shading.cycles, "render_pass", text="") +class CYCLES_VIEW3D_PT_shading_debug(CyclesDebugButtonsPanel, + CyclesShadingButtonsPanel, + Panel): + bl_label = "Debug" + + @classmethod + def poll(cls, context): + return ( + CyclesDebugButtonsPanel.poll(context) and + CyclesShadingButtonsPanel.poll(context) + ) + + def draw(self, context): + shading = context.space_data.shading + + layout = self.layout + layout.active = context.scene.cycles.use_preview_adaptive_sampling + layout.prop(shading.cycles, "show_active_pixels") + + class CYCLES_VIEW3D_PT_shading_lighting(Panel): bl_space_type = 'VIEW_3D' bl_region_type = 'HEADER' @@ -2275,11 +2076,13 @@ def get_panels(): classes = ( CYCLES_PT_sampling_presets, + 
CYCLES_PT_viewport_sampling_presets, CYCLES_PT_integrator_presets, CYCLES_RENDER_PT_sampling, - CYCLES_RENDER_PT_sampling_sub_samples, - CYCLES_RENDER_PT_sampling_adaptive, - CYCLES_RENDER_PT_sampling_denoising, + CYCLES_RENDER_PT_sampling_viewport, + CYCLES_RENDER_PT_sampling_viewport_denoise, + CYCLES_RENDER_PT_sampling_render, + CYCLES_RENDER_PT_sampling_render_denoise, CYCLES_RENDER_PT_sampling_advanced, CYCLES_RENDER_PT_light_paths, CYCLES_RENDER_PT_light_paths_max_bounces, @@ -2296,6 +2099,7 @@ classes = ( CYCLES_VIEW3D_PT_simplify_greasepencil, CYCLES_VIEW3D_PT_shading_lighting, CYCLES_VIEW3D_PT_shading_render_pass, + CYCLES_VIEW3D_PT_shading_debug, CYCLES_RENDER_PT_motion_blur, CYCLES_RENDER_PT_motion_blur_curve, CYCLES_RENDER_PT_film, @@ -2314,7 +2118,6 @@ classes = ( CYCLES_RENDER_PT_passes_aov, CYCLES_RENDER_PT_filter, CYCLES_RENDER_PT_override, - CYCLES_RENDER_PT_denoising, CYCLES_PT_post_processing, CYCLES_CAMERA_PT_dof, CYCLES_CAMERA_PT_dof_aperture, @@ -2333,7 +2136,6 @@ classes = ( CYCLES_WORLD_PT_preview, CYCLES_WORLD_PT_surface, CYCLES_WORLD_PT_volume, - CYCLES_WORLD_PT_ambient_occlusion, CYCLES_WORLD_PT_mist, CYCLES_WORLD_PT_ray_visibility, CYCLES_WORLD_PT_settings, diff --git a/intern/cycles/blender/addon/version_update.py b/intern/cycles/blender/addon/version_update.py index 827f84b9873..57da7d7995c 100644 --- a/intern/cycles/blender/addon/version_update.py +++ b/intern/cycles/blender/addon/version_update.py @@ -109,7 +109,7 @@ def do_versions(self): library_versions.setdefault(library.version, []).append(library) # Do versioning per library, since they might have different versions. 
- max_need_versioning = (2, 93, 7) + max_need_versioning = (3, 0, 25) for version, libraries in library_versions.items(): if version > max_need_versioning: continue @@ -166,10 +166,6 @@ def do_versions(self): if not cscene.is_property_set("filter_type"): cscene.pixel_filter_type = 'GAUSSIAN' - # Tile Order - if not cscene.is_property_set("tile_order"): - cscene.tile_order = 'CENTER' - if version <= (2, 76, 10): cscene = scene.cycles if cscene.is_property_set("filter_type"): @@ -186,10 +182,6 @@ def do_versions(self): if version <= (2, 79, 0): cscene = scene.cycles # Default changes - if not cscene.is_property_set("aa_samples"): - cscene.aa_samples = 4 - if not cscene.is_property_set("preview_aa_samples"): - cscene.preview_aa_samples = 4 if not cscene.is_property_set("blur_glossy"): cscene.blur_glossy = 0.0 if not cscene.is_property_set("sample_clamp_indirect"): @@ -203,7 +195,6 @@ def do_versions(self): view_layer.use_pass_cryptomatte_material = cview_layer.get("use_pass_crypto_material", False) view_layer.use_pass_cryptomatte_asset = cview_layer.get("use_pass_crypto_asset", False) view_layer.pass_cryptomatte_depth = cview_layer.get("pass_crypto_depth", 6) - view_layer.use_pass_cryptomatte_accurate = cview_layer.get("pass_crypto_accurate", True) if version <= (2, 93, 7): if scene.render.engine == 'CYCLES': @@ -229,6 +220,35 @@ def do_versions(self): cscene.ao_bounces = 1 cscene.ao_bounces_render = 1 + if version <= (3, 0, 25): + cscene = scene.cycles + + # Default changes. 
+ if not cscene.is_property_set("samples"): + cscene.samples = 128 + if not cscene.is_property_set("preview_samples"): + cscene.preview_samples = 32 + if not cscene.is_property_set("use_adaptive_sampling"): + cscene.use_adaptive_sampling = False + cscene.use_preview_adaptive_sampling = False + if not cscene.is_property_set("use_denoising"): + cscene.use_denoising = False + if not cscene.is_property_set("use_preview_denoising"): + cscene.use_preview_denoising = False + if not cscene.is_property_set("sampling_pattern"): + cscene.sampling_pattern = 'PROGRESSIVE_MUTI_JITTER' + + # Removal of square samples. + cscene = scene.cycles + use_square_samples = cscene.get("use_square_samples", False) + + if use_square_samples: + cscene.samples *= cscene.samples + cscene.preview_samples *= cscene.preview_samples + for layer in scene.view_layers: + layer.samples *= layer.samples + cscene["use_square_samples"] = False + # Lamps for light in bpy.data.lights: if light.library not in libraries: @@ -249,10 +269,6 @@ def do_versions(self): if version <= (2, 76, 9): cworld = world.cycles - # World MIS Samples - if not cworld.is_property_set("samples"): - cworld.samples = 4 - # World MIS Resolution if not cworld.is_property_set("sample_map_resolution"): cworld.sample_map_resolution = 256 diff --git a/intern/cycles/blender/blender_camera.cpp b/intern/cycles/blender/blender_camera.cpp index 6954c5c2f26..4e8df5a99a6 100644 --- a/intern/cycles/blender/blender_camera.cpp +++ b/intern/cycles/blender/blender_camera.cpp @@ -894,12 +894,8 @@ void BlenderSync::sync_view(BL::SpaceView3D &b_v3d, } } -BufferParams BlenderSync::get_buffer_params(BL::SpaceView3D &b_v3d, - BL::RegionView3D &b_rv3d, - Camera *cam, - int width, - int height, - const bool use_denoiser) +BufferParams BlenderSync::get_buffer_params( + BL::SpaceView3D &b_v3d, BL::RegionView3D &b_rv3d, Camera *cam, int width, int height) { BufferParams params; bool use_border = false; @@ -931,11 +927,6 @@ BufferParams 
BlenderSync::get_buffer_params(BL::SpaceView3D &b_v3d, params.height = height; } - PassType display_pass = update_viewport_display_passes(b_v3d, params.passes); - - /* Can only denoise the combined image pass */ - params.denoising_data_pass = display_pass == PASS_COMBINED && use_denoiser; - return params; } diff --git a/intern/cycles/blender/blender_device.cpp b/intern/cycles/blender/blender_device.cpp index d51b31de638..ce1770f18a3 100644 --- a/intern/cycles/blender/blender_device.cpp +++ b/intern/cycles/blender/blender_device.cpp @@ -25,7 +25,6 @@ CCL_NAMESPACE_BEGIN enum ComputeDevice { COMPUTE_DEVICE_CPU = 0, COMPUTE_DEVICE_CUDA = 1, - COMPUTE_DEVICE_OPENCL = 2, COMPUTE_DEVICE_OPTIX = 3, COMPUTE_DEVICE_NUM @@ -68,13 +67,6 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen device = Device::get_multi_device(devices, threads, background); } } - else if (get_enum(cscene, "device") == 2) { - /* Find network device. */ - vector<DeviceInfo> devices = Device::available_devices(DEVICE_MASK_NETWORK); - if (!devices.empty()) { - device = devices.front(); - } - } else if (get_enum(cscene, "device") == 1) { /* Test if we are using GPU devices. */ ComputeDevice compute_device = (ComputeDevice)get_enum( @@ -89,9 +81,6 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen else if (compute_device == COMPUTE_DEVICE_OPTIX) { mask |= DEVICE_MASK_OPTIX; } - else if (compute_device == COMPUTE_DEVICE_OPENCL) { - mask |= DEVICE_MASK_OPENCL; - } vector<DeviceInfo> devices = Device::available_devices(mask); /* Match device preferences and available devices. 
*/ diff --git a/intern/cycles/blender/blender_gpu_display.cpp b/intern/cycles/blender/blender_gpu_display.cpp new file mode 100644 index 00000000000..a79232af71f --- /dev/null +++ b/intern/cycles/blender/blender_gpu_display.cpp @@ -0,0 +1,761 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "blender/blender_gpu_display.h" + +#include "device/device.h" +#include "util/util_logging.h" +#include "util/util_opengl.h" + +extern "C" { +struct RenderEngine; + +bool RE_engine_has_render_context(struct RenderEngine *engine); +void RE_engine_render_context_enable(struct RenderEngine *engine); +void RE_engine_render_context_disable(struct RenderEngine *engine); + +bool DRW_opengl_context_release(); +void DRW_opengl_context_activate(bool drw_state); + +void *WM_opengl_context_create(); +void WM_opengl_context_activate(void *gl_context); +void WM_opengl_context_dispose(void *gl_context); +void WM_opengl_context_release(void *context); +} + +CCL_NAMESPACE_BEGIN + +/* -------------------------------------------------------------------- + * BlenderDisplayShader. 
+ */ + +unique_ptr<BlenderDisplayShader> BlenderDisplayShader::create(BL::RenderEngine &b_engine, + BL::Scene &b_scene) +{ + if (b_engine.support_display_space_shader(b_scene)) { + return make_unique<BlenderDisplaySpaceShader>(b_engine, b_scene); + } + + return make_unique<BlenderFallbackDisplayShader>(); +} + +int BlenderDisplayShader::get_position_attrib_location() +{ + if (position_attribute_location_ == -1) { + const uint shader_program = get_shader_program(); + position_attribute_location_ = glGetAttribLocation(shader_program, position_attribute_name); + } + return position_attribute_location_; +} + +int BlenderDisplayShader::get_tex_coord_attrib_location() +{ + if (tex_coord_attribute_location_ == -1) { + const uint shader_program = get_shader_program(); + tex_coord_attribute_location_ = glGetAttribLocation(shader_program, tex_coord_attribute_name); + } + return tex_coord_attribute_location_; +} + +/* -------------------------------------------------------------------- + * BlenderFallbackDisplayShader. + */ + +/* TODO move shaders to standalone .glsl file. 
*/ +static const char *FALLBACK_VERTEX_SHADER = + "#version 330\n" + "uniform vec2 fullscreen;\n" + "in vec2 texCoord;\n" + "in vec2 pos;\n" + "out vec2 texCoord_interp;\n" + "\n" + "vec2 normalize_coordinates()\n" + "{\n" + " return (vec2(2.0) * (pos / fullscreen)) - vec2(1.0);\n" + "}\n" + "\n" + "void main()\n" + "{\n" + " gl_Position = vec4(normalize_coordinates(), 0.0, 1.0);\n" + " texCoord_interp = texCoord;\n" + "}\n\0"; + +static const char *FALLBACK_FRAGMENT_SHADER = + "#version 330\n" + "uniform sampler2D image_texture;\n" + "in vec2 texCoord_interp;\n" + "out vec4 fragColor;\n" + "\n" + "void main()\n" + "{\n" + " fragColor = texture(image_texture, texCoord_interp);\n" + "}\n\0"; + +static void shader_print_errors(const char *task, const char *log, const char *code) +{ + LOG(ERROR) << "Shader: " << task << " error:"; + LOG(ERROR) << "===== shader string ===="; + + stringstream stream(code); + string partial; + + int line = 1; + while (getline(stream, partial, '\n')) { + if (line < 10) { + LOG(ERROR) << " " << line << " " << partial; + } + else { + LOG(ERROR) << line << " " << partial; + } + line++; + } + LOG(ERROR) << log; +} + +static int compile_fallback_shader(void) +{ + const struct Shader { + const char *source; + const GLenum type; + } shaders[2] = {{FALLBACK_VERTEX_SHADER, GL_VERTEX_SHADER}, + {FALLBACK_FRAGMENT_SHADER, GL_FRAGMENT_SHADER}}; + + const GLuint program = glCreateProgram(); + + for (int i = 0; i < 2; i++) { + const GLuint shader = glCreateShader(shaders[i].type); + + string source_str = shaders[i].source; + const char *c_str = source_str.c_str(); + + glShaderSource(shader, 1, &c_str, NULL); + glCompileShader(shader); + + GLint compile_status; + glGetShaderiv(shader, GL_COMPILE_STATUS, &compile_status); + + if (!compile_status) { + GLchar log[5000]; + GLsizei length = 0; + glGetShaderInfoLog(shader, sizeof(log), &length, log); + shader_print_errors("compile", log, c_str); + return 0; + } + + glAttachShader(program, shader); + } + + /* 
Link output. */ + glBindFragDataLocation(program, 0, "fragColor"); + + /* Link and error check. */ + glLinkProgram(program); + + /* TODO(sergey): Find a way to nicely de-duplicate the error checking. */ + GLint link_status; + glGetProgramiv(program, GL_LINK_STATUS, &link_status); + if (!link_status) { + GLchar log[5000]; + GLsizei length = 0; + /* TODO(sergey): Is it really program passed to glGetShaderInfoLog? */ + glGetShaderInfoLog(program, sizeof(log), &length, log); + shader_print_errors("linking", log, FALLBACK_VERTEX_SHADER); + shader_print_errors("linking", log, FALLBACK_FRAGMENT_SHADER); + return 0; + } + + return program; +} + +void BlenderFallbackDisplayShader::bind(int width, int height) +{ + create_shader_if_needed(); + + if (!shader_program_) { + return; + } + + glUseProgram(shader_program_); + glUniform1i(image_texture_location_, 0); + glUniform2f(fullscreen_location_, width, height); +} + +void BlenderFallbackDisplayShader::unbind() +{ +} + +uint BlenderFallbackDisplayShader::get_shader_program() +{ + return shader_program_; +} + +void BlenderFallbackDisplayShader::create_shader_if_needed() +{ + if (shader_program_ || shader_compile_attempted_) { + return; + } + + shader_compile_attempted_ = true; + + shader_program_ = compile_fallback_shader(); + if (!shader_program_) { + return; + } + + glUseProgram(shader_program_); + + image_texture_location_ = glGetUniformLocation(shader_program_, "image_texture"); + if (image_texture_location_ < 0) { + LOG(ERROR) << "Shader doesn't contain the 'image_texture' uniform."; + destroy_shader(); + return; + } + + fullscreen_location_ = glGetUniformLocation(shader_program_, "fullscreen"); + if (fullscreen_location_ < 0) { + LOG(ERROR) << "Shader doesn't contain the 'fullscreen' uniform."; + destroy_shader(); + return; + } +} + +void BlenderFallbackDisplayShader::destroy_shader() +{ + glDeleteProgram(shader_program_); + shader_program_ = 0; +} + +/* -------------------------------------------------------------------- 
+ * BlenderDisplaySpaceShader. + */ + +BlenderDisplaySpaceShader::BlenderDisplaySpaceShader(BL::RenderEngine &b_engine, + BL::Scene &b_scene) + : b_engine_(b_engine), b_scene_(b_scene) +{ + DCHECK(b_engine_.support_display_space_shader(b_scene_)); +} + +void BlenderDisplaySpaceShader::bind(int /*width*/, int /*height*/) +{ + b_engine_.bind_display_space_shader(b_scene_); +} + +void BlenderDisplaySpaceShader::unbind() +{ + b_engine_.unbind_display_space_shader(); +} + +uint BlenderDisplaySpaceShader::get_shader_program() +{ + if (!shader_program_) { + glGetIntegerv(GL_CURRENT_PROGRAM, reinterpret_cast<int *>(&shader_program_)); + } + + if (!shader_program_) { + LOG(ERROR) << "Error retrieving shader program for display space shader."; + } + + return shader_program_; +} + +/* -------------------------------------------------------------------- + * BlenderGPUDisplay. + */ + +BlenderGPUDisplay::BlenderGPUDisplay(BL::RenderEngine &b_engine, BL::Scene &b_scene) + : b_engine_(b_engine), display_shader_(BlenderDisplayShader::create(b_engine, b_scene)) +{ + /* Create context while on the main thread. */ + gl_context_create(); +} + +BlenderGPUDisplay::~BlenderGPUDisplay() +{ + gl_resources_destroy(); +} + +/* -------------------------------------------------------------------- + * Update procedure. + */ + +bool BlenderGPUDisplay::do_update_begin(const GPUDisplayParams ¶ms, + int texture_width, + int texture_height) +{ + /* Note that it's the responsibility of BlenderGPUDisplay to ensure updating and drawing + * the texture does not happen at the same time. This is achieved indirectly. + * + * When enabling the OpenGL context, it uses an internal mutex lock DST.gl_context_lock. + * This same lock is also held when do_draw() is called, which together ensure mutual + * exclusion. + * + * This locking is not performed at the GPU display level, because that would cause lock + * inversion. 
*/ + if (!gl_context_enable()) { + return false; + } + + if (gl_render_sync_) { + glWaitSync((GLsync)gl_render_sync_, 0, GL_TIMEOUT_IGNORED); + } + + if (!gl_texture_resources_ensure()) { + gl_context_disable(); + return false; + } + + /* Update texture dimensions if needed. */ + if (texture_.width != texture_width || texture_.height != texture_height) { + glActiveTexture(GL_TEXTURE0); + glBindTexture(GL_TEXTURE_2D, texture_.gl_id); + glTexImage2D( + GL_TEXTURE_2D, 0, GL_RGBA16F, texture_width, texture_height, 0, GL_RGBA, GL_HALF_FLOAT, 0); + texture_.width = texture_width; + texture_.height = texture_height; + glBindTexture(GL_TEXTURE_2D, 0); + + /* Texture did change, and no pixel storage was provided. Tag for an explicit zeroing out to + * avoid undefined content. */ + texture_.need_clear = true; + } + + /* Update PBO dimensions if needed. + * + * NOTE: Allocate the PBO for the the size which will fit the final render resolution (as in, + * at a resolution divider 1. This was we don't need to recreate graphics interoperability + * objects which are costly and which are tied to the specific underlying buffer size. + * The downside of this approach is that when graphics interopeability is not used we are sending + * too much data to GPU when resolution divider is not 1. */ + /* TODO(sergey): Investigate whether keeping the PBO exact size of the texute makes non-interop + * mode faster. 
*/ + const int buffer_width = params.full_size.x; + const int buffer_height = params.full_size.y; + if (texture_.buffer_width != buffer_width || texture_.buffer_height != buffer_height) { + const size_t size_in_bytes = sizeof(half4) * buffer_width * buffer_height; + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, texture_.gl_pbo_id); + glBufferData(GL_PIXEL_UNPACK_BUFFER, size_in_bytes, 0, GL_DYNAMIC_DRAW); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); + + texture_.buffer_width = buffer_width; + texture_.buffer_height = buffer_height; + } + + /* New content will be provided to the texture in one way or another, so mark this in a + * centralized place. */ + texture_.need_update = true; + + return true; +} + +void BlenderGPUDisplay::do_update_end() +{ + gl_upload_sync_ = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); + glFlush(); + + gl_context_disable(); +} + +/* -------------------------------------------------------------------- + * Texture update from CPU buffer. + */ + +void BlenderGPUDisplay::do_copy_pixels_to_texture( + const half4 *rgba_pixels, int texture_x, int texture_y, int pixels_width, int pixels_height) +{ + /* This call copies pixels to a Pixel Buffer Object (PBO) which is much cheaper from CPU time + * point of view than to copy data directly to the OpenGL texture. + * + * The possible downside of this approach is that it might require a higher peak memory when + * doing partial updates of the texture (although, in practice even partial updates might peak + * with a full-frame buffer stored on the CPU if the GPU is currently occupied). 
*/ + + half4 *mapped_rgba_pixels = map_texture_buffer(); + if (!mapped_rgba_pixels) { + return; + } + + if (texture_x == 0 && texture_y == 0 && pixels_width == texture_.width && + pixels_height == texture_.height) { + const size_t size_in_bytes = sizeof(half4) * texture_.width * texture_.height; + memcpy(mapped_rgba_pixels, rgba_pixels, size_in_bytes); + } + else { + const half4 *rgba_row = rgba_pixels; + half4 *mapped_rgba_row = mapped_rgba_pixels + texture_y * texture_.width + texture_x; + for (int y = 0; y < pixels_height; + ++y, rgba_row += pixels_width, mapped_rgba_row += texture_.width) { + memcpy(mapped_rgba_row, rgba_row, sizeof(half4) * pixels_width); + } + } + + unmap_texture_buffer(); +} + +/* -------------------------------------------------------------------- + * Texture buffer mapping. + */ + +half4 *BlenderGPUDisplay::do_map_texture_buffer() +{ + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, texture_.gl_pbo_id); + + half4 *mapped_rgba_pixels = reinterpret_cast<half4 *>( + glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_WRITE_ONLY)); + if (!mapped_rgba_pixels) { + LOG(ERROR) << "Error mapping BlenderGPUDisplay pixel buffer object."; + } + + if (texture_.need_clear) { + const int64_t texture_width = texture_.width; + const int64_t texture_height = texture_.height; + memset(reinterpret_cast<void *>(mapped_rgba_pixels), + 0, + texture_width * texture_height * sizeof(half4)); + texture_.need_clear = false; + } + + return mapped_rgba_pixels; +} + +void BlenderGPUDisplay::do_unmap_texture_buffer() +{ + glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER); + + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); +} + +/* -------------------------------------------------------------------- + * Graphics interoperability. 
+ */ + +DeviceGraphicsInteropDestination BlenderGPUDisplay::do_graphics_interop_get() +{ + DeviceGraphicsInteropDestination interop_dst; + + interop_dst.buffer_width = texture_.buffer_width; + interop_dst.buffer_height = texture_.buffer_height; + interop_dst.opengl_pbo_id = texture_.gl_pbo_id; + + interop_dst.need_clear = texture_.need_clear; + texture_.need_clear = false; + + return interop_dst; +} + +void BlenderGPUDisplay::graphics_interop_activate() +{ + gl_context_enable(); +} + +void BlenderGPUDisplay::graphics_interop_deactivate() +{ + gl_context_disable(); +} + +/* -------------------------------------------------------------------- + * Drawing. + */ + +void BlenderGPUDisplay::clear() +{ + texture_.need_clear = true; +} + +void BlenderGPUDisplay::do_draw(const GPUDisplayParams ¶ms) +{ + /* See do_update_begin() for why no locking is required here. */ + const bool transparent = true; // TODO(sergey): Derive this from Film. + + if (texture_.need_clear) { + /* Texture is requested to be cleared and was not yet cleared. + * Do early return which should be equivalent of drawing all-zero texture. */ + return; + } + + if (!gl_draw_resources_ensure()) { + return; + } + + if (use_gl_context_) { + gl_context_mutex_.lock(); + } + + if (gl_upload_sync_) { + glWaitSync((GLsync)gl_upload_sync_, 0, GL_TIMEOUT_IGNORED); + } + + if (transparent) { + glEnable(GL_BLEND); + glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA); + } + + display_shader_->bind(params.full_size.x, params.full_size.y); + + glActiveTexture(GL_TEXTURE0); + glBindTexture(GL_TEXTURE_2D, texture_.gl_id); + + glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer_); + + texture_update_if_needed(); + vertex_buffer_update(params); + + /* TODO(sergey): Does it make sense/possible to cache/reuse the VAO? 
*/ + GLuint vertex_array_object; + glGenVertexArrays(1, &vertex_array_object); + glBindVertexArray(vertex_array_object); + + const int texcoord_attribute = display_shader_->get_tex_coord_attrib_location(); + const int position_attribute = display_shader_->get_position_attrib_location(); + + glEnableVertexAttribArray(texcoord_attribute); + glEnableVertexAttribArray(position_attribute); + + glVertexAttribPointer( + texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0); + glVertexAttribPointer(position_attribute, + 2, + GL_FLOAT, + GL_FALSE, + 4 * sizeof(float), + (const GLvoid *)(sizeof(float) * 2)); + + glDrawArrays(GL_TRIANGLE_FAN, 0, 4); + + glBindBuffer(GL_ARRAY_BUFFER, 0); + glBindTexture(GL_TEXTURE_2D, 0); + + glDeleteVertexArrays(1, &vertex_array_object); + + display_shader_->unbind(); + + if (transparent) { + glDisable(GL_BLEND); + } + + gl_render_sync_ = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); + glFlush(); + + if (use_gl_context_) { + gl_context_mutex_.unlock(); + } +} + +void BlenderGPUDisplay::gl_context_create() +{ + /* When rendering in viewport there is no render context available via engine. + * Check whether own context is to be created here. + * + * NOTE: If the `b_engine_`'s context is not available, we are expected to be on a main thread + * here. */ + use_gl_context_ = !RE_engine_has_render_context( + reinterpret_cast<RenderEngine *>(b_engine_.ptr.data)); + + if (use_gl_context_) { + const bool drw_state = DRW_opengl_context_release(); + gl_context_ = WM_opengl_context_create(); + if (gl_context_) { + /* On Windows an old context is restored after creation, and subsequent release of context + * generates a Win32 error. Harmless for users, but annoying to have possible misleading + * error prints in the console. 
*/ +#ifndef _WIN32 + WM_opengl_context_release(gl_context_); +#endif + } + else { + LOG(ERROR) << "Error creating OpenGL context."; + } + + DRW_opengl_context_activate(drw_state); + } +} + +bool BlenderGPUDisplay::gl_context_enable() +{ + if (use_gl_context_) { + if (!gl_context_) { + return false; + } + gl_context_mutex_.lock(); + WM_opengl_context_activate(gl_context_); + return true; + } + + RE_engine_render_context_enable(reinterpret_cast<RenderEngine *>(b_engine_.ptr.data)); + return true; +} + +void BlenderGPUDisplay::gl_context_disable() +{ + if (use_gl_context_) { + if (gl_context_) { + WM_opengl_context_release(gl_context_); + gl_context_mutex_.unlock(); + } + return; + } + + RE_engine_render_context_disable(reinterpret_cast<RenderEngine *>(b_engine_.ptr.data)); +} + +void BlenderGPUDisplay::gl_context_dispose() +{ + if (gl_context_) { + const bool drw_state = DRW_opengl_context_release(); + + WM_opengl_context_activate(gl_context_); + WM_opengl_context_dispose(gl_context_); + + DRW_opengl_context_activate(drw_state); + } +} + +bool BlenderGPUDisplay::gl_draw_resources_ensure() +{ + if (!texture_.gl_id) { + /* If there is no texture allocated, there is nothing to draw. Inform the draw call that it can + * can not continue. Note that this is not an unrecoverable error, so once the texture is known + * we will come back here and create all the GPU resources needed for draw. 
*/ + return false; + } + + if (gl_draw_resource_creation_attempted_) { + return gl_draw_resources_created_; + } + gl_draw_resource_creation_attempted_ = true; + + if (!vertex_buffer_) { + glGenBuffers(1, &vertex_buffer_); + if (!vertex_buffer_) { + LOG(ERROR) << "Error creating vertex buffer."; + return false; + } + } + + gl_draw_resources_created_ = true; + + return true; +} + +void BlenderGPUDisplay::gl_resources_destroy() +{ + gl_context_enable(); + + if (vertex_buffer_ != 0) { + glDeleteBuffers(1, &vertex_buffer_); + } + + if (texture_.gl_pbo_id) { + glDeleteBuffers(1, &texture_.gl_pbo_id); + texture_.gl_pbo_id = 0; + } + + if (texture_.gl_id) { + glDeleteTextures(1, &texture_.gl_id); + texture_.gl_id = 0; + } + + gl_context_disable(); + + gl_context_dispose(); +} + +bool BlenderGPUDisplay::gl_texture_resources_ensure() +{ + if (texture_.creation_attempted) { + return texture_.is_created; + } + texture_.creation_attempted = true; + + DCHECK(!texture_.gl_id); + DCHECK(!texture_.gl_pbo_id); + + /* Create texture. */ + glGenTextures(1, &texture_.gl_id); + if (!texture_.gl_id) { + LOG(ERROR) << "Error creating texture."; + return false; + } + + /* Configure the texture. */ + glActiveTexture(GL_TEXTURE0); + glBindTexture(GL_TEXTURE_2D, texture_.gl_id); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + glBindTexture(GL_TEXTURE_2D, 0); + + /* Create PBO for the texture. */ + glGenBuffers(1, &texture_.gl_pbo_id); + if (!texture_.gl_pbo_id) { + LOG(ERROR) << "Error creating texture pixel buffer object."; + return false; + } + + /* Creation finished with a success. 
*/ + texture_.is_created = true; + + return true; +} + +void BlenderGPUDisplay::texture_update_if_needed() +{ + if (!texture_.need_update) { + return; + } + + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, texture_.gl_pbo_id); + glTexSubImage2D( + GL_TEXTURE_2D, 0, 0, 0, texture_.width, texture_.height, GL_RGBA, GL_HALF_FLOAT, 0); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); + + texture_.need_update = false; +} + +void BlenderGPUDisplay::vertex_buffer_update(const GPUDisplayParams ¶ms) +{ + /* Invalidate old contents - avoids stalling if the buffer is still waiting in queue to be + * rendered. */ + glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW); + + float *vpointer = reinterpret_cast<float *>(glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY)); + if (!vpointer) { + return; + } + + vpointer[0] = 0.0f; + vpointer[1] = 0.0f; + vpointer[2] = params.offset.x; + vpointer[3] = params.offset.y; + + vpointer[4] = 1.0f; + vpointer[5] = 0.0f; + vpointer[6] = (float)params.size.x + params.offset.x; + vpointer[7] = params.offset.y; + + vpointer[8] = 1.0f; + vpointer[9] = 1.0f; + vpointer[10] = (float)params.size.x + params.offset.x; + vpointer[11] = (float)params.size.y + params.offset.y; + + vpointer[12] = 0.0f; + vpointer[13] = 1.0f; + vpointer[14] = params.offset.x; + vpointer[15] = (float)params.size.y + params.offset.y; + + glUnmapBuffer(GL_ARRAY_BUFFER); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/blender/blender_gpu_display.h b/intern/cycles/blender/blender_gpu_display.h new file mode 100644 index 00000000000..b7eddf0afa7 --- /dev/null +++ b/intern/cycles/blender/blender_gpu_display.h @@ -0,0 +1,211 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include <atomic> + +#include "MEM_guardedalloc.h" + +#include "RNA_blender_cpp.h" + +#include "render/gpu_display.h" +#include "util/util_unique_ptr.h" + +CCL_NAMESPACE_BEGIN + +/* Base class of shader used for GPU display rendering. */ +class BlenderDisplayShader { + public: + static constexpr const char *position_attribute_name = "pos"; + static constexpr const char *tex_coord_attribute_name = "texCoord"; + + /* Create shader implementation suitable for the given render engine and scene configuration. */ + static unique_ptr<BlenderDisplayShader> create(BL::RenderEngine &b_engine, BL::Scene &b_scene); + + BlenderDisplayShader() = default; + virtual ~BlenderDisplayShader() = default; + + virtual void bind(int width, int height) = 0; + virtual void unbind() = 0; + + /* Get attribute location for position and texture coordinate respectively. + * NOTE: The shader needs to be bound to have access to those. */ + virtual int get_position_attrib_location(); + virtual int get_tex_coord_attrib_location(); + + protected: + /* Get program of this display shader. + * NOTE: The shader needs to be bound to have access to this. */ + virtual uint get_shader_program() = 0; + + /* Cached values of various OpenGL resources. */ + int position_attribute_location_ = -1; + int tex_coord_attribute_location_ = -1; +}; + +/* Implementation of display rendering shader used in the case when render engine does not support + * display space shader. 
*/ +class BlenderFallbackDisplayShader : public BlenderDisplayShader { + public: + virtual void bind(int width, int height) override; + virtual void unbind() override; + + protected: + virtual uint get_shader_program() override; + + void create_shader_if_needed(); + void destroy_shader(); + + uint shader_program_ = 0; + int image_texture_location_ = -1; + int fullscreen_location_ = -1; + + /* Shader compilation attempted. Which means, that if the shader program is 0 then compilation or + * linking has failed. Do not attempt to re-compile the shader. */ + bool shader_compile_attempted_ = false; +}; + +class BlenderDisplaySpaceShader : public BlenderDisplayShader { + public: + BlenderDisplaySpaceShader(BL::RenderEngine &b_engine, BL::Scene &b_scene); + + virtual void bind(int width, int height) override; + virtual void unbind() override; + + protected: + virtual uint get_shader_program() override; + + BL::RenderEngine b_engine_; + BL::Scene &b_scene_; + + /* Cached values of various OpenGL resources. */ + uint shader_program_ = 0; +}; + +/* GPU display implementation which is specific for Blender viewport integration. 
*/ +class BlenderGPUDisplay : public GPUDisplay { + public: + BlenderGPUDisplay(BL::RenderEngine &b_engine, BL::Scene &b_scene); + ~BlenderGPUDisplay(); + + virtual void graphics_interop_activate() override; + virtual void graphics_interop_deactivate() override; + + virtual void clear() override; + + protected: + virtual bool do_update_begin(const GPUDisplayParams ¶ms, + int texture_width, + int texture_height) override; + virtual void do_update_end() override; + + virtual void do_copy_pixels_to_texture(const half4 *rgba_pixels, + int texture_x, + int texture_y, + int pixels_width, + int pixels_height) override; + virtual void do_draw(const GPUDisplayParams ¶ms) override; + + virtual half4 *do_map_texture_buffer() override; + virtual void do_unmap_texture_buffer() override; + + virtual DeviceGraphicsInteropDestination do_graphics_interop_get() override; + + /* Helper function which allocates new GPU context. */ + void gl_context_create(); + bool gl_context_enable(); + void gl_context_disable(); + void gl_context_dispose(); + + /* Make sure texture is allocated and its initial configuration is performed. */ + bool gl_texture_resources_ensure(); + + /* Ensure all runtime GPU resources needed for drawing are allocated. + * Returns true if all resources needed for drawing are available. */ + bool gl_draw_resources_ensure(); + + /* Destroy all GPU resources which are being used by this object. */ + void gl_resources_destroy(); + + /* Update GPU texture dimensions and content if needed (new pixel data was provided). + * + * NOTE: The texture needs to be bound. */ + void texture_update_if_needed(); + + /* Update vertex buffer with new coordinates of vertex positions and texture coordinates. + * This buffer is used to render texture in the viewport. + * + * NOTE: The buffer needs to be bound. */ + void vertex_buffer_update(const GPUDisplayParams ¶ms); + + BL::RenderEngine b_engine_; + + /* OpenGL context which is used when the render engine doesn't have its own. 
*/ + void *gl_context_ = nullptr; + /* Is true when the Blender RenderEngine side context is not available and the GPUDisplay is to create + * its own context. */ + bool use_gl_context_ = false; + /* Mutex used to guard the `gl_context_`. */ + thread_mutex gl_context_mutex_; + + /* Texture which contains pixels of the render result. */ + struct { + /* Indicates whether texture creation was attempted and succeeded. + * Used to avoid multiple attempts of texture creation on GPU issues or GPU context + * misconfiguration. */ + bool creation_attempted = false; + bool is_created = false; + + /* OpenGL resource IDs of the texture itself and Pixel Buffer Object (PBO) used to write + * pixels to it. + * + * NOTE: Allocated on the engine's context. */ + uint gl_id = 0; + uint gl_pbo_id = 0; + + /* Is true when new data was written to the PBO, meaning, the texture might need to be resized + * and new data is to be uploaded to the GPU. */ + bool need_update = false; + + /* Content of the texture is to be filled with zeroes. */ + std::atomic<bool> need_clear = true; + + /* Dimensions of the texture in pixels. */ + int width = 0; + int height = 0; + + /* Dimensions of the underlying PBO. */ + int buffer_width = 0; + int buffer_height = 0; + } texture_; + + unique_ptr<BlenderDisplayShader> display_shader_; + + /* Special track of whether GPU resources were attempted to be created, to avoid attempts of + * their re-creation on failure on every redraw. */ + bool gl_draw_resource_creation_attempted_ = false; + bool gl_draw_resources_created_ = false; + + /* Vertex buffer which holds vertices of a triangle fan which is textured with the texture + * holding the render result. 
*/ + uint vertex_buffer_ = 0; + + void *gl_render_sync_ = nullptr; + void *gl_upload_sync_ = nullptr; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/blender/blender_light.cpp b/intern/cycles/blender/blender_light.cpp index 542028f4b2f..4df1e720dde 100644 --- a/intern/cycles/blender/blender_light.cpp +++ b/intern/cycles/blender/blender_light.cpp @@ -125,17 +125,10 @@ void BlenderSync::sync_light(BL::Object &b_parent, light->set_shader(static_cast<Shader *>(used_shaders[0])); /* shadow */ - PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles"); PointerRNA clight = RNA_pointer_get(&b_light.ptr, "cycles"); light->set_cast_shadow(get_boolean(clight, "cast_shadow")); light->set_use_mis(get_boolean(clight, "use_multiple_importance_sampling")); - int samples = get_int(clight, "samples"); - if (get_boolean(cscene, "use_square_samples")) - light->set_samples(samples * samples); - else - light->set_samples(samples); - light->set_max_bounces(get_int(clight, "max_bounces")); if (b_ob_info.real_object != b_ob_info.iter_object) { @@ -155,10 +148,12 @@ void BlenderSync::sync_light(BL::Object &b_parent, /* visibility */ uint visibility = object_ray_visibility(b_ob_info.real_object); + light->set_use_camera((visibility & PATH_RAY_CAMERA) != 0); light->set_use_diffuse((visibility & PATH_RAY_DIFFUSE) != 0); light->set_use_glossy((visibility & PATH_RAY_GLOSSY) != 0); light->set_use_transmission((visibility & PATH_RAY_TRANSMIT) != 0); light->set_use_scatter((visibility & PATH_RAY_VOLUME_SCATTER) != 0); + light->set_is_shadow_catcher(b_ob_info.real_object.is_shadow_catcher()); /* tag */ light->tag_update(scene); @@ -169,7 +164,6 @@ void BlenderSync::sync_background_light(BL::SpaceView3D &b_v3d, bool use_portal) BL::World b_world = b_scene.world(); if (b_world) { - PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles"); PointerRNA cworld = RNA_pointer_get(&b_world.ptr, "cycles"); enum SamplingMethod { SAMPLING_NONE = 0, SAMPLING_AUTOMATIC, SAMPLING_MANUAL, SAMPLING_NUM }; 
@@ -197,12 +191,6 @@ void BlenderSync::sync_background_light(BL::SpaceView3D &b_v3d, bool use_portal) /* force enable light again when world is resynced */ light->set_is_enabled(true); - int samples = get_int(cworld, "samples"); - if (get_boolean(cscene, "use_square_samples")) - light->set_samples(samples * samples); - else - light->set_samples(samples); - light->tag_update(scene); light_map.set_recalc(b_world); } @@ -211,7 +199,7 @@ void BlenderSync::sync_background_light(BL::SpaceView3D &b_v3d, bool use_portal) world_map = b_world.ptr.data; world_recalc = false; - viewport_parameters = BlenderViewportParameters(b_v3d); + viewport_parameters = BlenderViewportParameters(b_v3d, use_developer_ui); } CCL_NAMESPACE_END diff --git a/intern/cycles/blender/blender_object.cpp b/intern/cycles/blender/blender_object.cpp index 22d6edeb099..95da4a2df84 100644 --- a/intern/cycles/blender/blender_object.cpp +++ b/intern/cycles/blender/blender_object.cpp @@ -568,7 +568,7 @@ void BlenderSync::sync_objects(BL::Depsgraph &b_depsgraph, /* object loop */ bool cancel = false; bool use_portal = false; - const bool show_lights = BlenderViewportParameters(b_v3d).use_scene_lights; + const bool show_lights = BlenderViewportParameters(b_v3d, use_developer_ui).use_scene_lights; BL::ViewLayer b_view_layer = b_depsgraph.view_layer_eval(); BL::Depsgraph::object_instances_iterator b_instance_iter; diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp index 6e06b6a468f..694d8454422 100644 --- a/intern/cycles/blender/blender_python.cpp +++ b/intern/cycles/blender/blender_python.cpp @@ -45,10 +45,6 @@ # include <OSL/oslquery.h> #endif -#ifdef WITH_OPENCL -# include "device/device_intern.h" -#endif - CCL_NAMESPACE_BEGIN namespace { @@ -72,12 +68,10 @@ PyObject *pyunicode_from_string(const char *str) /* Synchronize debug flags from a given Blender scene. * Return truth when device list needs invalidation. 
*/ -bool debug_flags_sync_from_scene(BL::Scene b_scene) +static void debug_flags_sync_from_scene(BL::Scene b_scene) { DebugFlagsRef flags = DebugFlags(); PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles"); - /* Backup some settings for comparison. */ - DebugFlags::OpenCL::DeviceType opencl_device_type = flags.opencl.device_type; /* Synchronize shared flags. */ flags.viewport_static_bvh = get_enum(cscene, "debug_bvh_type"); /* Synchronize CPU flags. */ @@ -87,50 +81,19 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene) flags.cpu.sse3 = get_boolean(cscene, "debug_use_cpu_sse3"); flags.cpu.sse2 = get_boolean(cscene, "debug_use_cpu_sse2"); flags.cpu.bvh_layout = (BVHLayout)get_enum(cscene, "debug_bvh_layout"); - flags.cpu.split_kernel = get_boolean(cscene, "debug_use_cpu_split_kernel"); /* Synchronize CUDA flags. */ flags.cuda.adaptive_compile = get_boolean(cscene, "debug_use_cuda_adaptive_compile"); - flags.cuda.split_kernel = get_boolean(cscene, "debug_use_cuda_split_kernel"); /* Synchronize OptiX flags. */ - flags.optix.cuda_streams = get_int(cscene, "debug_optix_cuda_streams"); - flags.optix.curves_api = get_boolean(cscene, "debug_optix_curves_api"); - /* Synchronize OpenCL device type. */ - switch (get_enum(cscene, "debug_opencl_device_type")) { - case 0: - flags.opencl.device_type = DebugFlags::OpenCL::DEVICE_NONE; - break; - case 1: - flags.opencl.device_type = DebugFlags::OpenCL::DEVICE_ALL; - break; - case 2: - flags.opencl.device_type = DebugFlags::OpenCL::DEVICE_DEFAULT; - break; - case 3: - flags.opencl.device_type = DebugFlags::OpenCL::DEVICE_CPU; - break; - case 4: - flags.opencl.device_type = DebugFlags::OpenCL::DEVICE_GPU; - break; - case 5: - flags.opencl.device_type = DebugFlags::OpenCL::DEVICE_ACCELERATOR; - break; - } - /* Synchronize other OpenCL flags. 
*/ - flags.opencl.debug = get_boolean(cscene, "debug_use_opencl_debug"); - flags.opencl.mem_limit = ((size_t)get_int(cscene, "debug_opencl_mem_limit")) * 1024 * 1024; - return flags.opencl.device_type != opencl_device_type; + flags.optix.use_debug = get_boolean(cscene, "debug_use_optix_debug"); } /* Reset debug flags to default values. * Return truth when device list needs invalidation. */ -bool debug_flags_reset() +static void debug_flags_reset() { DebugFlagsRef flags = DebugFlags(); - /* Backup some settings for comparison. */ - DebugFlags::OpenCL::DeviceType opencl_device_type = flags.opencl.device_type; flags.reset(); - return flags.opencl.device_type != opencl_device_type; } } /* namespace */ @@ -175,18 +138,20 @@ static const char *PyC_UnicodeAsByte(PyObject *py_str, PyObject **coerce) static PyObject *init_func(PyObject * /*self*/, PyObject *args) { - PyObject *path, *user_path; + PyObject *path, *user_path, *temp_path; int headless; - if (!PyArg_ParseTuple(args, "OOi", &path, &user_path, &headless)) { - return NULL; + if (!PyArg_ParseTuple(args, "OOOi", &path, &user_path, &temp_path, &headless)) { + return nullptr; } - PyObject *path_coerce = NULL, *user_path_coerce = NULL; + PyObject *path_coerce = nullptr, *user_path_coerce = nullptr, *temp_path_coerce = nullptr; path_init(PyC_UnicodeAsByte(path, &path_coerce), - PyC_UnicodeAsByte(user_path, &user_path_coerce)); + PyC_UnicodeAsByte(user_path, &user_path_coerce), + PyC_UnicodeAsByte(temp_path, &temp_path_coerce)); Py_XDECREF(path_coerce); Py_XDECREF(user_path_coerce); + Py_XDECREF(temp_path_coerce); BlenderSession::headless = headless; @@ -299,6 +264,50 @@ static PyObject *render_func(PyObject * /*self*/, PyObject *args) Py_RETURN_NONE; } +static PyObject *render_frame_finish_func(PyObject * /*self*/, PyObject *args) +{ + PyObject *pysession; + + if (!PyArg_ParseTuple(args, "O", &pysession)) { + return nullptr; + } + + BlenderSession *session = (BlenderSession *)PyLong_AsVoidPtr(pysession); + + /* Allow 
Blender to execute other Python scripts. */ + python_thread_state_save(&session->python_thread_state); + + session->render_frame_finish(); + + python_thread_state_restore(&session->python_thread_state); + + Py_RETURN_NONE; +} + +static PyObject *draw_func(PyObject * /*self*/, PyObject *args) +{ + PyObject *py_session, *py_graph, *py_screen, *py_space_image; + + if (!PyArg_ParseTuple(args, "OOOO", &py_session, &py_graph, &py_screen, &py_space_image)) { + return nullptr; + } + + BlenderSession *session = (BlenderSession *)PyLong_AsVoidPtr(py_session); + + ID *b_screen = (ID *)PyLong_AsVoidPtr(py_screen); + + PointerRNA b_space_image_ptr; + RNA_pointer_create(b_screen, + &RNA_SpaceImageEditor, + pylong_as_voidptr_typesafe(py_space_image), + &b_space_image_ptr); + BL::SpaceImageEditor b_space_image(b_space_image_ptr); + + session->draw(b_space_image); + + Py_RETURN_NONE; +} + /* pixel_array and result passed as pointers */ static PyObject *bake_func(PyObject * /*self*/, PyObject *args) { @@ -336,7 +345,7 @@ static PyObject *bake_func(PyObject * /*self*/, PyObject *args) Py_RETURN_NONE; } -static PyObject *draw_func(PyObject * /*self*/, PyObject *args) +static PyObject *view_draw_func(PyObject * /*self*/, PyObject *args) { PyObject *pysession, *pygraph, *pyv3d, *pyrv3d; @@ -350,7 +359,7 @@ static PyObject *draw_func(PyObject * /*self*/, PyObject *args) int viewport[4]; glGetIntegerv(GL_VIEWPORT, viewport); - session->draw(viewport[2], viewport[3]); + session->view_draw(viewport[2], viewport[3]); } Py_RETURN_NONE; @@ -697,40 +706,6 @@ static PyObject *system_info_func(PyObject * /*self*/, PyObject * /*value*/) return pyunicode_from_string(system_info.c_str()); } -#ifdef WITH_OPENCL -static PyObject *opencl_disable_func(PyObject * /*self*/, PyObject * /*value*/) -{ - VLOG(2) << "Disabling OpenCL platform."; - DebugFlags().opencl.device_type = DebugFlags::OpenCL::DEVICE_NONE; - Py_RETURN_NONE; -} - -static PyObject *opencl_compile_func(PyObject * /*self*/, PyObject *args) 
-{ - PyObject *sequence = PySequence_Fast(args, "Arguments must be a sequence"); - if (sequence == NULL) { - Py_RETURN_FALSE; - } - - vector<string> parameters; - for (Py_ssize_t i = 0; i < PySequence_Fast_GET_SIZE(sequence); i++) { - PyObject *item = PySequence_Fast_GET_ITEM(sequence, i); - PyObject *item_as_string = PyObject_Str(item); - const char *parameter_string = PyUnicode_AsUTF8(item_as_string); - parameters.push_back(parameter_string); - Py_DECREF(item_as_string); - } - Py_DECREF(sequence); - - if (device_opencl_compile_kernel(parameters)) { - Py_RETURN_TRUE; - } - else { - Py_RETURN_FALSE; - } -} -#endif - static bool image_parse_filepaths(PyObject *pyfilepaths, vector<string> &filepaths) { if (PyUnicode_Check(pyfilepaths)) { @@ -762,6 +737,10 @@ static bool image_parse_filepaths(PyObject *pyfilepaths, vector<string> &filepat static PyObject *denoise_func(PyObject * /*self*/, PyObject *args, PyObject *keywords) { +#if 1 + (void)args; + (void)keywords; +#else static const char *keyword_list[] = { "preferences", "scene", "view_layer", "input", "output", "tile_size", "samples", NULL}; PyObject *pypreferences, *pyscene, *pyviewlayer; @@ -835,7 +814,7 @@ static PyObject *denoise_func(PyObject * /*self*/, PyObject *args, PyObject *key } /* Create denoiser. 
*/ - Denoiser denoiser(device); + DenoiserPipeline denoiser(device); denoiser.params = params; denoiser.input = input; denoiser.output = output; @@ -852,6 +831,7 @@ static PyObject *denoise_func(PyObject * /*self*/, PyObject *args, PyObject *key PyErr_SetString(PyExc_ValueError, denoiser.error.c_str()); return NULL; } +#endif Py_RETURN_NONE; } @@ -903,10 +883,7 @@ static PyObject *debug_flags_update_func(PyObject * /*self*/, PyObject *args) RNA_id_pointer_create((ID *)PyLong_AsVoidPtr(pyscene), &sceneptr); BL::Scene b_scene(sceneptr); - if (debug_flags_sync_from_scene(b_scene)) { - VLOG(2) << "Tagging device list for update."; - Device::tag_update(); - } + debug_flags_sync_from_scene(b_scene); VLOG(2) << "Debug flags set to:\n" << DebugFlags(); @@ -917,10 +894,7 @@ static PyObject *debug_flags_update_func(PyObject * /*self*/, PyObject *args) static PyObject *debug_flags_reset_func(PyObject * /*self*/, PyObject * /*args*/) { - if (debug_flags_reset()) { - VLOG(2) << "Tagging device list for update."; - Device::tag_update(); - } + debug_flags_reset(); if (debug_flags_set) { VLOG(2) << "Debug flags reset to:\n" << DebugFlags(); debug_flags_set = false; @@ -928,84 +902,6 @@ static PyObject *debug_flags_reset_func(PyObject * /*self*/, PyObject * /*args*/ Py_RETURN_NONE; } -static PyObject *set_resumable_chunk_func(PyObject * /*self*/, PyObject *args) -{ - int num_resumable_chunks, current_resumable_chunk; - if (!PyArg_ParseTuple(args, "ii", &num_resumable_chunks, ¤t_resumable_chunk)) { - Py_RETURN_NONE; - } - - if (num_resumable_chunks <= 0) { - fprintf(stderr, "Cycles: Bad value for number of resumable chunks.\n"); - abort(); - Py_RETURN_NONE; - } - if (current_resumable_chunk < 1 || current_resumable_chunk > num_resumable_chunks) { - fprintf(stderr, "Cycles: Bad value for current resumable chunk number.\n"); - abort(); - Py_RETURN_NONE; - } - - VLOG(1) << "Initialized resumable render: " - << "num_resumable_chunks=" << num_resumable_chunks << ", " - << 
"current_resumable_chunk=" << current_resumable_chunk; - BlenderSession::num_resumable_chunks = num_resumable_chunks; - BlenderSession::current_resumable_chunk = current_resumable_chunk; - - printf("Cycles: Will render chunk %d of %d\n", current_resumable_chunk, num_resumable_chunks); - - Py_RETURN_NONE; -} - -static PyObject *set_resumable_chunk_range_func(PyObject * /*self*/, PyObject *args) -{ - int num_chunks, start_chunk, end_chunk; - if (!PyArg_ParseTuple(args, "iii", &num_chunks, &start_chunk, &end_chunk)) { - Py_RETURN_NONE; - } - - if (num_chunks <= 0) { - fprintf(stderr, "Cycles: Bad value for number of resumable chunks.\n"); - abort(); - Py_RETURN_NONE; - } - if (start_chunk < 1 || start_chunk > num_chunks) { - fprintf(stderr, "Cycles: Bad value for start chunk number.\n"); - abort(); - Py_RETURN_NONE; - } - if (end_chunk < 1 || end_chunk > num_chunks) { - fprintf(stderr, "Cycles: Bad value for start chunk number.\n"); - abort(); - Py_RETURN_NONE; - } - if (start_chunk > end_chunk) { - fprintf(stderr, "Cycles: End chunk should be higher than start one.\n"); - abort(); - Py_RETURN_NONE; - } - - VLOG(1) << "Initialized resumable render: " - << "num_resumable_chunks=" << num_chunks << ", " - << "start_resumable_chunk=" << start_chunk << "end_resumable_chunk=" << end_chunk; - BlenderSession::num_resumable_chunks = num_chunks; - BlenderSession::start_resumable_chunk = start_chunk; - BlenderSession::end_resumable_chunk = end_chunk; - - printf("Cycles: Will render chunks %d to %d of %d\n", start_chunk, end_chunk, num_chunks); - - Py_RETURN_NONE; -} - -static PyObject *clear_resumable_chunk_func(PyObject * /*self*/, PyObject * /*value*/) -{ - VLOG(1) << "Clear resumable render"; - BlenderSession::num_resumable_chunks = 0; - BlenderSession::current_resumable_chunk = 0; - - Py_RETURN_NONE; -} - static PyObject *enable_print_stats_func(PyObject * /*self*/, PyObject * /*args*/) { BlenderSession::print_render_stats = true; @@ -1015,16 +911,14 @@ static PyObject 
*enable_print_stats_func(PyObject * /*self*/, PyObject * /*args* static PyObject *get_device_types_func(PyObject * /*self*/, PyObject * /*args*/) { vector<DeviceType> device_types = Device::available_types(); - bool has_cuda = false, has_optix = false, has_opencl = false; + bool has_cuda = false, has_optix = false; foreach (DeviceType device_type, device_types) { has_cuda |= (device_type == DEVICE_CUDA); has_optix |= (device_type == DEVICE_OPTIX); - has_opencl |= (device_type == DEVICE_OPENCL); } - PyObject *list = PyTuple_New(3); + PyObject *list = PyTuple_New(2); PyTuple_SET_ITEM(list, 0, PyBool_FromLong(has_cuda)); PyTuple_SET_ITEM(list, 1, PyBool_FromLong(has_optix)); - PyTuple_SET_ITEM(list, 2, PyBool_FromLong(has_opencl)); return list; } @@ -1044,9 +938,6 @@ static PyObject *set_device_override_func(PyObject * /*self*/, PyObject *arg) if (override == "CPU") { BlenderSession::device_override = DEVICE_MASK_CPU; } - else if (override == "OPENCL") { - BlenderSession::device_override = DEVICE_MASK_OPENCL; - } else if (override == "CUDA") { BlenderSession::device_override = DEVICE_MASK_CUDA; } @@ -1072,8 +963,10 @@ static PyMethodDef methods[] = { {"create", create_func, METH_VARARGS, ""}, {"free", free_func, METH_O, ""}, {"render", render_func, METH_VARARGS, ""}, - {"bake", bake_func, METH_VARARGS, ""}, + {"render_frame_finish", render_frame_finish_func, METH_VARARGS, ""}, {"draw", draw_func, METH_VARARGS, ""}, + {"bake", bake_func, METH_VARARGS, ""}, + {"view_draw", view_draw_func, METH_VARARGS, ""}, {"sync", sync_func, METH_VARARGS, ""}, {"reset", reset_func, METH_VARARGS, ""}, #ifdef WITH_OSL @@ -1082,10 +975,6 @@ static PyMethodDef methods[] = { #endif {"available_devices", available_devices_func, METH_VARARGS, ""}, {"system_info", system_info_func, METH_NOARGS, ""}, -#ifdef WITH_OPENCL - {"opencl_disable", opencl_disable_func, METH_NOARGS, ""}, - {"opencl_compile", opencl_compile_func, METH_VARARGS, ""}, -#endif /* Standalone denoising */ {"denoise", 
(PyCFunction)denoise_func, METH_VARARGS | METH_KEYWORDS, ""}, @@ -1098,11 +987,6 @@ static PyMethodDef methods[] = { /* Statistics. */ {"enable_print_stats", enable_print_stats_func, METH_NOARGS, ""}, - /* Resumable render */ - {"set_resumable_chunk", set_resumable_chunk_func, METH_VARARGS, ""}, - {"set_resumable_chunk_range", set_resumable_chunk_range_func, METH_VARARGS, ""}, - {"clear_resumable_chunk", clear_resumable_chunk_func, METH_NOARGS, ""}, - /* Compute Device selection */ {"get_device_types", get_device_types_func, METH_VARARGS, ""}, {"set_device_override", set_device_override_func, METH_O, ""}, @@ -1153,14 +1037,6 @@ void *CCL_python_module_init() PyModule_AddStringConstant(mod, "osl_version_string", "unknown"); #endif -#ifdef WITH_NETWORK - PyModule_AddObject(mod, "with_network", Py_True); - Py_INCREF(Py_True); -#else /* WITH_NETWORK */ - PyModule_AddObject(mod, "with_network", Py_False); - Py_INCREF(Py_False); -#endif /* WITH_NETWORK */ - #ifdef WITH_EMBREE PyModule_AddObject(mod, "with_embree", Py_True); Py_INCREF(Py_True); diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp index 29de886e4ff..5aafa605526 100644 --- a/intern/cycles/blender/blender_session.cpp +++ b/intern/cycles/blender/blender_session.cpp @@ -38,9 +38,11 @@ #include "util/util_hash.h" #include "util/util_logging.h" #include "util/util_murmurhash.h" +#include "util/util_path.h" #include "util/util_progress.h" #include "util/util_time.h" +#include "blender/blender_gpu_display.h" #include "blender/blender_session.h" #include "blender/blender_sync.h" #include "blender/blender_util.h" @@ -49,10 +51,6 @@ CCL_NAMESPACE_BEGIN DeviceTypeMask BlenderSession::device_override = DEVICE_MASK_ALL; bool BlenderSession::headless = false; -int BlenderSession::num_resumable_chunks = 0; -int BlenderSession::current_resumable_chunk = 0; -int BlenderSession::start_resumable_chunk = 0; -int BlenderSession::end_resumable_chunk = 0; bool 
BlenderSession::print_render_stats = false; BlenderSession::BlenderSession(BL::RenderEngine &b_engine, @@ -103,7 +101,9 @@ BlenderSession::BlenderSession(BL::RenderEngine &b_engine, width(width), height(height), preview_osl(false), - python_thread_state(NULL) + python_thread_state(NULL), + use_developer_ui(b_userpref.experimental().use_cycles_debug() && + b_userpref.view().show_developer_ui()) { /* 3d view render */ background = false; @@ -119,10 +119,10 @@ BlenderSession::~BlenderSession() void BlenderSession::create_session() { - SessionParams session_params = BlenderSync::get_session_params( + const SessionParams session_params = BlenderSync::get_session_params( b_engine, b_userpref, b_scene, background); - SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background); - bool session_pause = BlenderSync::get_session_pause(b_scene, background); + const SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background); + const bool session_pause = BlenderSync::get_session_pause(b_scene, background); /* reset status/progress */ last_status = ""; @@ -131,20 +131,18 @@ void BlenderSession::create_session() start_resize_time = 0.0; /* create session */ - session = new Session(session_params); - session->scene = scene; + session = new Session(session_params, scene_params); session->progress.set_update_callback(function_bind(&BlenderSession::tag_redraw, this)); session->progress.set_cancel_callback(function_bind(&BlenderSession::test_cancel, this)); session->set_pause(session_pause); /* create scene */ - scene = new Scene(scene_params, session->device); + scene = session->scene; scene->name = b_scene.name(); - session->scene = scene; - /* create sync */ - sync = new BlenderSync(b_engine, b_data, b_scene, scene, !background, session->progress); + sync = new BlenderSync( + b_engine, b_data, b_scene, scene, !background, use_developer_ui, session->progress); BL::Object b_camera_override(b_engine.camera_override()); if (b_v3d) { 
sync->sync_view(b_v3d, b_rv3d, width, height); @@ -154,13 +152,23 @@ void BlenderSession::create_session() } /* set buffer parameters */ - BufferParams buffer_params = BlenderSync::get_buffer_params( - b_v3d, b_rv3d, scene->camera, width, height, session_params.denoising.use); - session->reset(buffer_params, session_params.samples); + const BufferParams buffer_params = BlenderSync::get_buffer_params( + b_v3d, b_rv3d, scene->camera, width, height); + session->reset(session_params, buffer_params); - b_engine.use_highlight_tiles(session_params.progressive_refine == false); + /* Create GPU display. */ + if (!b_engine.is_preview() && !headless) { + session->set_gpu_display(make_unique<BlenderGPUDisplay>(b_engine, b_scene)); + } - update_resumable_tile_manager(session_params.samples); + /* Viewport and preview (as in, material preview) does not do tiled rendering, so can inform + * engine that no tracking of the tiles state is needed. + * The offline rendering will make a decision when tile is being written. The penalty of asking + * the engine to keep track of tiles state is minimal, so there is nothing to worry about here + * about possible single-tiled final render. 
*/ + if (!b_engine.is_preview() && !b_v3d) { + b_engine.use_highlight_tiles(true); + } } void BlenderSession::reset_session(BL::BlendData &b_data, BL::Depsgraph &b_depsgraph) @@ -202,9 +210,9 @@ void BlenderSession::reset_session(BL::BlendData &b_data, BL::Depsgraph &b_depsg return; } - SessionParams session_params = BlenderSync::get_session_params( + const SessionParams session_params = BlenderSync::get_session_params( b_engine, b_userpref, b_scene, background); - SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background); + const SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background); if (scene->params.modified(scene_params) || session->params.modified(session_params) || !this->b_render.use_persistent_data()) { @@ -220,8 +228,6 @@ void BlenderSession::reset_session(BL::BlendData &b_data, BL::Depsgraph &b_depsg session->progress.reset(); - session->tile_manager.set_tile_order(session_params.tile_order); - /* peak memory usage should show current render peak, not peak for all renders * made by this render session */ @@ -230,7 +236,8 @@ void BlenderSession::reset_session(BL::BlendData &b_data, BL::Depsgraph &b_depsg if (is_new_session) { /* Sync object should be re-created for new scene. */ delete sync; - sync = new BlenderSync(b_engine, b_data, b_scene, scene, !background, session->progress); + sync = new BlenderSync( + b_engine, b_data, b_scene, scene, !background, use_developer_ui, session->progress); } else { /* Sync recalculations to do just the required updates. 
*/ @@ -242,103 +249,85 @@ void BlenderSession::reset_session(BL::BlendData &b_data, BL::Depsgraph &b_depsg BL::SpaceView3D b_null_space_view3d(PointerRNA_NULL); BL::RegionView3D b_null_region_view3d(PointerRNA_NULL); - BufferParams buffer_params = BlenderSync::get_buffer_params(b_null_space_view3d, - b_null_region_view3d, - scene->camera, - width, - height, - session_params.denoising.use); - session->reset(buffer_params, session_params.samples); - - b_engine.use_highlight_tiles(session_params.progressive_refine == false); + const BufferParams buffer_params = BlenderSync::get_buffer_params( + b_null_space_view3d, b_null_region_view3d, scene->camera, width, height); + session->reset(session_params, buffer_params); /* reset time */ start_resize_time = 0.0; + + { + thread_scoped_lock lock(draw_state_.mutex); + draw_state_.last_pass_index = -1; + } } void BlenderSession::free_session() { - session->cancel(); + if (session) { + session->cancel(true); + } delete sync; + sync = nullptr; + delete session; + session = nullptr; } -static ShaderEvalType get_shader_type(const string &pass_type) +void BlenderSession::read_render_tile() { - const char *shader_type = pass_type.c_str(); + const int2 tile_offset = session->get_render_tile_offset(); + const int2 tile_size = session->get_render_tile_size(); - /* data passes */ - if (strcmp(shader_type, "NORMAL") == 0) - return SHADER_EVAL_NORMAL; - else if (strcmp(shader_type, "UV") == 0) - return SHADER_EVAL_UV; - else if (strcmp(shader_type, "ROUGHNESS") == 0) - return SHADER_EVAL_ROUGHNESS; - else if (strcmp(shader_type, "DIFFUSE_COLOR") == 0) - return SHADER_EVAL_DIFFUSE_COLOR; - else if (strcmp(shader_type, "GLOSSY_COLOR") == 0) - return SHADER_EVAL_GLOSSY_COLOR; - else if (strcmp(shader_type, "TRANSMISSION_COLOR") == 0) - return SHADER_EVAL_TRANSMISSION_COLOR; - else if (strcmp(shader_type, "EMIT") == 0) - return SHADER_EVAL_EMISSION; + /* get render result */ + BL::RenderResult b_rr = b_engine.begin_result(tile_offset.x, + 
tile_offset.y, + tile_size.x, + tile_size.y, + b_rlay_name.c_str(), + b_rview_name.c_str()); - /* light passes */ - else if (strcmp(shader_type, "AO") == 0) - return SHADER_EVAL_AO; - else if (strcmp(shader_type, "COMBINED") == 0) - return SHADER_EVAL_COMBINED; - else if (strcmp(shader_type, "SHADOW") == 0) - return SHADER_EVAL_SHADOW; - else if (strcmp(shader_type, "DIFFUSE") == 0) - return SHADER_EVAL_DIFFUSE; - else if (strcmp(shader_type, "GLOSSY") == 0) - return SHADER_EVAL_GLOSSY; - else if (strcmp(shader_type, "TRANSMISSION") == 0) - return SHADER_EVAL_TRANSMISSION; + /* can happen if the intersected rectangle gives 0 width or height */ + if (b_rr.ptr.data == NULL) { + return; + } - /* extra */ - else if (strcmp(shader_type, "ENVIRONMENT") == 0) - return SHADER_EVAL_ENVIRONMENT; + BL::RenderResult::layers_iterator b_single_rlay; + b_rr.layers.begin(b_single_rlay); - else - return SHADER_EVAL_BAKE; -} + /* layer will be missing if it was disabled in the UI */ + if (b_single_rlay == b_rr.layers.end()) + return; -static BL::RenderResult begin_render_result(BL::RenderEngine &b_engine, - int x, - int y, - int w, - int h, - const char *layername, - const char *viewname) -{ - return b_engine.begin_result(x, y, w, h, layername, viewname); -} + BL::RenderLayer b_rlay = *b_single_rlay; -static void end_render_result(BL::RenderEngine &b_engine, - BL::RenderResult &b_rr, - bool cancel, - bool highlight, - bool do_merge_results) -{ - b_engine.end_result(b_rr, (int)cancel, (int)highlight, (int)do_merge_results); + vector<float> pixels(tile_size.x * tile_size.y * 4); + + /* Copy each pass. + * TODO:copy only the required ones for better performance? 
*/ + for (BL::RenderPass &b_pass : b_rlay.passes) { + session->set_render_tile_pixels(b_pass.name(), b_pass.channels(), (float *)b_pass.rect()); + } } -void BlenderSession::do_write_update_render_tile(RenderTile &rtile, - bool do_update_only, - bool do_read_only, - bool highlight) +void BlenderSession::write_render_tile() { - int x = rtile.x - session->tile_manager.params.full_x; - int y = rtile.y - session->tile_manager.params.full_y; - int w = rtile.w; - int h = rtile.h; + const int2 tile_offset = session->get_render_tile_offset(); + const int2 tile_size = session->get_render_tile_size(); + + const string_view render_layer_name = session->get_render_tile_layer(); + const string_view render_view_name = session->get_render_tile_view(); + + b_engine.tile_highlight_clear_all(); /* get render result */ - BL::RenderResult b_rr = begin_render_result( - b_engine, x, y, w, h, b_rlay_name.c_str(), b_rview_name.c_str()); + BL::RenderResult b_rr = b_engine.begin_result(tile_offset.x, + tile_offset.y, + tile_size.x, + tile_size.y, + render_layer_name.c_str(), + render_view_name.c_str()); /* can happen if the intersected rectangle gives 0 width or height */ if (b_rr.ptr.data == NULL) { @@ -349,64 +338,34 @@ void BlenderSession::do_write_update_render_tile(RenderTile &rtile, b_rr.layers.begin(b_single_rlay); /* layer will be missing if it was disabled in the UI */ - if (b_single_rlay == b_rr.layers.end()) + if (b_single_rlay == b_rr.layers.end()) { return; + } BL::RenderLayer b_rlay = *b_single_rlay; - if (do_read_only) { - /* copy each pass */ - for (BL::RenderPass &b_pass : b_rlay.passes) { - /* find matching pass type */ - PassType pass_type = BlenderSync::get_pass_type(b_pass); - int components = b_pass.channels(); - - rtile.buffers->set_pass_rect( - pass_type, components, (float *)b_pass.rect(), rtile.num_samples); - } - - end_render_result(b_engine, b_rr, false, false, false); - } - else if (do_update_only) { - /* Sample would be zero at initial tile update, which is only 
needed - * to tag tile form blender side as IN PROGRESS for proper highlight - * no buffers should be sent to blender yet. For denoise we also - * keep showing the noisy buffers until denoise is done. */ - bool merge = (rtile.sample != 0) && (rtile.task != RenderTile::DENOISE); + write_render_result(b_rlay); - if (merge) { - update_render_result(b_rlay, rtile); - } - - end_render_result(b_engine, b_rr, true, highlight, merge); - } - else { - /* Write final render result. */ - write_render_result(b_rlay, rtile); - end_render_result(b_engine, b_rr, false, false, true); - } + b_engine.end_result(b_rr, true, false, true); } -void BlenderSession::read_render_tile(RenderTile &rtile) +void BlenderSession::update_render_tile() { - do_write_update_render_tile(rtile, false, true, false); -} + if (!session->has_multiple_render_tiles()) { + /* Don't highlight full-frame tile. */ + return; + } -void BlenderSession::write_render_tile(RenderTile &rtile) -{ - do_write_update_render_tile(rtile, false, false, false); + const int2 tile_offset = session->get_render_tile_offset(); + const int2 tile_size = session->get_render_tile_size(); + + b_engine.tile_highlight_clear_all(); + b_engine.tile_highlight_set(tile_offset.x, tile_offset.y, tile_size.x, tile_size.y, true); } -void BlenderSession::update_render_tile(RenderTile &rtile, bool highlight) +void BlenderSession::full_buffer_written(string_view filename) { - /* use final write for preview renders, otherwise render result wouldn't be - * be updated in blender side - * would need to be investigated a bit further, but for now shall be fine - */ - if (!b_engine.is_preview()) - do_write_update_render_tile(rtile, true, false, highlight); - else - do_write_update_render_tile(rtile, false, false, false); + full_buffer_files_.emplace_back(filename); } static void add_cryptomatte_layer(BL::RenderResult &b_rr, string name, string manifest) @@ -430,12 +389,15 @@ void BlenderSession::stamp_view_layer_metadata(Scene *scene, const string &view_ 
to_string(session->params.samples).c_str()); /* Store ranged samples information. */ + /* TODO(sergey): Need to bring this information back. */ +#if 0 if (session->tile_manager.range_num_samples != -1) { b_rr.stamp_data_add_field((prefix + "range_start_sample").c_str(), to_string(session->tile_manager.range_start_sample).c_str()); b_rr.stamp_data_add_field((prefix + "range_num_samples").c_str(), to_string(session->tile_manager.range_num_samples).c_str()); } +#endif /* Write cryptomatte metadata. */ if (scene->film->get_cryptomatte_passes() & CRYPT_OBJECT) { @@ -475,38 +437,44 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_) } /* set callback to write out render results */ - session->write_render_tile_cb = function_bind(&BlenderSession::write_render_tile, this, _1); - session->update_render_tile_cb = function_bind( - &BlenderSession::update_render_tile, this, _1, _2); + session->write_render_tile_cb = [&]() { write_render_tile(); }; + + /* Use final write for preview renders, otherwise render result wouldn't be be updated on Blender + * side. */ + /* TODO(sergey): Investigate whether GPUDisplay can be used for the preview as well. 
*/ + if (b_engine.is_preview()) { + session->update_render_tile_cb = [&]() { write_render_tile(); }; + } + else { + session->update_render_tile_cb = [&]() { update_render_tile(); }; + } + + session->full_buffer_written_cb = [&](string_view filename) { full_buffer_written(filename); }; BL::ViewLayer b_view_layer = b_depsgraph.view_layer_eval(); /* get buffer parameters */ - SessionParams session_params = BlenderSync::get_session_params( - b_engine, b_userpref, b_scene, background, b_view_layer); + const SessionParams session_params = BlenderSync::get_session_params( + b_engine, b_userpref, b_scene, background); BufferParams buffer_params = BlenderSync::get_buffer_params( - b_v3d, b_rv3d, scene->camera, width, height, session_params.denoising.use); + b_v3d, b_rv3d, scene->camera, width, height); /* temporary render result to find needed passes and views */ - BL::RenderResult b_rr = begin_render_result( - b_engine, 0, 0, 1, 1, b_view_layer.name().c_str(), NULL); + BL::RenderResult b_rr = b_engine.begin_result(0, 0, 1, 1, b_view_layer.name().c_str(), NULL); BL::RenderResult::layers_iterator b_single_rlay; b_rr.layers.begin(b_single_rlay); BL::RenderLayer b_rlay = *b_single_rlay; - b_rlay_name = b_view_layer.name(); - /* Update denoising parameters. */ - session->set_denoising(session_params.denoising); + { + thread_scoped_lock lock(draw_state_.mutex); + b_rlay_name = b_view_layer.name(); - /* Compute render passes and film settings. */ - vector<Pass> passes = sync->sync_render_passes( - b_scene, b_rlay, b_view_layer, session_params.adaptive_sampling, session_params.denoising); + /* Signal that the display pass is to be updated. */ + draw_state_.last_pass_index = -1; + } - /* Set buffer params, using film settings from sync_render_passes. 
*/ - buffer_params.passes = passes; - buffer_params.denoising_data_pass = scene->film->get_denoising_data_pass(); - buffer_params.denoising_clean_pass = scene->film->get_denoising_clean_pass(); - buffer_params.denoising_prefiltered_pass = scene->film->get_denoising_prefiltered_pass(); + /* Compute render passes and film settings. */ + sync->sync_render_passes(b_rlay, b_view_layer); BL::RenderResult::views_iterator b_view_iter; @@ -520,6 +488,9 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_) ++b_view_iter, ++view_index) { b_rview_name = b_view_iter->name(); + buffer_params.layer = b_view_layer.name(); + buffer_params.view = b_rview_name; + /* set the current view */ b_engine.active_view_set(b_rview_name.c_str()); @@ -549,20 +520,16 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_) } /* Update number of samples per layer. */ - int samples = sync->get_layer_samples(); - bool bound_samples = sync->get_layer_bound_samples(); - int effective_layer_samples; + const int samples = sync->get_layer_samples(); + const bool bound_samples = sync->get_layer_bound_samples(); - if (samples != 0 && (!bound_samples || (samples < session_params.samples))) - effective_layer_samples = samples; - else - effective_layer_samples = session_params.samples; - - /* Update tile manager if we're doing resumable render. */ - update_resumable_tile_manager(effective_layer_samples); + SessionParams effective_session_params = session_params; + if (samples != 0 && (!bound_samples || (samples < session_params.samples))) { + effective_session_params.samples = samples; + } /* Update session itself. 
*/ - session->reset(buffer_params, effective_layer_samples); + session->reset(effective_session_params, buffer_params); /* render */ if (!b_engine.is_preview() && background && print_render_stats) { @@ -586,65 +553,146 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_) stamp_view_layer_metadata(scene, b_rlay_name); /* free result without merging */ - end_render_result(b_engine, b_rr, true, true, false); + b_engine.end_result(b_rr, true, false, false); double total_time, render_time; session->progress.get_time(total_time, render_time); VLOG(1) << "Total render time: " << total_time; VLOG(1) << "Render time (without synchronization): " << render_time; +} + +void BlenderSession::render_frame_finish() +{ + /* Processing of all layers and views is done. Clear the strings so that we can communicate + * progress about reading files and denoising them. */ + b_rlay_name = ""; + b_rview_name = ""; + + if (!b_render.use_persistent_data()) { + /* Free the sync object so that it can properly dereference nodes from the scene graph before + * the graph is freed. 
*/ + delete sync; + sync = nullptr; + + session->device_free(); + } + + for (string_view filename : full_buffer_files_) { + session->process_full_buffer_from_disk(filename); + path_remove(filename); + } /* clear callback */ session->write_render_tile_cb = function_null; session->update_render_tile_cb = function_null; + session->full_buffer_written_cb = function_null; } -static int bake_pass_filter_get(const int pass_filter) +static PassType bake_type_to_pass(const string &bake_type_str, const int bake_filter) { - int flag = BAKE_FILTER_NONE; - - if ((pass_filter & BL::BakeSettings::pass_filter_DIRECT) != 0) - flag |= BAKE_FILTER_DIRECT; - if ((pass_filter & BL::BakeSettings::pass_filter_INDIRECT) != 0) - flag |= BAKE_FILTER_INDIRECT; - if ((pass_filter & BL::BakeSettings::pass_filter_COLOR) != 0) - flag |= BAKE_FILTER_COLOR; - - if ((pass_filter & BL::BakeSettings::pass_filter_DIFFUSE) != 0) - flag |= BAKE_FILTER_DIFFUSE; - if ((pass_filter & BL::BakeSettings::pass_filter_GLOSSY) != 0) - flag |= BAKE_FILTER_GLOSSY; - if ((pass_filter & BL::BakeSettings::pass_filter_TRANSMISSION) != 0) - flag |= BAKE_FILTER_TRANSMISSION; - - if ((pass_filter & BL::BakeSettings::pass_filter_EMIT) != 0) - flag |= BAKE_FILTER_EMISSION; - if ((pass_filter & BL::BakeSettings::pass_filter_AO) != 0) - flag |= BAKE_FILTER_AO; - - return flag; + const char *bake_type = bake_type_str.c_str(); + + /* data passes */ + if (strcmp(bake_type, "POSITION") == 0) { + return PASS_POSITION; + } + else if (strcmp(bake_type, "NORMAL") == 0) { + return PASS_NORMAL; + } + else if (strcmp(bake_type, "UV") == 0) { + return PASS_UV; + } + else if (strcmp(bake_type, "ROUGHNESS") == 0) { + return PASS_ROUGHNESS; + } + else if (strcmp(bake_type, "EMIT") == 0) { + return PASS_EMISSION; + } + /* light passes */ + else if (strcmp(bake_type, "AO") == 0) { + return PASS_AO; + } + else if (strcmp(bake_type, "COMBINED") == 0) { + return PASS_COMBINED; + } + else if (strcmp(bake_type, "SHADOW") == 0) { + return 
PASS_SHADOW; + } + else if (strcmp(bake_type, "DIFFUSE") == 0) { + if ((bake_filter & BL::BakeSettings::pass_filter_DIRECT) && + bake_filter & BL::BakeSettings::pass_filter_INDIRECT) { + return PASS_DIFFUSE; + } + else if (bake_filter & BL::BakeSettings::pass_filter_DIRECT) { + return PASS_DIFFUSE_DIRECT; + } + else if (bake_filter & BL::BakeSettings::pass_filter_INDIRECT) { + return PASS_DIFFUSE_INDIRECT; + } + else { + return PASS_DIFFUSE_COLOR; + } + } + else if (strcmp(bake_type, "GLOSSY") == 0) { + if ((bake_filter & BL::BakeSettings::pass_filter_DIRECT) && + bake_filter & BL::BakeSettings::pass_filter_INDIRECT) { + return PASS_GLOSSY; + } + else if (bake_filter & BL::BakeSettings::pass_filter_DIRECT) { + return PASS_GLOSSY_DIRECT; + } + else if (bake_filter & BL::BakeSettings::pass_filter_INDIRECT) { + return PASS_GLOSSY_INDIRECT; + } + else { + return PASS_GLOSSY_COLOR; + } + } + else if (strcmp(bake_type, "TRANSMISSION") == 0) { + if ((bake_filter & BL::BakeSettings::pass_filter_DIRECT) && + bake_filter & BL::BakeSettings::pass_filter_INDIRECT) { + return PASS_TRANSMISSION; + } + else if (bake_filter & BL::BakeSettings::pass_filter_DIRECT) { + return PASS_TRANSMISSION_DIRECT; + } + else if (bake_filter & BL::BakeSettings::pass_filter_INDIRECT) { + return PASS_TRANSMISSION_INDIRECT; + } + else { + return PASS_TRANSMISSION_COLOR; + } + } + /* extra */ + else if (strcmp(bake_type, "ENVIRONMENT") == 0) { + return PASS_BACKGROUND; + } + + return PASS_COMBINED; } void BlenderSession::bake(BL::Depsgraph &b_depsgraph_, BL::Object &b_object, - const string &pass_type, - const int pass_filter, + const string &bake_type, + const int bake_filter, const int bake_width, const int bake_height) { b_depsgraph = b_depsgraph_; - ShaderEvalType shader_type = get_shader_type(pass_type); - int bake_pass_filter = bake_pass_filter_get(pass_filter); - /* Initialize bake manager, before we load the baking kernels. 
*/ - scene->bake_manager->set(scene, b_object.name(), shader_type, bake_pass_filter); + scene->bake_manager->set(scene, b_object.name()); - /* Passes are identified by name, so in order to return the combined pass we need to set the - * name. */ - Pass::add(PASS_COMBINED, scene->passes, "Combined"); + /* Add render pass that we want to bake, and name it Combined so that it is + * used as that on the Blender side. */ + Pass *pass = scene->create_node<Pass>(); + pass->set_name(ustring("Combined")); + pass->set_type(bake_type_to_pass(bake_type, bake_filter)); + pass->set_include_albedo((bake_filter & BL::BakeSettings::pass_filter_COLOR)); - session->read_bake_tile_cb = function_bind(&BlenderSession::read_render_tile, this, _1); - session->write_render_tile_cb = function_bind(&BlenderSession::write_render_tile, this, _1); + session->read_render_tile_cb = [&]() { read_render_tile(); }; + session->write_render_tile_cb = [&]() { write_render_tile(); }; + session->set_gpu_display(nullptr); if (!session->progress.get_cancel()) { /* Sync scene. */ @@ -667,18 +715,15 @@ void BlenderSession::bake(BL::Depsgraph &b_depsgraph_, if (object_found && !session->progress.get_cancel()) { /* Get session and buffer parameters. */ - SessionParams session_params = BlenderSync::get_session_params( + const SessionParams session_params = BlenderSync::get_session_params( b_engine, b_userpref, b_scene, background); - session_params.progressive_refine = false; BufferParams buffer_params; buffer_params.width = bake_width; buffer_params.height = bake_height; - buffer_params.passes = scene->passes; /* Update session. 
*/ - session->tile_manager.set_samples(session_params.samples); - session->reset(buffer_params, session_params.samples); + session->reset(session_params, buffer_params); session->progress.set_update_callback( function_bind(&BlenderSession::update_bake_progress, this)); @@ -690,71 +735,43 @@ void BlenderSession::bake(BL::Depsgraph &b_depsgraph_, session->wait(); } - session->read_bake_tile_cb = function_null; + session->read_render_tile_cb = function_null; session->write_render_tile_cb = function_null; } -void BlenderSession::do_write_update_render_result(BL::RenderLayer &b_rlay, - RenderTile &rtile, - bool do_update_only) +void BlenderSession::write_render_result(BL::RenderLayer &b_rlay) { - RenderBuffers *buffers = rtile.buffers; - - /* copy data from device */ - if (!buffers->copy_from_device()) + if (!session->copy_render_tile_from_device()) { return; - - float exposure = scene->film->get_exposure(); - - vector<float> pixels(rtile.w * rtile.h * 4); - - /* Adjust absolute sample number to the range. */ - int sample = rtile.sample; - const int range_start_sample = session->tile_manager.range_start_sample; - if (range_start_sample != -1) { - sample -= range_start_sample; } - if (!do_update_only) { - /* copy each pass */ - for (BL::RenderPass &b_pass : b_rlay.passes) { - int components = b_pass.channels(); - - /* Copy pixels from regular render passes. */ - bool read = buffers->get_pass_rect(b_pass.name(), exposure, sample, components, &pixels[0]); - - /* If denoising pass, */ - if (!read) { - int denoising_offset = BlenderSync::get_denoising_pass(b_pass); - if (denoising_offset >= 0) { - read = buffers->get_denoising_pass_rect( - denoising_offset, exposure, sample, components, &pixels[0]); - } - } + const int2 tile_size = session->get_render_tile_size(); + vector<float> pixels(tile_size.x * tile_size.y * 4); - if (!read) { - memset(&pixels[0], 0, pixels.size() * sizeof(float)); - } - - b_pass.rect(&pixels[0]); + /* Copy each pass. 
*/ + for (BL::RenderPass &b_pass : b_rlay.passes) { + if (!session->get_render_tile_pixels(b_pass.name(), b_pass.channels(), &pixels[0])) { + memset(&pixels[0], 0, pixels.size() * sizeof(float)); } - } - else { - /* copy combined pass */ - BL::RenderPass b_combined_pass(b_rlay.passes.find_by_name("Combined", b_rview_name.c_str())); - if (buffers->get_pass_rect("Combined", exposure, sample, 4, &pixels[0])) - b_combined_pass.rect(&pixels[0]); + + b_pass.rect(&pixels[0]); } } -void BlenderSession::write_render_result(BL::RenderLayer &b_rlay, RenderTile &rtile) +void BlenderSession::update_render_result(BL::RenderLayer &b_rlay) { - do_write_update_render_result(b_rlay, rtile, false); -} + if (!session->copy_render_tile_from_device()) { + return; + } -void BlenderSession::update_render_result(BL::RenderLayer &b_rlay, RenderTile &rtile) -{ - do_write_update_render_result(b_rlay, rtile, true); + const int2 tile_size = session->get_render_tile_size(); + vector<float> pixels(tile_size.x * tile_size.y * 4); + + /* Copy combined pass. 
*/ + BL::RenderPass b_combined_pass(b_rlay.passes.find_by_name("Combined", b_rview_name.c_str())); + if (session->get_render_tile_pixels("Combined", b_combined_pass.channels(), &pixels[0])) { + b_combined_pass.rect(&pixels[0]); + } } void BlenderSession::synchronize(BL::Depsgraph &b_depsgraph_) @@ -764,19 +781,19 @@ void BlenderSession::synchronize(BL::Depsgraph &b_depsgraph_) return; /* on session/scene parameter changes, we recreate session entirely */ - SessionParams session_params = BlenderSync::get_session_params( + const SessionParams session_params = BlenderSync::get_session_params( b_engine, b_userpref, b_scene, background); - SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background); - bool session_pause = BlenderSync::get_session_pause(b_scene, background); + const SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background); + const bool session_pause = BlenderSync::get_session_pause(b_scene, background); if (session->params.modified(session_params) || scene->params.modified(scene_params)) { free_session(); create_session(); } - /* increase samples, but never decrease */ + /* increase samples and render time, but never decrease */ session->set_samples(session_params.samples); - session->set_denoising_start_sample(session_params.denoising.start_sample); + session->set_time_limit(session_params.time_limit); session->set_pause(session_pause); /* copy recalc flags, outside of mutex so we can decide to do the real @@ -808,21 +825,12 @@ void BlenderSession::synchronize(BL::Depsgraph &b_depsgraph_) sync->sync_camera(b_render, b_camera_override, width, height, ""); /* get buffer parameters */ - BufferParams buffer_params = BlenderSync::get_buffer_params( - b_v3d, b_rv3d, scene->camera, width, height, session_params.denoising.use); - - if (!buffer_params.denoising_data_pass) { - session_params.denoising.use = false; - } - - session->set_denoising(session_params.denoising); - - /* Update film if denoising data was enabled or 
disabled. */ - scene->film->set_denoising_data_pass(buffer_params.denoising_data_pass); + const BufferParams buffer_params = BlenderSync::get_buffer_params( + b_v3d, b_rv3d, scene->camera, width, height); /* reset if needed */ if (scene->need_reset()) { - session->reset(buffer_params, session_params.samples); + session->reset(session_params, buffer_params); /* After session reset, so device is not accessing image data anymore. */ builtin_images_load(); @@ -839,7 +847,41 @@ void BlenderSession::synchronize(BL::Depsgraph &b_depsgraph_) session->start(); } -bool BlenderSession::draw(int w, int h) +void BlenderSession::draw(BL::SpaceImageEditor &space_image) +{ + if (!session || !session->scene) { + /* Offline render drawing does not force the render engine update, which means it's possible + * that the Session is not created yet. */ + return; + } + + thread_scoped_lock lock(draw_state_.mutex); + + const int pass_index = space_image.image_user().multilayer_pass(); + if (pass_index != draw_state_.last_pass_index) { + BL::RenderPass b_display_pass(b_engine.pass_by_index_get(b_rlay_name.c_str(), pass_index)); + if (!b_display_pass) { + return; + } + + Scene *scene = session->scene; + + thread_scoped_lock lock(scene->mutex); + + const Pass *pass = Pass::find(scene->passes, b_display_pass.name()); + if (!pass) { + return; + } + + scene->film->set_display_pass(pass->get_type()); + + draw_state_.last_pass_index = pass_index; + } + + session->draw(); +} + +void BlenderSession::view_draw(int w, int h) { /* pause in redraw in case update is not being called due to final render */ session->set_pause(BlenderSync::get_session_pause(b_scene, background)); @@ -885,14 +927,14 @@ bool BlenderSession::draw(int w, int h) /* reset if requested */ if (reset) { - SessionParams session_params = BlenderSync::get_session_params( + const SessionParams session_params = BlenderSync::get_session_params( b_engine, b_userpref, b_scene, background); - BufferParams buffer_params = 
BlenderSync::get_buffer_params( - b_v3d, b_rv3d, scene->camera, width, height, session_params.denoising.use); - bool session_pause = BlenderSync::get_session_pause(b_scene, background); + const BufferParams buffer_params = BlenderSync::get_buffer_params( + b_v3d, b_rv3d, scene->camera, width, height); + const bool session_pause = BlenderSync::get_session_pause(b_scene, background); if (session_pause == false) { - session->reset(buffer_params, session_params.samples); + session->reset(session_params, buffer_params); start_resize_time = 0.0; } } @@ -905,18 +947,7 @@ bool BlenderSession::draw(int w, int h) update_status_progress(); /* draw */ - BufferParams buffer_params = BlenderSync::get_buffer_params( - b_v3d, b_rv3d, scene->camera, width, height, session->params.denoising.use); - DeviceDrawParams draw_params; - - if (session->params.display_buffer_linear) { - draw_params.bind_display_space_shader_cb = function_bind( - &BL::RenderEngine::bind_display_space_shader, &b_engine, b_scene); - draw_params.unbind_display_space_shader_cb = function_bind( - &BL::RenderEngine::unbind_display_space_shader, &b_engine); - } - - return !session->draw(buffer_params, draw_params); + session->draw(); } void BlenderSession::get_status(string &status, string &substatus) @@ -924,11 +955,6 @@ void BlenderSession::get_status(string &status, string &substatus) session->progress.get_status(status, substatus); } -void BlenderSession::get_kernel_status(string &kernel_status) -{ - session->progress.get_kernel_status(kernel_status); -} - void BlenderSession::get_progress(float &progress, double &total_time, double &render_time) { session->progress.get_time(total_time, render_time); @@ -947,7 +973,7 @@ void BlenderSession::update_bake_progress() void BlenderSession::update_status_progress() { - string timestatus, status, substatus, kernel_status; + string timestatus, status, substatus; string scene_status = ""; float progress; double total_time, remaining_time = 0, render_time; @@ -955,7 +981,6 
@@ void BlenderSession::update_status_progress() float mem_peak = (float)session->stats.mem_peak / 1024.0f / 1024.0f; get_status(status, substatus); - get_kernel_status(kernel_status); get_progress(progress, total_time, render_time); if (progress > 0) @@ -980,14 +1005,12 @@ void BlenderSession::update_status_progress() status = " | " + status; if (substatus.size() > 0) status += " | " + substatus; - if (kernel_status.size() > 0) - status += " | " + kernel_status; } double current_time = time_dt(); - /* When rendering in a window, redraw the status at least once per second to keep the elapsed and - * remaining time up-to-date. For headless rendering, only report when something significant - * changes to keep the console output readable. */ + /* When rendering in a window, redraw the status at least once per second to keep the elapsed + * and remaining time up-to-date. For headless rendering, only report when something + * significant changes to keep the console output readable. */ if (status != last_status || (!headless && (current_time - last_status_time) > 1.0)) { b_engine.update_stats("", (timestatus + scene_status + status).c_str()); b_engine.update_memory_stats(mem_used, mem_peak); @@ -1048,56 +1071,6 @@ void BlenderSession::test_cancel() session->progress.set_cancel("Cancelled"); } -void BlenderSession::update_resumable_tile_manager(int num_samples) -{ - const int num_resumable_chunks = BlenderSession::num_resumable_chunks, - current_resumable_chunk = BlenderSession::current_resumable_chunk; - if (num_resumable_chunks == 0) { - return; - } - - if (num_resumable_chunks > num_samples) { - fprintf(stderr, - "Cycles warning: more sample chunks (%d) than samples (%d), " - "this will cause some samples to be included in multiple chunks.\n", - num_resumable_chunks, - num_samples); - } - - const float num_samples_per_chunk = (float)num_samples / num_resumable_chunks; - - float range_start_sample, range_num_samples; - if (current_resumable_chunk != 0) { - /* Single 
chunk rendering. */ - range_start_sample = num_samples_per_chunk * (current_resumable_chunk - 1); - range_num_samples = num_samples_per_chunk; - } - else { - /* Ranged-chunks. */ - const int num_chunks = end_resumable_chunk - start_resumable_chunk + 1; - range_start_sample = num_samples_per_chunk * (start_resumable_chunk - 1); - range_num_samples = num_chunks * num_samples_per_chunk; - } - - /* Round after doing the multiplications with num_chunks and num_samples_per_chunk - * to allow for many small chunks. */ - int rounded_range_start_sample = (int)floorf(range_start_sample + 0.5f); - int rounded_range_num_samples = max((int)floorf(range_num_samples + 0.5f), 1); - - /* Make sure we don't overshoot. */ - if (rounded_range_start_sample + rounded_range_num_samples > num_samples) { - rounded_range_num_samples = num_samples - rounded_range_num_samples; - } - - VLOG(1) << "Samples range start is " << range_start_sample << ", " - << "number of samples to render is " << range_num_samples; - - scene->integrator->set_start_sample(rounded_range_start_sample); - - session->tile_manager.range_start_sample = rounded_range_start_sample; - session->tile_manager.range_num_samples = rounded_range_num_samples; -} - void BlenderSession::free_blender_memory_if_possible() { if (!background) { diff --git a/intern/cycles/blender/blender_session.h b/intern/cycles/blender/blender_session.h index d967b81c854..cf52359ea5d 100644 --- a/intern/cycles/blender/blender_session.h +++ b/intern/cycles/blender/blender_session.h @@ -33,8 +33,6 @@ class BlenderSync; class ImageMetaData; class Scene; class Session; -class RenderBuffers; -class RenderTile; class BlenderSession { public: @@ -62,6 +60,8 @@ class BlenderSession { /* offline render */ void render(BL::Depsgraph &b_depsgraph); + void render_frame_finish(); + void bake(BL::Depsgraph &b_depsgrah, BL::Object &b_object, const string &pass_type, @@ -69,24 +69,29 @@ class BlenderSession { const int bake_width, const int bake_height); - void 
write_render_result(BL::RenderLayer &b_rlay, RenderTile &rtile); - void write_render_tile(RenderTile &rtile); - void read_render_tile(RenderTile &rtile); + void write_render_result(BL::RenderLayer &b_rlay); + void write_render_tile(); + + void update_render_tile(); + + void full_buffer_written(string_view filename); /* update functions are used to update display buffer only after sample was rendered * only needed for better visual feedback */ - void update_render_result(BL::RenderLayer &b_rlay, RenderTile &rtile); - void update_render_tile(RenderTile &rtile, bool highlight); + void update_render_result(BL::RenderLayer &b_rlay); + + /* read functions for baking input */ + void read_render_tile(); /* interactive updates */ void synchronize(BL::Depsgraph &b_depsgraph); /* drawing */ - bool draw(int w, int h); + void draw(BL::SpaceImageEditor &space_image); + void view_draw(int w, int h); void tag_redraw(); void tag_update(); void get_status(string &status, string &substatus); - void get_kernel_status(string &kernel_status); void get_progress(float &progress, double &total_time, double &render_time); void test_cancel(); void update_status_progress(); @@ -123,6 +128,8 @@ class BlenderSession { void *python_thread_state; + bool use_developer_ui; + /* Global state which is common for all render sessions created from Blender. * Usually denotes command line arguments. */ @@ -134,41 +141,25 @@ class BlenderSession { */ static bool headless; - /* ** Resumable render ** */ - - /* Overall number of chunks in which the sample range is to be divided. */ - static int num_resumable_chunks; - - /* Current resumable chunk index to render. */ - static int current_resumable_chunk; - - /* Alternative to single-chunk rendering to render a range of chunks. 
*/ - static int start_resumable_chunk; - static int end_resumable_chunk; - static bool print_render_stats; protected: void stamp_view_layer_metadata(Scene *scene, const string &view_layer_name); - void do_write_update_render_result(BL::RenderLayer &b_rlay, - RenderTile &rtile, - bool do_update_only); - void do_write_update_render_tile(RenderTile &rtile, - bool do_update_only, - bool do_read_only, - bool highlight); - void builtin_images_load(); - /* Update tile manager to reflect resumable render settings. */ - void update_resumable_tile_manager(int num_samples); - /* Is used after each render layer synchronization is done with the goal * of freeing render engine data which is held from Blender side (for * example, dependency graph). */ void free_blender_memory_if_possible(); + + struct { + thread_mutex mutex; + int last_pass_index = -1; + } draw_state_; + + vector<string> full_buffer_files_; }; CCL_NAMESPACE_END diff --git a/intern/cycles/blender/blender_shader.cpp b/intern/cycles/blender/blender_shader.cpp index de7b2761d00..8c4f789ffd0 100644 --- a/intern/cycles/blender/blender_shader.cpp +++ b/intern/cycles/blender/blender_shader.cpp @@ -17,6 +17,7 @@ #include "render/background.h" #include "render/colorspace.h" #include "render/graph.h" +#include "render/integrator.h" #include "render/light.h" #include "render/nodes.h" #include "render/osl.h" @@ -475,17 +476,11 @@ static ShaderNode *add_node(Scene *scene, SubsurfaceScatteringNode *subsurface = graph->create_node<SubsurfaceScatteringNode>(); switch (b_subsurface_node.falloff()) { - case BL::ShaderNodeSubsurfaceScattering::falloff_CUBIC: - subsurface->set_falloff(CLOSURE_BSSRDF_CUBIC_ID); - break; - case BL::ShaderNodeSubsurfaceScattering::falloff_GAUSSIAN: - subsurface->set_falloff(CLOSURE_BSSRDF_GAUSSIAN_ID); - break; - case BL::ShaderNodeSubsurfaceScattering::falloff_BURLEY: - subsurface->set_falloff(CLOSURE_BSSRDF_BURLEY_ID); + case BL::ShaderNodeSubsurfaceScattering::falloff_RANDOM_WALK_FIXED_RADIUS: + 
subsurface->set_method(CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID); break; case BL::ShaderNodeSubsurfaceScattering::falloff_RANDOM_WALK: - subsurface->set_falloff(CLOSURE_BSSRDF_RANDOM_WALK_ID); + subsurface->set_method(CLOSURE_BSSRDF_RANDOM_WALK_ID); break; } @@ -597,11 +592,11 @@ static ShaderNode *add_node(Scene *scene, break; } switch (b_principled_node.subsurface_method()) { - case BL::ShaderNodeBsdfPrincipled::subsurface_method_BURLEY: - principled->set_subsurface_method(CLOSURE_BSSRDF_PRINCIPLED_ID); + case BL::ShaderNodeBsdfPrincipled::subsurface_method_RANDOM_WALK_FIXED_RADIUS: + principled->set_subsurface_method(CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID); break; case BL::ShaderNodeBsdfPrincipled::subsurface_method_RANDOM_WALK: - principled->set_subsurface_method(CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID); + principled->set_subsurface_method(CLOSURE_BSSRDF_RANDOM_WALK_ID); break; } node = principled; @@ -1360,10 +1355,11 @@ void BlenderSync::sync_materials(BL::Depsgraph &b_depsgraph, bool update_all) void BlenderSync::sync_world(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d, bool update_all) { Background *background = scene->background; + Integrator *integrator = scene->integrator; BL::World b_world = b_scene.world(); - BlenderViewportParameters new_viewport_parameters(b_v3d); + BlenderViewportParameters new_viewport_parameters(b_v3d, use_developer_ui); if (world_recalc || update_all || b_world.ptr.data != world_map || viewport_parameters.shader_modified(new_viewport_parameters)) { @@ -1455,9 +1451,8 @@ void BlenderSync::sync_world(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d, /* AO */ BL::WorldLighting b_light = b_world.light_settings(); - background->set_use_ao(b_light.use_ambient_occlusion()); - background->set_ao_factor(b_light.ao_factor()); - background->set_ao_distance(b_light.distance()); + integrator->set_ao_factor(b_light.ao_factor()); + integrator->set_ao_distance(b_light.distance()); /* visibility */ PointerRNA cvisibility = 
RNA_pointer_get(&b_world.ptr, "cycles_visibility"); @@ -1472,9 +1467,8 @@ void BlenderSync::sync_world(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d, background->set_visibility(visibility); } else { - background->set_use_ao(false); - background->set_ao_factor(0.0f); - background->set_ao_distance(FLT_MAX); + integrator->set_ao_factor(1.0f); + integrator->set_ao_distance(10.0f); } shader->set_graph(graph); @@ -1496,7 +1490,6 @@ void BlenderSync::sync_world(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d, background->set_use_shader(view_layer.use_background_shader || viewport_parameters.use_custom_shader()); - background->set_use_ao(background->get_use_ao() && view_layer.use_background_ao); background->tag_update(scene); } diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp index 26d64b7bf85..d6fc7ee1723 100644 --- a/intern/cycles/blender/blender_sync.cpp +++ b/intern/cycles/blender/blender_sync.cpp @@ -53,6 +53,7 @@ BlenderSync::BlenderSync(BL::RenderEngine &b_engine, BL::Scene &b_scene, Scene *scene, bool preview, + bool use_developer_ui, Progress &progress) : b_engine(b_engine), b_data(b_data), @@ -68,6 +69,7 @@ BlenderSync::BlenderSync(BL::RenderEngine &b_engine, scene(scene), preview(preview), experimental(false), + use_developer_ui(use_developer_ui), dicing_rate(1.0f), max_subdivisions(12), progress(progress), @@ -224,7 +226,7 @@ void BlenderSync::sync_recalc(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d } if (b_v3d) { - BlenderViewportParameters new_viewport_parameters(b_v3d); + BlenderViewportParameters new_viewport_parameters(b_v3d, use_developer_ui); if (viewport_parameters.shader_modified(new_viewport_parameters)) { world_recalc = true; @@ -251,9 +253,13 @@ void BlenderSync::sync_data(BL::RenderSettings &b_render, BL::ViewLayer b_view_layer = b_depsgraph.view_layer_eval(); + /* TODO(sergey): This feels weak to pass view layer to the integrator, and even weaker to have an + * implicit check on whether it 
is a background render or not. What is the nicer thing here? */ + const bool background = !b_v3d; + sync_view_layer(b_view_layer); - sync_integrator(); - sync_film(b_v3d); + sync_integrator(b_view_layer, background); + sync_film(b_view_layer, b_v3d); sync_shaders(b_depsgraph, b_v3d); sync_images(); @@ -280,7 +286,7 @@ void BlenderSync::sync_data(BL::RenderSettings &b_render, /* Integrator */ -void BlenderSync::sync_integrator() +void BlenderSync::sync_integrator(BL::ViewLayer &b_view_layer, bool background) { PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles"); @@ -328,59 +334,24 @@ void BlenderSync::sync_integrator() integrator->set_motion_blur(view_layer.use_motion_blur); } - integrator->set_method((Integrator::Method)get_enum( - cscene, "progressive", Integrator::NUM_METHODS, Integrator::PATH)); - - integrator->set_sample_all_lights_direct(get_boolean(cscene, "sample_all_lights_direct")); - integrator->set_sample_all_lights_indirect(get_boolean(cscene, "sample_all_lights_indirect")); integrator->set_light_sampling_threshold(get_float(cscene, "light_sampling_threshold")); SamplingPattern sampling_pattern = (SamplingPattern)get_enum( cscene, "sampling_pattern", SAMPLING_NUM_PATTERNS, SAMPLING_PATTERN_SOBOL); - - int adaptive_min_samples = INT_MAX; - - if (RNA_boolean_get(&cscene, "use_adaptive_sampling")) { - sampling_pattern = SAMPLING_PATTERN_PMJ; - adaptive_min_samples = get_int(cscene, "adaptive_min_samples"); - integrator->set_adaptive_threshold(get_float(cscene, "adaptive_threshold")); - } - else { - integrator->set_adaptive_threshold(0.0f); - } - integrator->set_sampling_pattern(sampling_pattern); - int diffuse_samples = get_int(cscene, "diffuse_samples"); - int glossy_samples = get_int(cscene, "glossy_samples"); - int transmission_samples = get_int(cscene, "transmission_samples"); - int ao_samples = get_int(cscene, "ao_samples"); - int mesh_light_samples = get_int(cscene, "mesh_light_samples"); - int subsurface_samples = get_int(cscene, 
"subsurface_samples"); - int volume_samples = get_int(cscene, "volume_samples"); - - if (get_boolean(cscene, "use_square_samples")) { - integrator->set_diffuse_samples(diffuse_samples * diffuse_samples); - integrator->set_glossy_samples(glossy_samples * glossy_samples); - integrator->set_transmission_samples(transmission_samples * transmission_samples); - integrator->set_ao_samples(ao_samples * ao_samples); - integrator->set_mesh_light_samples(mesh_light_samples * mesh_light_samples); - integrator->set_subsurface_samples(subsurface_samples * subsurface_samples); - integrator->set_volume_samples(volume_samples * volume_samples); - adaptive_min_samples = min(adaptive_min_samples * adaptive_min_samples, INT_MAX); + if (preview) { + integrator->set_use_adaptive_sampling( + RNA_boolean_get(&cscene, "use_preview_adaptive_sampling")); + integrator->set_adaptive_threshold(get_float(cscene, "preview_adaptive_threshold")); + integrator->set_adaptive_min_samples(get_int(cscene, "preview_adaptive_min_samples")); } else { - integrator->set_diffuse_samples(diffuse_samples); - integrator->set_glossy_samples(glossy_samples); - integrator->set_transmission_samples(transmission_samples); - integrator->set_ao_samples(ao_samples); - integrator->set_mesh_light_samples(mesh_light_samples); - integrator->set_subsurface_samples(subsurface_samples); - integrator->set_volume_samples(volume_samples); + integrator->set_use_adaptive_sampling(RNA_boolean_get(&cscene, "use_adaptive_sampling")); + integrator->set_adaptive_threshold(get_float(cscene, "adaptive_threshold")); + integrator->set_adaptive_min_samples(get_int(cscene, "adaptive_min_samples")); } - integrator->set_adaptive_min_samples(adaptive_min_samples); - if (get_boolean(cscene, "use_fast_gi")) { if (preview) { integrator->set_ao_bounces(get_int(cscene, "ao_bounces")); @@ -393,20 +364,38 @@ void BlenderSync::sync_integrator() integrator->set_ao_bounces(0); } - /* UPDATE_NONE as we don't want to tag the integrator as modified, just tag 
dependent things */ + const DenoiseParams denoise_params = get_denoise_params(b_scene, b_view_layer, background); + integrator->set_use_denoise(denoise_params.use); + + /* Only update denoiser parameters if the denoiser is actually used. This allows to tweak + * denoiser parameters before enabling it without render resetting on every change. The downside + * is that the interface and the integrator are technically out of sync. */ + if (denoise_params.use) { + integrator->set_denoiser_type(denoise_params.type); + integrator->set_denoise_start_sample(denoise_params.start_sample); + integrator->set_use_denoise_pass_albedo(denoise_params.use_pass_albedo); + integrator->set_use_denoise_pass_normal(denoise_params.use_pass_normal); + integrator->set_denoiser_prefilter(denoise_params.prefilter); + } + + /* UPDATE_NONE as we don't want to tag the integrator as modified (this was done by the + * set calls above), but we need to make sure that the dependent things are tagged. */ integrator->tag_update(scene, Integrator::UPDATE_NONE); } /* Film */ -void BlenderSync::sync_film(BL::SpaceView3D &b_v3d) +void BlenderSync::sync_film(BL::ViewLayer &b_view_layer, BL::SpaceView3D &b_v3d) { PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles"); + PointerRNA crl = RNA_pointer_get(&b_view_layer.ptr, "cycles"); Film *film = scene->film; if (b_v3d) { - film->set_display_pass(update_viewport_display_passes(b_v3d, scene->passes)); + const BlenderViewportParameters new_viewport_parameters(b_v3d, use_developer_ui); + film->set_display_pass(new_viewport_parameters.display_pass); + film->set_show_active_pixels(new_viewport_parameters.show_active_pixels); } film->set_exposure(get_float(cscene, "film_exposure")); @@ -434,6 +423,15 @@ void BlenderSync::sync_film(BL::SpaceView3D &b_v3d) break; } } + + /* Blender viewport does not support proper shadow catcher compositing, so force an approximate + * mode to improve visual feedback. 
*/ + if (b_v3d) { + film->set_use_approximate_shadow_catcher(true); + } + else { + film->set_use_approximate_shadow_catcher(!get_boolean(crl, "use_pass_shadow_catcher")); + } } /* Render Layer */ @@ -444,7 +442,6 @@ void BlenderSync::sync_view_layer(BL::ViewLayer &b_view_layer) /* Filter. */ view_layer.use_background_shader = b_view_layer.use_sky(); - view_layer.use_background_ao = b_view_layer.use_ao(); /* Always enable surfaces for baking, otherwise there is nothing to bake to. */ view_layer.use_surfaces = b_view_layer.use_solid() || scene->bake_manager->get_baking(); view_layer.use_hair = b_view_layer.use_strand(); @@ -464,10 +461,7 @@ void BlenderSync::sync_view_layer(BL::ViewLayer &b_view_layer) if (use_layer_samples != 2) { int samples = b_view_layer.samples(); - if (get_boolean(cscene, "use_square_samples")) - view_layer.samples = samples * samples; - else - view_layer.samples = samples; + view_layer.samples = samples; } } @@ -499,7 +493,8 @@ void BlenderSync::sync_images() } /* Passes */ -PassType BlenderSync::get_pass_type(BL::RenderPass &b_pass) + +static PassType get_blender_pass_type(BL::RenderPass &b_pass) { string name = b_pass.name(); #define MAP_PASS(passname, passtype) \ @@ -507,10 +502,15 @@ PassType BlenderSync::get_pass_type(BL::RenderPass &b_pass) return passtype; \ } \ ((void)0) + /* NOTE: Keep in sync with defined names from DNA_scene_types.h */ + MAP_PASS("Combined", PASS_COMBINED); + MAP_PASS("Noisy Image", PASS_COMBINED); + MAP_PASS("Depth", PASS_DEPTH); MAP_PASS("Mist", PASS_MIST); + MAP_PASS("Position", PASS_POSITION); MAP_PASS("Normal", PASS_NORMAL); MAP_PASS("IndexOB", PASS_OBJECT_ID); MAP_PASS("UV", PASS_UV); @@ -539,118 +539,92 @@ PassType BlenderSync::get_pass_type(BL::RenderPass &b_pass) MAP_PASS("BakePrimitive", PASS_BAKE_PRIMITIVE); MAP_PASS("BakeDifferential", PASS_BAKE_DIFFERENTIAL); + MAP_PASS("Denoising Normal", PASS_DENOISING_NORMAL); + MAP_PASS("Denoising Albedo", PASS_DENOISING_ALBEDO); + + MAP_PASS("Shadow Catcher", 
PASS_SHADOW_CATCHER); + MAP_PASS("Noisy Shadow Catcher", PASS_SHADOW_CATCHER); + MAP_PASS("Debug Render Time", PASS_RENDER_TIME); + MAP_PASS("AdaptiveAuxBuffer", PASS_ADAPTIVE_AUX_BUFFER); MAP_PASS("Debug Sample Count", PASS_SAMPLE_COUNT); + if (string_startswith(name, cryptomatte_prefix)) { return PASS_CRYPTOMATTE; } + #undef MAP_PASS return PASS_NONE; } -int BlenderSync::get_denoising_pass(BL::RenderPass &b_pass) +static Pass *pass_add(Scene *scene, + PassType type, + const char *name, + PassMode mode = PassMode::DENOISED) { - string name = b_pass.name(); + Pass *pass = scene->create_node<Pass>(); - if (name == "Noisy Image") - return DENOISING_PASS_PREFILTERED_COLOR; + pass->set_type(type); + pass->set_name(ustring(name)); + pass->set_mode(mode); - if (name.substr(0, 10) != "Denoising ") { - return -1; - } - name = name.substr(10); - -#define MAP_PASS(passname, offset) \ - if (name == passname) { \ - return offset; \ - } \ - ((void)0) - MAP_PASS("Normal", DENOISING_PASS_PREFILTERED_NORMAL); - MAP_PASS("Albedo", DENOISING_PASS_PREFILTERED_ALBEDO); - MAP_PASS("Depth", DENOISING_PASS_PREFILTERED_DEPTH); - MAP_PASS("Shadowing", DENOISING_PASS_PREFILTERED_SHADOWING); - MAP_PASS("Variance", DENOISING_PASS_PREFILTERED_VARIANCE); - MAP_PASS("Intensity", DENOISING_PASS_PREFILTERED_INTENSITY); - MAP_PASS("Clean", DENOISING_PASS_CLEAN); -#undef MAP_PASS - - return -1; + return pass; } -vector<Pass> BlenderSync::sync_render_passes(BL::Scene &b_scene, - BL::RenderLayer &b_rlay, - BL::ViewLayer &b_view_layer, - bool adaptive_sampling, - const DenoiseParams &denoising) +void BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLayer &b_view_layer) { - vector<Pass> passes; + PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles"); + + /* Delete all existing passes. */ + set<Pass *> clear_passes(scene->passes.begin(), scene->passes.end()); + scene->delete_nodes(clear_passes); - /* loop over passes */ + /* Always add combined pass. 
*/ + pass_add(scene, PASS_COMBINED, "Combined"); + + /* Blender built-in data and light passes. */ for (BL::RenderPass &b_pass : b_rlay.passes) { - PassType pass_type = get_pass_type(b_pass); + const PassType pass_type = get_blender_pass_type(b_pass); + + if (pass_type == PASS_NONE) { + LOG(ERROR) << "Unknown pass " << b_pass.name(); + continue; + } if (pass_type == PASS_MOTION && (b_view_layer.use_motion_blur() && b_scene.render().use_motion_blur())) { continue; } - if (pass_type != PASS_NONE) - Pass::add(pass_type, passes, b_pass.name().c_str()); - } - - PointerRNA crl = RNA_pointer_get(&b_view_layer.ptr, "cycles"); - int denoising_flags = 0; - if (denoising.use || denoising.store_passes) { - if (denoising.type == DENOISER_NLM) { -#define MAP_OPTION(name, flag) \ - if (!get_boolean(crl, name)) { \ - denoising_flags |= flag; \ - } \ - ((void)0) - MAP_OPTION("denoising_diffuse_direct", DENOISING_CLEAN_DIFFUSE_DIR); - MAP_OPTION("denoising_diffuse_indirect", DENOISING_CLEAN_DIFFUSE_IND); - MAP_OPTION("denoising_glossy_direct", DENOISING_CLEAN_GLOSSY_DIR); - MAP_OPTION("denoising_glossy_indirect", DENOISING_CLEAN_GLOSSY_IND); - MAP_OPTION("denoising_transmission_direct", DENOISING_CLEAN_TRANSMISSION_DIR); - MAP_OPTION("denoising_transmission_indirect", DENOISING_CLEAN_TRANSMISSION_IND); -#undef MAP_OPTION - } - b_engine.add_pass("Noisy Image", 4, "RGBA", b_view_layer.name().c_str()); + pass_add(scene, pass_type, b_pass.name().c_str()); } - scene->film->set_denoising_flags(denoising_flags); - - if (denoising.store_passes) { - b_engine.add_pass("Denoising Normal", 3, "XYZ", b_view_layer.name().c_str()); - b_engine.add_pass("Denoising Albedo", 3, "RGB", b_view_layer.name().c_str()); - b_engine.add_pass("Denoising Depth", 1, "Z", b_view_layer.name().c_str()); - if (denoising.type == DENOISER_NLM) { - b_engine.add_pass("Denoising Shadowing", 1, "X", b_view_layer.name().c_str()); - b_engine.add_pass("Denoising Variance", 3, "RGB", b_view_layer.name().c_str()); - 
b_engine.add_pass("Denoising Intensity", 1, "X", b_view_layer.name().c_str()); - } - if (scene->film->get_denoising_flags() & DENOISING_CLEAN_ALL_PASSES) { - b_engine.add_pass("Denoising Clean", 3, "RGB", b_view_layer.name().c_str()); - } - } + PointerRNA crl = RNA_pointer_get(&b_view_layer.ptr, "cycles"); + /* Debug passes. */ if (get_boolean(crl, "pass_debug_render_time")) { b_engine.add_pass("Debug Render Time", 1, "X", b_view_layer.name().c_str()); - Pass::add(PASS_RENDER_TIME, passes, "Debug Render Time"); + pass_add(scene, PASS_RENDER_TIME, "Debug Render Time"); } if (get_boolean(crl, "pass_debug_sample_count")) { b_engine.add_pass("Debug Sample Count", 1, "X", b_view_layer.name().c_str()); - Pass::add(PASS_SAMPLE_COUNT, passes, "Debug Sample Count"); + pass_add(scene, PASS_SAMPLE_COUNT, "Debug Sample Count"); } + + /* Cycles specific passes. */ if (get_boolean(crl, "use_pass_volume_direct")) { b_engine.add_pass("VolumeDir", 3, "RGB", b_view_layer.name().c_str()); - Pass::add(PASS_VOLUME_DIRECT, passes, "VolumeDir"); + pass_add(scene, PASS_VOLUME_DIRECT, "VolumeDir"); } if (get_boolean(crl, "use_pass_volume_indirect")) { b_engine.add_pass("VolumeInd", 3, "RGB", b_view_layer.name().c_str()); - Pass::add(PASS_VOLUME_INDIRECT, passes, "VolumeInd"); + pass_add(scene, PASS_VOLUME_INDIRECT, "VolumeInd"); + } + if (get_boolean(crl, "use_pass_shadow_catcher")) { + b_engine.add_pass("Shadow Catcher", 3, "RGB", b_view_layer.name().c_str()); + pass_add(scene, PASS_SHADOW_CATCHER, "Shadow Catcher"); } /* Cryptomatte stores two ID/weight pairs per RGBA layer. 
@@ -662,7 +636,7 @@ vector<Pass> BlenderSync::sync_render_passes(BL::Scene &b_scene, for (int i = 0; i < crypto_depth; i++) { string passname = cryptomatte_prefix + string_printf("Object%02d", i); b_engine.add_pass(passname.c_str(), 4, "RGBA", b_view_layer.name().c_str()); - Pass::add(PASS_CRYPTOMATTE, passes, passname.c_str()); + pass_add(scene, PASS_CRYPTOMATTE, passname.c_str()); } cryptomatte_passes = (CryptomatteType)(cryptomatte_passes | CRYPT_OBJECT); } @@ -670,7 +644,7 @@ vector<Pass> BlenderSync::sync_render_passes(BL::Scene &b_scene, for (int i = 0; i < crypto_depth; i++) { string passname = cryptomatte_prefix + string_printf("Material%02d", i); b_engine.add_pass(passname.c_str(), 4, "RGBA", b_view_layer.name().c_str()); - Pass::add(PASS_CRYPTOMATTE, passes, passname.c_str()); + pass_add(scene, PASS_CRYPTOMATTE, passname.c_str()); } cryptomatte_passes = (CryptomatteType)(cryptomatte_passes | CRYPT_MATERIAL); } @@ -678,22 +652,33 @@ vector<Pass> BlenderSync::sync_render_passes(BL::Scene &b_scene, for (int i = 0; i < crypto_depth; i++) { string passname = cryptomatte_prefix + string_printf("Asset%02d", i); b_engine.add_pass(passname.c_str(), 4, "RGBA", b_view_layer.name().c_str()); - Pass::add(PASS_CRYPTOMATTE, passes, passname.c_str()); + pass_add(scene, PASS_CRYPTOMATTE, passname.c_str()); } cryptomatte_passes = (CryptomatteType)(cryptomatte_passes | CRYPT_ASSET); } - if (b_view_layer.use_pass_cryptomatte_accurate() && cryptomatte_passes != CRYPT_NONE) { - cryptomatte_passes = (CryptomatteType)(cryptomatte_passes | CRYPT_ACCURATE); - } scene->film->set_cryptomatte_passes(cryptomatte_passes); - if (adaptive_sampling) { - Pass::add(PASS_ADAPTIVE_AUX_BUFFER, passes); - if (!get_boolean(crl, "pass_debug_sample_count")) { - Pass::add(PASS_SAMPLE_COUNT, passes); + /* Denoising passes. 
*/ + const bool use_denoising = get_boolean(cscene, "use_denoising") && + get_boolean(crl, "use_denoising"); + const bool store_denoising_passes = get_boolean(crl, "denoising_store_passes"); + if (use_denoising) { + b_engine.add_pass("Noisy Image", 4, "RGBA", b_view_layer.name().c_str()); + pass_add(scene, PASS_COMBINED, "Noisy Image", PassMode::NOISY); + if (get_boolean(crl, "use_pass_shadow_catcher")) { + b_engine.add_pass("Noisy Shadow Catcher", 3, "RGB", b_view_layer.name().c_str()); + pass_add(scene, PASS_SHADOW_CATCHER, "Noisy Shadow Catcher", PassMode::NOISY); } } + if (store_denoising_passes) { + b_engine.add_pass("Denoising Normal", 3, "XYZ", b_view_layer.name().c_str()); + pass_add(scene, PASS_DENOISING_NORMAL, "Denoising Normal", PassMode::NOISY); + + b_engine.add_pass("Denoising Albedo", 3, "RGB", b_view_layer.name().c_str()); + pass_add(scene, PASS_DENOISING_ALBEDO, "Denoising Albedo", PassMode::NOISY); + } + /* Custom AOV passes. */ BL::ViewLayer::aovs_iterator b_aov_iter; for (b_view_layer.aovs.begin(b_aov_iter); b_aov_iter != b_view_layer.aovs.end(); ++b_aov_iter) { BL::AOV b_aov(*b_aov_iter); @@ -706,28 +691,15 @@ vector<Pass> BlenderSync::sync_render_passes(BL::Scene &b_scene, if (is_color) { b_engine.add_pass(name.c_str(), 4, "RGBA", b_view_layer.name().c_str()); - Pass::add(PASS_AOV_COLOR, passes, name.c_str()); + pass_add(scene, PASS_AOV_COLOR, name.c_str()); } else { b_engine.add_pass(name.c_str(), 1, "X", b_view_layer.name().c_str()); - Pass::add(PASS_AOV_VALUE, passes, name.c_str()); + pass_add(scene, PASS_AOV_VALUE, name.c_str()); } } - scene->film->set_denoising_data_pass(denoising.use || denoising.store_passes); - scene->film->set_denoising_clean_pass(scene->film->get_denoising_flags() & - DENOISING_CLEAN_ALL_PASSES); - scene->film->set_denoising_prefiltered_pass(denoising.store_passes && - denoising.type == DENOISER_NLM); scene->film->set_pass_alpha_threshold(b_view_layer.pass_alpha_threshold()); - - if (!Pass::equals(passes, 
scene->passes)) { - scene->film->tag_passes_update(scene, passes); - scene->film->tag_modified(); - scene->integrator->tag_update(scene, Integrator::UPDATE_ALL); - } - - return passes; } void BlenderSync::free_data_after_sync(BL::Depsgraph &b_depsgraph) @@ -773,9 +745,9 @@ SceneParams BlenderSync::get_scene_params(BL::Scene &b_scene, bool background) params.shadingsystem = SHADINGSYSTEM_OSL; if (background || DebugFlags().viewport_static_bvh) - params.bvh_type = SceneParams::BVH_STATIC; + params.bvh_type = BVH_TYPE_STATIC; else - params.bvh_type = SceneParams::BVH_DYNAMIC; + params.bvh_type = BVH_TYPE_DYNAMIC; params.use_bvh_spatial_split = RNA_boolean_get(&cscene, "debug_use_spatial_splits"); params.use_bvh_unaligned_nodes = RNA_boolean_get(&cscene, "debug_use_hair_bvh"); @@ -818,8 +790,7 @@ bool BlenderSync::get_session_pause(BL::Scene &b_scene, bool background) SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine, BL::Preferences &b_preferences, BL::Scene &b_scene, - bool background, - BL::ViewLayer b_view_layer) + bool background) { SessionParams params; PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles"); @@ -827,7 +798,8 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine, /* feature set */ params.experimental = (get_enum(cscene, "feature_set") != 0); - /* Background */ + /* Headless and background rendering. 
*/ + params.headless = BlenderSession::headless; params.background = background; /* Device */ @@ -836,111 +808,26 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine, /* samples */ int samples = get_int(cscene, "samples"); - int aa_samples = get_int(cscene, "aa_samples"); int preview_samples = get_int(cscene, "preview_samples"); - int preview_aa_samples = get_int(cscene, "preview_aa_samples"); - if (get_boolean(cscene, "use_square_samples")) { - aa_samples = aa_samples * aa_samples; - preview_aa_samples = preview_aa_samples * preview_aa_samples; - - samples = samples * samples; - preview_samples = preview_samples * preview_samples; - } - - if (get_enum(cscene, "progressive") == 0 && params.device.has_branched_path) { - if (background) { - params.samples = aa_samples; - } - else { - params.samples = preview_aa_samples; - if (params.samples == 0) - params.samples = INT_MAX; - } + if (background) { + params.samples = samples; } else { - if (background) { - params.samples = samples; - } - else { - params.samples = preview_samples; - if (params.samples == 0) - params.samples = INT_MAX; - } + params.samples = preview_samples; + if (params.samples == 0) + params.samples = INT_MAX; } /* Clamp samples. */ params.samples = min(params.samples, Integrator::MAX_SAMPLES); - /* Adaptive sampling. 
*/ - params.adaptive_sampling = RNA_boolean_get(&cscene, "use_adaptive_sampling"); - - /* tiles */ - const bool is_cpu = (params.device.type == DEVICE_CPU); - if (!is_cpu && !background) { - /* currently GPU could be much slower than CPU when using tiles, - * still need to be investigated, but meanwhile make it possible - * to work in viewport smoothly - */ - int debug_tile_size = get_int(cscene, "debug_tile_size"); - - params.tile_size = make_int2(debug_tile_size, debug_tile_size); - } - else { - int tile_x = b_engine.tile_x(); - int tile_y = b_engine.tile_y(); - - params.tile_size = make_int2(tile_x, tile_y); - } - - if ((BlenderSession::headless == false) && background) { - params.tile_order = (TileOrder)get_enum(cscene, "tile_order"); - } - else { - params.tile_order = TILE_BOTTOM_TO_TOP; - } - - /* Denoising */ - params.denoising = get_denoise_params(b_scene, b_view_layer, background); - - if (params.denoising.use) { - /* Add additional denoising devices if we are rendering and denoising - * with different devices. */ - params.device.add_denoising_devices(params.denoising.type); - - /* Check if denoiser is supported by device. 
*/ - if (!(params.device.denoisers & params.denoising.type)) { - params.denoising.use = false; - } - } - /* Viewport Performance */ - params.start_resolution = get_int(cscene, "preview_start_resolution"); params.pixel_size = b_engine.get_preview_pixel_size(b_scene); - /* other parameters */ - params.cancel_timeout = (double)get_float(cscene, "debug_cancel_timeout"); - params.reset_timeout = (double)get_float(cscene, "debug_reset_timeout"); - params.text_timeout = (double)get_float(cscene, "debug_text_timeout"); - - /* progressive refine */ - BL::RenderSettings b_r = b_scene.render(); - params.progressive_refine = b_engine.is_preview() || - get_boolean(cscene, "use_progressive_refine"); - if (b_r.use_save_buffers() || params.adaptive_sampling) - params.progressive_refine = false; - if (background) { - if (params.progressive_refine) - params.progressive = true; - else - params.progressive = false; - - params.start_resolution = INT_MAX; params.pixel_size = 1; } - else - params.progressive = true; /* shading system - scene level needs full refresh */ const bool shadingsystem = RNA_boolean_get(&cscene, "shading_system"); @@ -950,19 +837,30 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine, else if (shadingsystem == 1) params.shadingsystem = SHADINGSYSTEM_OSL; - /* Color management. */ - params.display_buffer_linear = b_engine.support_display_space_shader(b_scene); - - if (b_engine.is_preview()) { - /* For preview rendering we're using same timeout as - * blender's job update. - */ - params.progressive_update_timeout = 0.1; + /* Time limit. */ + if (background) { + params.time_limit = get_float(cscene, "time_limit"); + } + else { + /* For the viewport it kind of makes more sense to think in terms of the noise floor, which is + * usually higher than acceptable level for the final frame. */ + /* TODO: It might be useful to support time limit in the viewport as well, but needs some + * extra thoughts and input. 
*/ + params.time_limit = 0.0; } + /* Profiling. */ params.use_profiling = params.device.has_profiling && !b_engine.is_preview() && background && BlenderSession::print_render_stats; + if (background) { + params.use_auto_tile = RNA_boolean_get(&cscene, "use_auto_tile"); + params.tile_size = get_int(cscene, "tile_size"); + } + else { + params.use_auto_tile = false; + } + return params; } @@ -970,33 +868,34 @@ DenoiseParams BlenderSync::get_denoise_params(BL::Scene &b_scene, BL::ViewLayer &b_view_layer, bool background) { + enum DenoiserInput { + DENOISER_INPUT_RGB = 1, + DENOISER_INPUT_RGB_ALBEDO = 2, + DENOISER_INPUT_RGB_ALBEDO_NORMAL = 3, + + DENOISER_INPUT_NUM, + }; + DenoiseParams denoising; PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles"); + int input_passes = -1; + if (background) { /* Final Render Denoising */ denoising.use = get_boolean(cscene, "use_denoising"); denoising.type = (DenoiserType)get_enum(cscene, "denoiser", DENOISER_NUM, DENOISER_NONE); + denoising.prefilter = (DenoiserPrefilter)get_enum( + cscene, "denoising_prefilter", DENOISER_PREFILTER_NUM, DENOISER_PREFILTER_NONE); + + input_passes = (DenoiserInput)get_enum( + cscene, "denoising_input_passes", DENOISER_INPUT_NUM, DENOISER_INPUT_RGB_ALBEDO_NORMAL); if (b_view_layer) { PointerRNA clayer = RNA_pointer_get(&b_view_layer.ptr, "cycles"); if (!get_boolean(clayer, "use_denoising")) { denoising.use = false; } - - denoising.radius = get_int(clayer, "denoising_radius"); - denoising.strength = get_float(clayer, "denoising_strength"); - denoising.feature_strength = get_float(clayer, "denoising_feature_strength"); - denoising.relative_pca = get_boolean(clayer, "denoising_relative_pca"); - - denoising.input_passes = (DenoiserInput)get_enum( - clayer, - (denoising.type == DENOISER_OPTIX) ? 
"denoising_optix_input_passes" : - "denoising_openimagedenoise_input_passes", - DENOISER_INPUT_NUM, - DENOISER_INPUT_RGB_ALBEDO_NORMAL); - - denoising.store_passes = get_boolean(clayer, "denoising_store_passes"); } } else { @@ -1004,10 +903,12 @@ DenoiseParams BlenderSync::get_denoise_params(BL::Scene &b_scene, denoising.use = get_boolean(cscene, "use_preview_denoising"); denoising.type = (DenoiserType)get_enum( cscene, "preview_denoiser", DENOISER_NUM, DENOISER_NONE); + denoising.prefilter = (DenoiserPrefilter)get_enum( + cscene, "preview_denoising_prefilter", DENOISER_PREFILTER_NUM, DENOISER_PREFILTER_FAST); denoising.start_sample = get_int(cscene, "preview_denoising_start_sample"); - denoising.input_passes = (DenoiserInput)get_enum( - cscene, "preview_denoising_input_passes", DENOISER_INPUT_NUM, (int)denoising.input_passes); + input_passes = (DenoiserInput)get_enum( + cscene, "preview_denoising_input_passes", DENOISER_INPUT_NUM, DENOISER_INPUT_RGB_ALBEDO); /* Auto select fastest denoiser. 
*/ if (denoising.type == DENOISER_NONE) { @@ -1023,6 +924,27 @@ DenoiseParams BlenderSync::get_denoise_params(BL::Scene &b_scene, } } + switch (input_passes) { + case DENOISER_INPUT_RGB: + denoising.use_pass_albedo = false; + denoising.use_pass_normal = false; + break; + + case DENOISER_INPUT_RGB_ALBEDO: + denoising.use_pass_albedo = true; + denoising.use_pass_normal = false; + break; + + case DENOISER_INPUT_RGB_ALBEDO_NORMAL: + denoising.use_pass_albedo = true; + denoising.use_pass_normal = true; + break; + + default: + LOG(ERROR) << "Unhandled input passes enum " << input_passes; + break; + } + return denoising; } diff --git a/intern/cycles/blender/blender_sync.h b/intern/cycles/blender/blender_sync.h index d25c0ce1bc3..786479ac0f8 100644 --- a/intern/cycles/blender/blender_sync.h +++ b/intern/cycles/blender/blender_sync.h @@ -60,6 +60,7 @@ class BlenderSync { BL::Scene &b_scene, Scene *scene, bool preview, + bool use_developer_ui, Progress &progress); ~BlenderSync(); @@ -75,12 +76,8 @@ class BlenderSync { int height, void **python_thread_state); void sync_view_layer(BL::ViewLayer &b_view_layer); - vector<Pass> sync_render_passes(BL::Scene &b_scene, - BL::RenderLayer &b_render_layer, - BL::ViewLayer &b_view_layer, - bool adaptive_sampling, - const DenoiseParams &denoising); - void sync_integrator(); + void sync_render_passes(BL::RenderLayer &b_render_layer, BL::ViewLayer &b_view_layer); + void sync_integrator(BL::ViewLayer &b_view_layer, bool background); void sync_camera(BL::RenderSettings &b_render, BL::Object &b_override, int width, @@ -98,22 +95,13 @@ class BlenderSync { /* get parameters */ static SceneParams get_scene_params(BL::Scene &b_scene, bool background); - static SessionParams get_session_params( - BL::RenderEngine &b_engine, - BL::Preferences &b_userpref, - BL::Scene &b_scene, - bool background, - BL::ViewLayer b_view_layer = BL::ViewLayer(PointerRNA_NULL)); + static SessionParams get_session_params(BL::RenderEngine &b_engine, + BL::Preferences 
&b_userpref, + BL::Scene &b_scene, + bool background); static bool get_session_pause(BL::Scene &b_scene, bool background); - static BufferParams get_buffer_params(BL::SpaceView3D &b_v3d, - BL::RegionView3D &b_rv3d, - Camera *cam, - int width, - int height, - const bool use_denoiser); - - static PassType get_pass_type(BL::RenderPass &b_pass); - static int get_denoising_pass(BL::RenderPass &b_pass); + static BufferParams get_buffer_params( + BL::SpaceView3D &b_v3d, BL::RegionView3D &b_rv3d, Camera *cam, int width, int height); private: static DenoiseParams get_denoise_params(BL::Scene &b_scene, @@ -131,7 +119,7 @@ class BlenderSync { int width, int height, void **python_thread_state); - void sync_film(BL::SpaceView3D &b_v3d); + void sync_film(BL::ViewLayer &b_view_layer, BL::SpaceView3D &b_v3d); void sync_view(); /* Shader */ @@ -245,6 +233,7 @@ class BlenderSync { Scene *scene; bool preview; bool experimental; + bool use_developer_ui; float dicing_rate; int max_subdivisions; @@ -253,7 +242,6 @@ class BlenderSync { RenderLayerInfo() : material_override(PointerRNA_NULL), use_background_shader(true), - use_background_ao(true), use_surfaces(true), use_hair(true), use_volumes(true), @@ -266,7 +254,6 @@ class BlenderSync { string name; BL::Material material_override; bool use_background_shader; - bool use_background_ao; bool use_surfaces; bool use_hair; bool use_volumes; diff --git a/intern/cycles/blender/blender_viewport.cpp b/intern/cycles/blender/blender_viewport.cpp index 18bdfc74de0..62e32240bba 100644 --- a/intern/cycles/blender/blender_viewport.cpp +++ b/intern/cycles/blender/blender_viewport.cpp @@ -17,6 +17,8 @@ #include "blender_viewport.h" #include "blender_util.h" +#include "render/pass.h" +#include "util/util_logging.h" CCL_NAMESPACE_BEGIN @@ -26,11 +28,12 @@ BlenderViewportParameters::BlenderViewportParameters() studiolight_rotate_z(0.0f), studiolight_intensity(1.0f), studiolight_background_alpha(1.0f), - display_pass(PASS_COMBINED) + 
display_pass(PASS_COMBINED), + show_active_pixels(false) { } -BlenderViewportParameters::BlenderViewportParameters(BL::SpaceView3D &b_v3d) +BlenderViewportParameters::BlenderViewportParameters(BL::SpaceView3D &b_v3d, bool use_developer_ui) : BlenderViewportParameters() { if (!b_v3d) { @@ -55,7 +58,25 @@ BlenderViewportParameters::BlenderViewportParameters(BL::SpaceView3D &b_v3d) } /* Film. */ - display_pass = (PassType)get_enum(cshading, "render_pass", -1, -1); + + /* Lookup display pass based on the enum identifier. + * This is because integer values of python enum are not aligned with the passes definition in + * the kernel. */ + + display_pass = PASS_COMBINED; + + const string display_pass_identifier = get_enum_identifier(cshading, "render_pass"); + if (!display_pass_identifier.empty()) { + const ustring pass_type_identifier(string_to_lower(display_pass_identifier)); + const NodeEnum *pass_type_enum = Pass::get_type_enum(); + if (pass_type_enum->exists(pass_type_identifier)) { + display_pass = static_cast<PassType>((*pass_type_enum)[pass_type_identifier]); + } + } + + if (use_developer_ui) { + show_active_pixels = get_boolean(cshading, "show_active_pixels"); + } } bool BlenderViewportParameters::shader_modified(const BlenderViewportParameters &other) const @@ -69,7 +90,7 @@ bool BlenderViewportParameters::shader_modified(const BlenderViewportParameters bool BlenderViewportParameters::film_modified(const BlenderViewportParameters &other) const { - return display_pass != other.display_pass; + return display_pass != other.display_pass || show_active_pixels != other.show_active_pixels; } bool BlenderViewportParameters::modified(const BlenderViewportParameters &other) const @@ -82,18 +103,4 @@ bool BlenderViewportParameters::use_custom_shader() const return !(use_scene_world && use_scene_lights); } -PassType update_viewport_display_passes(BL::SpaceView3D &b_v3d, vector<Pass> &passes) -{ - if (b_v3d) { - const BlenderViewportParameters viewport_parameters(b_v3d); - 
const PassType display_pass = viewport_parameters.display_pass; - - passes.clear(); - Pass::add(display_pass, passes); - - return display_pass; - } - return PASS_NONE; -} - CCL_NAMESPACE_END diff --git a/intern/cycles/blender/blender_viewport.h b/intern/cycles/blender/blender_viewport.h index d6518597053..b5adafc30c9 100644 --- a/intern/cycles/blender/blender_viewport.h +++ b/intern/cycles/blender/blender_viewport.h @@ -39,9 +39,10 @@ class BlenderViewportParameters { /* Film. */ PassType display_pass; + bool show_active_pixels; BlenderViewportParameters(); - explicit BlenderViewportParameters(BL::SpaceView3D &b_v3d); + BlenderViewportParameters(BL::SpaceView3D &b_v3d, bool use_developer_ui); /* Check whether any of shading related settings are different from the given parameters. */ bool shader_modified(const BlenderViewportParameters &other) const; @@ -57,8 +58,6 @@ class BlenderViewportParameters { bool use_custom_shader() const; }; -PassType update_viewport_display_passes(BL::SpaceView3D &b_v3d, vector<Pass> &passes); - CCL_NAMESPACE_END #endif diff --git a/intern/cycles/bvh/bvh_build.cpp b/intern/cycles/bvh/bvh_build.cpp index 048c2b95e40..d3497f3a8d8 100644 --- a/intern/cycles/bvh/bvh_build.cpp +++ b/intern/cycles/bvh/bvh_build.cpp @@ -832,18 +832,18 @@ BVHNode *BVHBuild::create_leaf_node(const BVHRange &range, const vector<BVHRefer typedef StackAllocator<256, float2> LeafTimeStackAllocator; typedef StackAllocator<256, BVHReference> LeafReferenceStackAllocator; - vector<int, LeafStackAllocator> p_type[PRIMITIVE_NUM_TOTAL]; - vector<int, LeafStackAllocator> p_index[PRIMITIVE_NUM_TOTAL]; - vector<int, LeafStackAllocator> p_object[PRIMITIVE_NUM_TOTAL]; - vector<float2, LeafTimeStackAllocator> p_time[PRIMITIVE_NUM_TOTAL]; - vector<BVHReference, LeafReferenceStackAllocator> p_ref[PRIMITIVE_NUM_TOTAL]; + vector<int, LeafStackAllocator> p_type[PRIMITIVE_NUM]; + vector<int, LeafStackAllocator> p_index[PRIMITIVE_NUM]; + vector<int, LeafStackAllocator> 
p_object[PRIMITIVE_NUM]; + vector<float2, LeafTimeStackAllocator> p_time[PRIMITIVE_NUM]; + vector<BVHReference, LeafReferenceStackAllocator> p_ref[PRIMITIVE_NUM]; /* TODO(sergey): In theory we should be able to store references. */ vector<BVHReference, LeafReferenceStackAllocator> object_references; - uint visibility[PRIMITIVE_NUM_TOTAL] = {0}; + uint visibility[PRIMITIVE_NUM] = {0}; /* NOTE: Keep initialization in sync with actual number of primitives. */ - BoundBox bounds[PRIMITIVE_NUM_TOTAL] = { + BoundBox bounds[PRIMITIVE_NUM] = { BoundBox::empty, BoundBox::empty, BoundBox::empty, BoundBox::empty}; int ob_num = 0; int num_new_prims = 0; @@ -877,7 +877,7 @@ BVHNode *BVHBuild::create_leaf_node(const BVHRange &range, const vector<BVHRefer * TODO(sergey): With some pointer trickery we can write directly to the * destination buffers for the non-spatial split BVH. */ - BVHNode *leaves[PRIMITIVE_NUM_TOTAL + 1] = {NULL}; + BVHNode *leaves[PRIMITIVE_NUM + 1] = {NULL}; int num_leaves = 0; size_t start_index = 0; vector<int, LeafStackAllocator> local_prim_type, local_prim_index, local_prim_object; @@ -888,7 +888,7 @@ BVHNode *BVHBuild::create_leaf_node(const BVHRange &range, const vector<BVHRefer if (need_prim_time) { local_prim_time.resize(num_new_prims); } - for (int i = 0; i < PRIMITIVE_NUM_TOTAL; ++i) { + for (int i = 0; i < PRIMITIVE_NUM; ++i) { int num = (int)p_type[i].size(); if (num != 0) { assert(p_type[i].size() == p_index[i].size()); diff --git a/intern/cycles/bvh/bvh_embree.cpp b/intern/cycles/bvh/bvh_embree.cpp index 62f543941a9..96852510b63 100644 --- a/intern/cycles/bvh/bvh_embree.cpp +++ b/intern/cycles/bvh/bvh_embree.cpp @@ -37,10 +37,10 @@ /* Kernel includes are necessary so that the filter function for Embree can access the packed BVH. 
*/ # include "kernel/bvh/bvh_embree.h" -# include "kernel/kernel_compat_cpu.h" -# include "kernel/kernel_globals.h" +# include "kernel/bvh/bvh_util.h" +# include "kernel/device/cpu/compat.h" +# include "kernel/device/cpu/globals.h" # include "kernel/kernel_random.h" -# include "kernel/split/kernel_split_data_types.h" # include "render/hair.h" # include "render/mesh.h" @@ -73,46 +73,69 @@ static void rtc_filter_occluded_func(const RTCFilterFunctionNArguments *args) const RTCRay *ray = (RTCRay *)args->ray; RTCHit *hit = (RTCHit *)args->hit; CCLIntersectContext *ctx = ((IntersectContext *)args->context)->userRayExt; - KernelGlobals *kg = ctx->kg; + const KernelGlobals *kg = ctx->kg; switch (ctx->type) { case CCLIntersectContext::RAY_SHADOW_ALL: { - /* Append the intersection to the end of the array. */ - if (ctx->num_hits < ctx->max_hits) { - Intersection current_isect; - kernel_embree_convert_hit(kg, ray, hit, ¤t_isect); - for (size_t i = 0; i < ctx->max_hits; ++i) { + Intersection current_isect; + kernel_embree_convert_hit(kg, ray, hit, ¤t_isect); + + /* If no transparent shadows, all light is blocked. */ + const int flags = intersection_get_shader_flags(kg, ¤t_isect); + if (!(flags & (SD_HAS_TRANSPARENT_SHADOW)) || ctx->max_hits == 0) { + ctx->opaque_hit = true; + return; + } + + /* Test if we need to record this transparent intersection. */ + if (ctx->num_hits < ctx->max_hits || ray->tfar < ctx->max_t) { + /* Skip already recorded intersections. */ + int num_recorded_hits = min(ctx->num_hits, ctx->max_hits); + + for (int i = 0; i < num_recorded_hits; ++i) { if (current_isect.object == ctx->isect_s[i].object && current_isect.prim == ctx->isect_s[i].prim && current_isect.t == ctx->isect_s[i].t) { /* This intersection was already recorded, skip it. 
*/ *args->valid = 0; - break; + return; } } - Intersection *isect = &ctx->isect_s[ctx->num_hits]; - ++ctx->num_hits; - *isect = current_isect; - int prim = kernel_tex_fetch(__prim_index, isect->prim); - int shader = 0; - if (kernel_tex_fetch(__prim_type, isect->prim) & PRIMITIVE_ALL_TRIANGLE) { - shader = kernel_tex_fetch(__tri_shader, prim); - } - else { - float4 str = kernel_tex_fetch(__curves, prim); - shader = __float_as_int(str.z); - } - int flag = kernel_tex_fetch(__shaders, shader & SHADER_MASK).flags; - /* If no transparent shadows, all light is blocked. */ - if (flag & (SD_HAS_TRANSPARENT_SHADOW)) { - /* This tells Embree to continue tracing. */ - *args->valid = 0; + + /* If maximum number of hits was reached, replace the intersection with the + * highest distance. We want to find the N closest intersections. */ + int isect_index = num_recorded_hits; + if (num_recorded_hits + 1 >= ctx->max_hits) { + float max_t = ctx->isect_s[0].t; + int max_recorded_hit = 0; + + for (int i = 1; i < num_recorded_hits; ++i) { + if (ctx->isect_s[i].t > max_t) { + max_recorded_hit = i; + max_t = ctx->isect_s[i].t; + } + } + + if (num_recorded_hits >= ctx->max_hits) { + isect_index = max_recorded_hit; + } + + /* Limit the ray distance and stop counting hits beyond this. + * TODO: is there some way we can tell Embree to stop intersecting beyond + * this distance when max number of hits is reached?. Or maybe it will + * become irrelevant if we make max_hits a very high number on the CPU. */ + ctx->max_t = max(current_isect.t, max_t); } + + ctx->isect_s[isect_index] = current_isect; } - else { - /* Increase the number of hits beyond ray.max_hits - * so that the caller can detect this as opaque. */ - ++ctx->num_hits; - } + + /* Always increase the number of hits, even beyond ray.max_hits so that + * the caller can detect this as and consider it opaque, or trace another + * ray. */ + ++ctx->num_hits; + + /* This tells Embree to continue tracing. 
*/ + *args->valid = 0; break; } case CCLIntersectContext::RAY_LOCAL: @@ -329,7 +352,7 @@ void BVHEmbree::build(Progress &progress, Stats *stats, RTCDevice rtc_device_) scene = NULL; } - const bool dynamic = params.bvh_type == SceneParams::BVH_DYNAMIC; + const bool dynamic = params.bvh_type == BVH_TYPE_DYNAMIC; scene = rtcNewScene(rtc_device); const RTCSceneFlags scene_flags = (dynamic ? RTC_SCENE_FLAG_DYNAMIC : RTC_SCENE_FLAG_NONE) | diff --git a/intern/cycles/bvh/bvh_params.h b/intern/cycles/bvh/bvh_params.h index 2dc10f30363..31b3971c110 100644 --- a/intern/cycles/bvh/bvh_params.h +++ b/intern/cycles/bvh/bvh_params.h @@ -31,6 +31,27 @@ CCL_NAMESPACE_BEGIN */ typedef KernelBVHLayout BVHLayout; +/* Type of BVH, in terms whether it is supported dynamic updates of meshes + * or whether modifying geometry requires full BVH rebuild. + */ +enum BVHType { + /* BVH supports dynamic updates of geometry. + * + * Faster for updating BVH tree when doing modifications in viewport, + * but slower for rendering. + */ + BVH_TYPE_DYNAMIC = 0, + /* BVH tree is calculated for specific scene, updates in geometry + * requires full tree rebuild. + * + * Slower to update BVH tree when modifying objects in viewport, also + * slower to build final BVH tree but gives best possible render speed. + */ + BVH_TYPE_STATIC = 1, + + BVH_NUM_TYPES, +}; + /* Names bitflag type to denote which BVH layouts are supported by * particular area. 
* diff --git a/intern/cycles/cmake/external_libs.cmake b/intern/cycles/cmake/external_libs.cmake index 04ff598621a..da259171844 100644 --- a/intern/cycles/cmake/external_libs.cmake +++ b/intern/cycles/cmake/external_libs.cmake @@ -287,9 +287,6 @@ if(CYCLES_STANDALONE_REPOSITORY) endif() set(__boost_packages filesystem regex system thread date_time) - if(WITH_CYCLES_NETWORK) - list(APPEND __boost_packages serialization) - endif() if(WITH_CYCLES_OSL) list(APPEND __boost_packages wave) endif() diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt index 928249931a3..d18f4360aef 100644 --- a/intern/cycles/device/CMakeLists.txt +++ b/intern/cycles/device/CMakeLists.txt @@ -36,49 +36,70 @@ endif() set(SRC device.cpp - device_cpu.cpp - device_cuda.cpp - device_denoising.cpp - device_dummy.cpp + device_denoise.cpp + device_graphics_interop.cpp + device_kernel.cpp device_memory.cpp - device_multi.cpp - device_opencl.cpp - device_optix.cpp - device_split_kernel.cpp - device_task.cpp + device_queue.cpp +) + +set(SRC_CPU + cpu/device.cpp + cpu/device.h + cpu/device_impl.cpp + cpu/device_impl.h + cpu/kernel.cpp + cpu/kernel.h + cpu/kernel_function.h + cpu/kernel_thread_globals.cpp + cpu/kernel_thread_globals.h ) set(SRC_CUDA - cuda/device_cuda.h - cuda/device_cuda_impl.cpp + cuda/device.cpp + cuda/device.h + cuda/device_impl.cpp + cuda/device_impl.h + cuda/graphics_interop.cpp + cuda/graphics_interop.h + cuda/kernel.cpp + cuda/kernel.h + cuda/queue.cpp + cuda/queue.h + cuda/util.cpp + cuda/util.h ) -set(SRC_OPENCL - opencl/device_opencl.h - opencl/device_opencl_impl.cpp - opencl/memory_manager.h - opencl/memory_manager.cpp - opencl/opencl_util.cpp +set(SRC_DUMMY + dummy/device.cpp + dummy/device.h ) -if(WITH_CYCLES_NETWORK) - list(APPEND SRC - device_network.cpp - ) -endif() +set(SRC_MULTI + multi/device.cpp + multi/device.h +) + +set(SRC_OPTIX + optix/device.cpp + optix/device.h + optix/device_impl.cpp + optix/device_impl.h + optix/queue.cpp + 
optix/queue.h + optix/util.h +) set(SRC_HEADERS device.h - device_denoising.h + device_denoise.h + device_graphics_interop.h device_memory.h - device_intern.h - device_network.h - device_split_kernel.h - device_task.h + device_kernel.h + device_queue.h ) set(LIB - cycles_render cycles_kernel cycles_util ${CYCLES_GL_LIBRARIES} @@ -95,15 +116,7 @@ else() endif() add_definitions(${GL_DEFINITIONS}) -if(WITH_CYCLES_NETWORK) - add_definitions(-DWITH_NETWORK) -endif() -if(WITH_CYCLES_DEVICE_OPENCL) - list(APPEND LIB - extern_clew - ) - add_definitions(-DWITH_OPENCL) -endif() + if(WITH_CYCLES_DEVICE_CUDA) add_definitions(-DWITH_CUDA) endif() @@ -115,18 +128,27 @@ if(WITH_CYCLES_DEVICE_MULTI) endif() if(WITH_OPENIMAGEDENOISE) - add_definitions(-DWITH_OPENIMAGEDENOISE) - add_definitions(-DOIDN_STATIC_LIB) - list(APPEND INC_SYS - ${OPENIMAGEDENOISE_INCLUDE_DIRS} - ) list(APPEND LIB ${OPENIMAGEDENOISE_LIBRARIES} - ${TBB_LIBRARIES} ) endif() include_directories(${INC}) include_directories(SYSTEM ${INC_SYS}) -cycles_add_library(cycles_device "${LIB}" ${SRC} ${SRC_CUDA} ${SRC_OPENCL} ${SRC_HEADERS}) +cycles_add_library(cycles_device "${LIB}" + ${SRC} + ${SRC_CPU} + ${SRC_CUDA} + ${SRC_DUMMY} + ${SRC_MULTI} + ${SRC_OPTIX} + ${SRC_HEADERS} +) + +source_group("cpu" FILES ${SRC_CPU}) +source_group("cuda" FILES ${SRC_CUDA}) +source_group("dummy" FILES ${SRC_DUMMY}) +source_group("multi" FILES ${SRC_MULTI}) +source_group("optix" FILES ${SRC_OPTIX}) +source_group("common" FILES ${SRC} ${SRC_HEADERS}) diff --git a/intern/cycles/device/cpu/device.cpp b/intern/cycles/device/cpu/device.cpp new file mode 100644 index 00000000000..68ca8e8bb22 --- /dev/null +++ b/intern/cycles/device/cpu/device.cpp @@ -0,0 +1,64 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device/cpu/device.h" +#include "device/cpu/device_impl.h" + +/* Used for `info.denoisers`. */ +/* TODO(sergey): The denoisers are probably to be moved completely out of the device into their + * own class. But until then keep API consistent with how it used to work before. */ +#include "util/util_openimagedenoise.h" + +CCL_NAMESPACE_BEGIN + +Device *device_cpu_create(const DeviceInfo &info, Stats &stats, Profiler &profiler) +{ + return new CPUDevice(info, stats, profiler); +} + +void device_cpu_info(vector<DeviceInfo> &devices) +{ + DeviceInfo info; + + info.type = DEVICE_CPU; + info.description = system_cpu_brand_string(); + info.id = "CPU"; + info.num = 0; + info.has_osl = true; + info.has_half_images = true; + info.has_nanovdb = true; + info.has_profiling = true; + if (openimagedenoise_supported()) { + info.denoisers |= DENOISER_OPENIMAGEDENOISE; + } + + devices.insert(devices.begin(), info); +} + +string device_cpu_capabilities() +{ + string capabilities = ""; + capabilities += system_cpu_support_sse2() ? "SSE2 " : ""; + capabilities += system_cpu_support_sse3() ? "SSE3 " : ""; + capabilities += system_cpu_support_sse41() ? "SSE41 " : ""; + capabilities += system_cpu_support_avx() ? "AVX " : ""; + capabilities += system_cpu_support_avx2() ? 
"AVX2" : ""; + if (capabilities[capabilities.size() - 1] == ' ') + capabilities.resize(capabilities.size() - 1); + return capabilities; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl b/intern/cycles/device/cpu/device.h index dcea2630aef..9cb2e80068d 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl +++ b/intern/cycles/device/cpu/device.h @@ -1,5 +1,5 @@ /* - * Copyright 2011-2015 Blender Foundation + * Copyright 2011-2021 Blender Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,13 +14,22 @@ * limitations under the License. */ -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_buffer_update.h" +#pragma once -#define KERNEL_NAME buffer_update -#define LOCALS_TYPE unsigned int -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME -#undef LOCALS_TYPE +#include "util/util_string.h" +#include "util/util_vector.h" +CCL_NAMESPACE_BEGIN + +class Device; +class DeviceInfo; +class Profiler; +class Stats; + +Device *device_cpu_create(const DeviceInfo &info, Stats &stats, Profiler &profiler); + +void device_cpu_info(vector<DeviceInfo> &devices); + +string device_cpu_capabilities(); + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/cpu/device_impl.cpp b/intern/cycles/device/cpu/device_impl.cpp new file mode 100644 index 00000000000..3b0db6bdd0e --- /dev/null +++ b/intern/cycles/device/cpu/device_impl.cpp @@ -0,0 +1,481 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device/cpu/device_impl.h" + +#include <stdlib.h> +#include <string.h> + +/* So ImathMath is included before our kernel_cpu_compat. */ +#ifdef WITH_OSL +/* So no context pollution happens from indirectly included windows.h */ +# include "util/util_windows.h" +# include <OSL/oslexec.h> +#endif + +#ifdef WITH_EMBREE +# include <embree3/rtcore.h> +#endif + +#include "device/cpu/kernel.h" +#include "device/cpu/kernel_thread_globals.h" + +#include "device/device.h" + +// clang-format off +#include "kernel/device/cpu/compat.h" +#include "kernel/device/cpu/globals.h" +#include "kernel/device/cpu/kernel.h" +#include "kernel/kernel_types.h" + +#include "kernel/osl/osl_shader.h" +#include "kernel/osl/osl_globals.h" +// clang-format on + +#include "bvh/bvh_embree.h" + +#include "render/buffers.h" + +#include "util/util_debug.h" +#include "util/util_foreach.h" +#include "util/util_function.h" +#include "util/util_logging.h" +#include "util/util_map.h" +#include "util/util_opengl.h" +#include "util/util_openimagedenoise.h" +#include "util/util_optimization.h" +#include "util/util_progress.h" +#include "util/util_system.h" +#include "util/util_task.h" +#include "util/util_thread.h" + +CCL_NAMESPACE_BEGIN + +CPUDevice::CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_) + : Device(info_, stats_, profiler_), texture_info(this, "__texture_info", MEM_GLOBAL) +{ + /* Pick any kernel, all of them are supposed to have same level of microarchitecture + * optimization. 
*/ + VLOG(1) << "Will be using " << kernels.integrator_init_from_camera.get_uarch_name() + << " kernels."; + + if (info.cpu_threads == 0) { + info.cpu_threads = TaskScheduler::num_threads(); + } + +#ifdef WITH_OSL + kernel_globals.osl = &osl_globals; +#endif +#ifdef WITH_EMBREE + embree_device = rtcNewDevice("verbose=0"); +#endif + need_texture_info = false; +} + +CPUDevice::~CPUDevice() +{ +#ifdef WITH_EMBREE + rtcReleaseDevice(embree_device); +#endif + + texture_info.free(); +} + +bool CPUDevice::show_samples() const +{ + return (info.cpu_threads == 1); +} + +BVHLayoutMask CPUDevice::get_bvh_layout_mask() const +{ + BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2; +#ifdef WITH_EMBREE + bvh_layout_mask |= BVH_LAYOUT_EMBREE; +#endif /* WITH_EMBREE */ + return bvh_layout_mask; +} + +bool CPUDevice::load_texture_info() +{ + if (!need_texture_info) { + return false; + } + + texture_info.copy_to_device(); + need_texture_info = false; + + return true; +} + +void CPUDevice::mem_alloc(device_memory &mem) +{ + if (mem.type == MEM_TEXTURE) { + assert(!"mem_alloc not supported for textures."); + } + else if (mem.type == MEM_GLOBAL) { + assert(!"mem_alloc not supported for global memory."); + } + else { + if (mem.name) { + VLOG(1) << "Buffer allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. 
(" + << string_human_readable_size(mem.memory_size()) << ")"; + } + + if (mem.type == MEM_DEVICE_ONLY) { + assert(!mem.host_pointer); + size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES; + void *data = util_aligned_malloc(mem.memory_size(), alignment); + mem.device_pointer = (device_ptr)data; + } + else { + mem.device_pointer = (device_ptr)mem.host_pointer; + } + + mem.device_size = mem.memory_size(); + stats.mem_alloc(mem.device_size); + } +} + +void CPUDevice::mem_copy_to(device_memory &mem) +{ + if (mem.type == MEM_GLOBAL) { + global_free(mem); + global_alloc(mem); + } + else if (mem.type == MEM_TEXTURE) { + tex_free((device_texture &)mem); + tex_alloc((device_texture &)mem); + } + else { + if (!mem.device_pointer) { + mem_alloc(mem); + } + + /* copy is no-op */ + } +} + +void CPUDevice::mem_copy_from( + device_memory & /*mem*/, int /*y*/, int /*w*/, int /*h*/, int /*elem*/) +{ + /* no-op */ +} + +void CPUDevice::mem_zero(device_memory &mem) +{ + if (!mem.device_pointer) { + mem_alloc(mem); + } + + if (mem.device_pointer) { + memset((void *)mem.device_pointer, 0, mem.memory_size()); + } +} + +void CPUDevice::mem_free(device_memory &mem) +{ + if (mem.type == MEM_GLOBAL) { + global_free(mem); + } + else if (mem.type == MEM_TEXTURE) { + tex_free((device_texture &)mem); + } + else if (mem.device_pointer) { + if (mem.type == MEM_DEVICE_ONLY) { + util_aligned_free((void *)mem.device_pointer); + } + mem.device_pointer = 0; + stats.mem_free(mem.device_size); + mem.device_size = 0; + } +} + +device_ptr CPUDevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) +{ + return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset)); +} + +void CPUDevice::const_copy_to(const char *name, void *host, size_t size) +{ +#if WITH_EMBREE + if (strcmp(name, "__data") == 0) { + assert(size <= sizeof(KernelData)); + + // Update scene handle (since it is different for each device on multi devices) + KernelData *const data = (KernelData *)host; + 
data->bvh.scene = embree_scene; + } +#endif + kernel_const_copy(&kernel_globals, name, host, size); +} + +void CPUDevice::global_alloc(device_memory &mem) +{ + VLOG(1) << "Global memory allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + + kernel_global_memory_copy(&kernel_globals, mem.name, mem.host_pointer, mem.data_size); + + mem.device_pointer = (device_ptr)mem.host_pointer; + mem.device_size = mem.memory_size(); + stats.mem_alloc(mem.device_size); +} + +void CPUDevice::global_free(device_memory &mem) +{ + if (mem.device_pointer) { + mem.device_pointer = 0; + stats.mem_free(mem.device_size); + mem.device_size = 0; + } +} + +void CPUDevice::tex_alloc(device_texture &mem) +{ + VLOG(1) << "Texture allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + + mem.device_pointer = (device_ptr)mem.host_pointer; + mem.device_size = mem.memory_size(); + stats.mem_alloc(mem.device_size); + + const uint slot = mem.slot; + if (slot >= texture_info.size()) { + /* Allocate some slots in advance, to reduce amount of re-allocations. 
*/ + texture_info.resize(slot + 128); + } + + texture_info[slot] = mem.info; + texture_info[slot].data = (uint64_t)mem.host_pointer; + need_texture_info = true; +} + +void CPUDevice::tex_free(device_texture &mem) +{ + if (mem.device_pointer) { + mem.device_pointer = 0; + stats.mem_free(mem.device_size); + mem.device_size = 0; + need_texture_info = true; + } +} + +void CPUDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) +{ +#ifdef WITH_EMBREE + if (bvh->params.bvh_layout == BVH_LAYOUT_EMBREE || + bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE) { + BVHEmbree *const bvh_embree = static_cast<BVHEmbree *>(bvh); + if (refit) { + bvh_embree->refit(progress); + } + else { + bvh_embree->build(progress, &stats, embree_device); + } + + if (bvh->params.top_level) { + embree_scene = bvh_embree->scene; + } + } + else +#endif + Device::build_bvh(bvh, progress, refit); +} + +#if 0 +void CPUDevice::render(DeviceTask &task, RenderTile &tile, KernelGlobals *kg) +{ + const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE; + + scoped_timer timer(&tile.buffers->render_time); + + Coverage coverage(kg, tile); + if (use_coverage) { + coverage.init_path_trace(); + } + + float *render_buffer = (float *)tile.buffer; + int start_sample = tile.start_sample; + int end_sample = tile.start_sample + tile.num_samples; + + /* Needed for Embree. 
*/ + SIMD_SET_FLUSH_TO_ZERO; + + for (int sample = start_sample; sample < end_sample; sample++) { + if (task.get_cancel() || TaskPool::canceled()) { + if (task.need_finish_queue == false) + break; + } + + if (tile.stealing_state == RenderTile::CAN_BE_STOLEN && task.get_tile_stolen()) { + tile.stealing_state = RenderTile::WAS_STOLEN; + break; + } + + if (tile.task == RenderTile::PATH_TRACE) { + for (int y = tile.y; y < tile.y + tile.h; y++) { + for (int x = tile.x; x < tile.x + tile.w; x++) { + if (use_coverage) { + coverage.init_pixel(x, y); + } + kernels.path_trace(kg, render_buffer, sample, x, y, tile.offset, tile.stride); + } + } + } + else { + for (int y = tile.y; y < tile.y + tile.h; y++) { + for (int x = tile.x; x < tile.x + tile.w; x++) { + kernels.bake(kg, render_buffer, sample, x, y, tile.offset, tile.stride); + } + } + } + tile.sample = sample + 1; + + if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(sample)) { + const bool stop = adaptive_sampling_filter(kg, tile, sample); + if (stop) { + const int num_progress_samples = end_sample - sample; + tile.sample = end_sample; + task.update_progress(&tile, tile.w * tile.h * num_progress_samples); + break; + } + } + + task.update_progress(&tile, tile.w * tile.h); + } + if (use_coverage) { + coverage.finalize(); + } + + if (task.adaptive_sampling.use && (tile.stealing_state != RenderTile::WAS_STOLEN)) { + adaptive_sampling_post(tile, kg); + } +} + +void CPUDevice::thread_render(DeviceTask &task) +{ + if (TaskPool::canceled()) { + if (task.need_finish_queue == false) + return; + } + + /* allocate buffer for kernel globals */ + CPUKernelThreadGlobals kg(kernel_globals, get_cpu_osl_memory()); + + profiler.add_state(&kg.profiler); + + /* NLM denoiser. */ + DenoisingTask *denoising = NULL; + + /* OpenImageDenoise: we can only denoise with one thread at a time, so to + * avoid waiting with mutex locks in the denoiser, we let only a single + * thread acquire denoising tiles. 
*/ + uint tile_types = task.tile_types; + bool hold_denoise_lock = false; + if ((tile_types & RenderTile::DENOISE) && task.denoising.type == DENOISER_OPENIMAGEDENOISE) { + if (!oidn_task_lock.try_lock()) { + tile_types &= ~RenderTile::DENOISE; + hold_denoise_lock = true; + } + } + + RenderTile tile; + while (task.acquire_tile(this, tile, tile_types)) { + if (tile.task == RenderTile::PATH_TRACE) { + render(task, tile, &kg); + } + else if (tile.task == RenderTile::BAKE) { + render(task, tile, &kg); + } + else if (tile.task == RenderTile::DENOISE) { + denoise_openimagedenoise(task, tile); + task.update_progress(&tile, tile.w * tile.h); + } + + task.release_tile(tile); + + if (TaskPool::canceled()) { + if (task.need_finish_queue == false) + break; + } + } + + if (hold_denoise_lock) { + oidn_task_lock.unlock(); + } + + profiler.remove_state(&kg.profiler); + + delete denoising; +} + +void CPUDevice::thread_denoise(DeviceTask &task) +{ + RenderTile tile; + tile.x = task.x; + tile.y = task.y; + tile.w = task.w; + tile.h = task.h; + tile.buffer = task.buffer; + tile.sample = task.sample + task.num_samples; + tile.num_samples = task.num_samples; + tile.start_sample = task.sample; + tile.offset = task.offset; + tile.stride = task.stride; + tile.buffers = task.buffers; + + denoise_openimagedenoise(task, tile); + + task.update_progress(&tile, tile.w * tile.h); +} +#endif + +const CPUKernels *CPUDevice::get_cpu_kernels() const +{ + return &kernels; +} + +void CPUDevice::get_cpu_kernel_thread_globals( + vector<CPUKernelThreadGlobals> &kernel_thread_globals) +{ + /* Ensure latest texture info is loaded into kernel globals before returning. 
*/ + load_texture_info(); + + kernel_thread_globals.clear(); + void *osl_memory = get_cpu_osl_memory(); + for (int i = 0; i < info.cpu_threads; i++) { + kernel_thread_globals.emplace_back(kernel_globals, osl_memory, profiler); + } +} + +void *CPUDevice::get_cpu_osl_memory() +{ +#ifdef WITH_OSL + return &osl_globals; +#else + return NULL; +#endif +} + +bool CPUDevice::load_kernels(const uint /*kernel_features*/) +{ + return true; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/cpu/device_impl.h b/intern/cycles/device/cpu/device_impl.h new file mode 100644 index 00000000000..7d222808652 --- /dev/null +++ b/intern/cycles/device/cpu/device_impl.h @@ -0,0 +1,99 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +/* So ImathMath is included before our kernel_cpu_compat. 
*/ +#ifdef WITH_OSL +/* So no context pollution happens from indirectly included windows.h */ +# include "util/util_windows.h" +# include <OSL/oslexec.h> +#endif + +#ifdef WITH_EMBREE +# include <embree3/rtcore.h> +#endif + +#include "device/cpu/kernel.h" +#include "device/device.h" +#include "device/device_memory.h" + +// clang-format off +#include "kernel/device/cpu/compat.h" +#include "kernel/device/cpu/kernel.h" +#include "kernel/device/cpu/globals.h" + +#include "kernel/osl/osl_shader.h" +#include "kernel/osl/osl_globals.h" +// clang-format on + +CCL_NAMESPACE_BEGIN + +class CPUDevice : public Device { + public: + KernelGlobals kernel_globals; + + device_vector<TextureInfo> texture_info; + bool need_texture_info; + +#ifdef WITH_OSL + OSLGlobals osl_globals; +#endif +#ifdef WITH_EMBREE + RTCScene embree_scene = NULL; + RTCDevice embree_device; +#endif + + CPUKernels kernels; + + CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_); + ~CPUDevice(); + + virtual bool show_samples() const override; + + virtual BVHLayoutMask get_bvh_layout_mask() const override; + + /* Returns true if the texture info was copied to the device (meaning, some more + * re-initialization might be needed). 
*/ + bool load_texture_info(); + + virtual void mem_alloc(device_memory &mem) override; + virtual void mem_copy_to(device_memory &mem) override; + virtual void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override; + virtual void mem_zero(device_memory &mem) override; + virtual void mem_free(device_memory &mem) override; + virtual device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override; + + virtual void const_copy_to(const char *name, void *host, size_t size) override; + + void global_alloc(device_memory &mem); + void global_free(device_memory &mem); + + void tex_alloc(device_texture &mem); + void tex_free(device_texture &mem); + + void build_bvh(BVH *bvh, Progress &progress, bool refit) override; + + virtual const CPUKernels *get_cpu_kernels() const override; + virtual void get_cpu_kernel_thread_globals( + vector<CPUKernelThreadGlobals> &kernel_thread_globals) override; + virtual void *get_cpu_osl_memory() override; + + protected: + virtual bool load_kernels(uint /*kernel_features*/) override; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/cpu/kernel.cpp b/intern/cycles/device/cpu/kernel.cpp new file mode 100644 index 00000000000..0ab58ff8600 --- /dev/null +++ b/intern/cycles/device/cpu/kernel.cpp @@ -0,0 +1,61 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "device/cpu/kernel.h" + +#include "kernel/device/cpu/kernel.h" + +CCL_NAMESPACE_BEGIN + +#define KERNEL_FUNCTIONS(name) \ + KERNEL_NAME_EVAL(cpu, name), KERNEL_NAME_EVAL(cpu_sse2, name), \ + KERNEL_NAME_EVAL(cpu_sse3, name), KERNEL_NAME_EVAL(cpu_sse41, name), \ + KERNEL_NAME_EVAL(cpu_avx, name), KERNEL_NAME_EVAL(cpu_avx2, name) + +#define REGISTER_KERNEL(name) name(KERNEL_FUNCTIONS(name)) + +CPUKernels::CPUKernels() + : /* Integrator. */ + REGISTER_KERNEL(integrator_init_from_camera), + REGISTER_KERNEL(integrator_init_from_bake), + REGISTER_KERNEL(integrator_intersect_closest), + REGISTER_KERNEL(integrator_intersect_shadow), + REGISTER_KERNEL(integrator_intersect_subsurface), + REGISTER_KERNEL(integrator_intersect_volume_stack), + REGISTER_KERNEL(integrator_shade_background), + REGISTER_KERNEL(integrator_shade_light), + REGISTER_KERNEL(integrator_shade_shadow), + REGISTER_KERNEL(integrator_shade_surface), + REGISTER_KERNEL(integrator_shade_volume), + REGISTER_KERNEL(integrator_megakernel), + /* Shader evaluation. */ + REGISTER_KERNEL(shader_eval_displace), + REGISTER_KERNEL(shader_eval_background), + /* Adaptive sampling. */ + REGISTER_KERNEL(adaptive_sampling_convergence_check), + REGISTER_KERNEL(adaptive_sampling_filter_x), + REGISTER_KERNEL(adaptive_sampling_filter_y), + /* Cryptomatte. */ + REGISTER_KERNEL(cryptomatte_postprocess), + /* Bake. */ + REGISTER_KERNEL(bake) +{ +} + +#undef REGISTER_KERNEL +#undef KERNEL_FUNCTIONS + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/cpu/kernel.h b/intern/cycles/device/cpu/kernel.h new file mode 100644 index 00000000000..54b18308544 --- /dev/null +++ b/intern/cycles/device/cpu/kernel.h @@ -0,0 +1,111 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "device/cpu/kernel_function.h" +#include "util/util_types.h" + +CCL_NAMESPACE_BEGIN + +struct KernelGlobals; +struct IntegratorStateCPU; +struct TileInfo; + +class CPUKernels { + public: + /* Integrator. */ + + using IntegratorFunction = + CPUKernelFunction<void (*)(const KernelGlobals *kg, IntegratorStateCPU *state)>; + using IntegratorShadeFunction = CPUKernelFunction<void (*)( + const KernelGlobals *kg, IntegratorStateCPU *state, ccl_global float *render_buffer)>; + using IntegratorInitFunction = CPUKernelFunction<bool (*)(const KernelGlobals *kg, + IntegratorStateCPU *state, + KernelWorkTile *tile, + ccl_global float *render_buffer)>; + + IntegratorInitFunction integrator_init_from_camera; + IntegratorInitFunction integrator_init_from_bake; + IntegratorFunction integrator_intersect_closest; + IntegratorFunction integrator_intersect_shadow; + IntegratorFunction integrator_intersect_subsurface; + IntegratorFunction integrator_intersect_volume_stack; + IntegratorShadeFunction integrator_shade_background; + IntegratorShadeFunction integrator_shade_light; + IntegratorShadeFunction integrator_shade_shadow; + IntegratorShadeFunction integrator_shade_surface; + IntegratorShadeFunction integrator_shade_volume; + IntegratorShadeFunction integrator_megakernel; + + /* Shader evaluation. 
*/ + + using ShaderEvalFunction = CPUKernelFunction<void (*)( + const KernelGlobals *kg, const KernelShaderEvalInput *, float4 *, const int)>; + + ShaderEvalFunction shader_eval_displace; + ShaderEvalFunction shader_eval_background; + + /* Adaptive stopping. */ + + using AdaptiveSamplingConvergenceCheckFunction = + CPUKernelFunction<bool (*)(const KernelGlobals *kg, + ccl_global float *render_buffer, + int x, + int y, + float threshold, + bool reset, + int offset, + int stride)>; + + using AdaptiveSamplingFilterXFunction = + CPUKernelFunction<void (*)(const KernelGlobals *kg, + ccl_global float *render_buffer, + int y, + int start_x, + int width, + int offset, + int stride)>; + + using AdaptiveSamplingFilterYFunction = + CPUKernelFunction<void (*)(const KernelGlobals *kg, + ccl_global float *render_buffer, + int x, + int start_y, + int height, + int offset, + int stride)>; + + AdaptiveSamplingConvergenceCheckFunction adaptive_sampling_convergence_check; + + AdaptiveSamplingFilterXFunction adaptive_sampling_filter_x; + AdaptiveSamplingFilterYFunction adaptive_sampling_filter_y; + + /* Cryptomatte. */ + + using CryptomattePostprocessFunction = CPUKernelFunction<void (*)( + const KernelGlobals *kg, ccl_global float *render_buffer, int pixel_index)>; + + CryptomattePostprocessFunction cryptomatte_postprocess; + + /* Bake. */ + + CPUKernelFunction<void (*)(const KernelGlobals *, float *, int, int, int, int, int)> bake; + + CPUKernels(); +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/cpu/kernel_function.h b/intern/cycles/device/cpu/kernel_function.h new file mode 100644 index 00000000000..aa18720cc24 --- /dev/null +++ b/intern/cycles/device/cpu/kernel_function.h @@ -0,0 +1,124 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "util/util_debug.h" +#include "util/util_system.h" + +CCL_NAMESPACE_BEGIN + +/* A wrapper around per-microarchitecture variant of a kernel function. + * + * Provides a function-call-like API which gets routed to the most suitable implementation. + * + * For example, on a computer which only has SSE4.1 the kernel_sse41 will be used. */ +template<typename FunctionType> class CPUKernelFunction { + public: + CPUKernelFunction(FunctionType kernel_default, + FunctionType kernel_sse2, + FunctionType kernel_sse3, + FunctionType kernel_sse41, + FunctionType kernel_avx, + FunctionType kernel_avx2) + { + kernel_info_ = get_best_kernel_info( + kernel_default, kernel_sse2, kernel_sse3, kernel_sse41, kernel_avx, kernel_avx2); + } + + template<typename... Args> inline auto operator()(Args... args) const + { + assert(kernel_info_.kernel); + + return kernel_info_.kernel(args...); + } + + const char *get_uarch_name() const + { + return kernel_info_.uarch_name; + } + + protected: + /* Helper class which allows to pass human-readable microarchitecture name together with function + * pointer. */ + class KernelInfo { + public: + KernelInfo() : KernelInfo("", nullptr) + { + } + + /* TODO(sergey): Use string view, to have higher-level functionality (i.e. comparison) without + * memory allocation. 
*/ + KernelInfo(const char *uarch_name, FunctionType kernel) + : uarch_name(uarch_name), kernel(kernel) + { + } + + const char *uarch_name; + FunctionType kernel; + }; + + KernelInfo get_best_kernel_info(FunctionType kernel_default, + FunctionType kernel_sse2, + FunctionType kernel_sse3, + FunctionType kernel_sse41, + FunctionType kernel_avx, + FunctionType kernel_avx2) + { + /* Silence warnings about unused variables when compiling without some architectures. */ + (void)kernel_sse2; + (void)kernel_sse3; + (void)kernel_sse41; + (void)kernel_avx; + (void)kernel_avx2; + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 + if (DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) { + return KernelInfo("AVX2", kernel_avx2); + } +#endif + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX + if (DebugFlags().cpu.has_avx() && system_cpu_support_avx()) { + return KernelInfo("AVX", kernel_avx); + } +#endif + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 + if (DebugFlags().cpu.has_sse41() && system_cpu_support_sse41()) { + return KernelInfo("SSE4.1", kernel_sse41); + } +#endif + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 + if (DebugFlags().cpu.has_sse3() && system_cpu_support_sse3()) { + return KernelInfo("SSE3", kernel_sse3); + } +#endif + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 + if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) { + return KernelInfo("SSE2", kernel_sse2); + } +#endif + + return KernelInfo("default", kernel_default); + } + + KernelInfo kernel_info_; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/cpu/kernel_thread_globals.cpp b/intern/cycles/device/cpu/kernel_thread_globals.cpp new file mode 100644 index 00000000000..988b00cd1f0 --- /dev/null +++ b/intern/cycles/device/cpu/kernel_thread_globals.cpp @@ -0,0 +1,85 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device/cpu/kernel_thread_globals.h" + +// clang-format off +#include "kernel/osl/osl_shader.h" +#include "kernel/osl/osl_globals.h" +// clang-format on + +#include "util/util_profiling.h" + +CCL_NAMESPACE_BEGIN + +CPUKernelThreadGlobals::CPUKernelThreadGlobals(const KernelGlobals &kernel_globals, + void *osl_globals_memory, + Profiler &cpu_profiler) + : KernelGlobals(kernel_globals), cpu_profiler_(cpu_profiler) +{ + reset_runtime_memory(); + +#ifdef WITH_OSL + OSLShader::thread_init(this, reinterpret_cast<OSLGlobals *>(osl_globals_memory)); +#else + (void)osl_globals_memory; +#endif +} + +CPUKernelThreadGlobals::CPUKernelThreadGlobals(CPUKernelThreadGlobals &&other) noexcept + : KernelGlobals(std::move(other)), cpu_profiler_(other.cpu_profiler_) +{ + other.reset_runtime_memory(); +} + +CPUKernelThreadGlobals::~CPUKernelThreadGlobals() +{ +#ifdef WITH_OSL + OSLShader::thread_free(this); +#endif +} + +CPUKernelThreadGlobals &CPUKernelThreadGlobals::operator=(CPUKernelThreadGlobals &&other) +{ + if (this == &other) { + return *this; + } + + *static_cast<KernelGlobals *>(this) = *static_cast<KernelGlobals *>(&other); + + other.reset_runtime_memory(); + + return *this; +} + +void CPUKernelThreadGlobals::reset_runtime_memory() +{ +#ifdef WITH_OSL + osl = nullptr; +#endif +} + +void CPUKernelThreadGlobals::start_profiling() +{ + cpu_profiler_.add_state(&profiler); +} + +void CPUKernelThreadGlobals::stop_profiling() +{ + cpu_profiler_.remove_state(&profiler); +} + +CCL_NAMESPACE_END diff --git 
a/intern/cycles/device/cpu/kernel_thread_globals.h b/intern/cycles/device/cpu/kernel_thread_globals.h new file mode 100644 index 00000000000..d005c3bb56c --- /dev/null +++ b/intern/cycles/device/cpu/kernel_thread_globals.h @@ -0,0 +1,57 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "kernel/device/cpu/compat.h" +#include "kernel/device/cpu/globals.h" + +CCL_NAMESPACE_BEGIN + +class Profiler; + +/* A special class which extends memory ownership of the `KernelGlobals` decoupling any resource + * which is not thread-safe for access. Every worker thread which needs to operate on + * `KernelGlobals` needs to initialize its own copy of this object. + * + * NOTE: Only a minimal subset of objects is copied: `KernelData` is never copied. This means that + * there is no unnecessary data duplication happening when using this object. */ +class CPUKernelThreadGlobals : public KernelGlobals { + public: + /* TODO(sergey): Would be nice to have properly typed OSLGlobals even in the case when building + * without OSL support. Will avoid the need for those unnamed pointers and casts. 
*/ + CPUKernelThreadGlobals(const KernelGlobals &kernel_globals, + void *osl_globals_memory, + Profiler &cpu_profiler); + + ~CPUKernelThreadGlobals(); + + CPUKernelThreadGlobals(const CPUKernelThreadGlobals &other) = delete; + CPUKernelThreadGlobals(CPUKernelThreadGlobals &&other) noexcept; + + CPUKernelThreadGlobals &operator=(const CPUKernelThreadGlobals &other) = delete; + CPUKernelThreadGlobals &operator=(CPUKernelThreadGlobals &&other); + + void start_profiling(); + void stop_profiling(); + + protected: + void reset_runtime_memory(); + + Profiler &cpu_profiler_; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/cuda/device.cpp index 2e225ecfaf8..84becd6d081 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/cuda/device.cpp @@ -14,21 +14,25 @@ * limitations under the License. */ -#ifdef WITH_CUDA +#include "device/cuda/device.h" + +#include "util/util_logging.h" -# include "device/cuda/device_cuda.h" +#ifdef WITH_CUDA +# include "device/cuda/device_impl.h" # include "device/device.h" -# include "device/device_intern.h" -# include "util/util_logging.h" # include "util/util_string.h" # include "util/util_windows.h" +#endif /* WITH_CUDA */ CCL_NAMESPACE_BEGIN bool device_cuda_init() { -# ifdef WITH_CUDA_DYNLOAD +#if !defined(WITH_CUDA) + return false; +#elif defined(WITH_CUDA_DYNLOAD) static bool initialized = false; static bool result = false; @@ -59,16 +63,27 @@ bool device_cuda_init() } return result; -# else /* WITH_CUDA_DYNLOAD */ +#else /* WITH_CUDA_DYNLOAD */ return true; -# endif /* WITH_CUDA_DYNLOAD */ +#endif /* WITH_CUDA_DYNLOAD */ } -Device *device_cuda_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) +Device *device_cuda_create(const DeviceInfo &info, Stats &stats, Profiler &profiler) { - return new CUDADevice(info, stats, profiler, background); +#ifdef WITH_CUDA + return new CUDADevice(info, stats, profiler); +#else + (void)info; + (void)stats; + 
(void)profiler; + + LOG(FATAL) << "Request to create CUDA device without compiled-in support. Should never happen."; + + return nullptr; +#endif } +#ifdef WITH_CUDA static CUresult device_cuda_safe_init() { # ifdef _WIN32 @@ -86,9 +101,11 @@ static CUresult device_cuda_safe_init() return cuInit(0); # endif } +#endif /* WITH_CUDA */ void device_cuda_info(vector<DeviceInfo> &devices) { +#ifdef WITH_CUDA CUresult result = device_cuda_safe_init(); if (result != CUDA_SUCCESS) { if (result != CUDA_ERROR_NO_DEVICE) @@ -129,9 +146,9 @@ void device_cuda_info(vector<DeviceInfo> &devices) info.has_half_images = (major >= 3); info.has_nanovdb = true; - info.has_volume_decoupled = false; - info.has_adaptive_stop_per_sample = false; - info.denoisers = DENOISER_NLM; + info.denoisers = 0; + + info.has_gpu_queue = true; /* Check if the device has P2P access to any other device in the system. */ for (int peer_num = 0; peer_num < count && !info.has_peer_memory; peer_num++) { @@ -182,10 +199,14 @@ void device_cuda_info(vector<DeviceInfo> &devices) if (!display_devices.empty()) devices.insert(devices.end(), display_devices.begin(), display_devices.end()); +#else /* WITH_CUDA */ + (void)devices; +#endif /* WITH_CUDA */ } string device_cuda_capabilities() { +#ifdef WITH_CUDA CUresult result = device_cuda_safe_init(); if (result != CUDA_SUCCESS) { if (result != CUDA_ERROR_NO_DEVICE) { @@ -310,8 +331,10 @@ string device_cuda_capabilities() } return capabilities; + +#else /* WITH_CUDA */ + return ""; +#endif /* WITH_CUDA */ } CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl b/intern/cycles/device/cuda/device.h index e68d4104a91..b0484904d1a 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl +++ b/intern/cycles/device/cuda/device.h @@ -1,5 +1,5 @@ /* - * Copyright 2011-2017 Blender Foundation + * Copyright 2011-2021 Blender Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may 
not use this file except in compliance with the License. @@ -14,13 +14,24 @@ * limitations under the License. */ -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_enqueue_inactive.h" +#pragma once -#define KERNEL_NAME enqueue_inactive -#define LOCALS_TYPE unsigned int -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME -#undef LOCALS_TYPE +#include "util/util_string.h" +#include "util/util_vector.h" +CCL_NAMESPACE_BEGIN + +class Device; +class DeviceInfo; +class Profiler; +class Stats; + +bool device_cuda_init(); + +Device *device_cuda_create(const DeviceInfo &info, Stats &stats, Profiler &profiler); + +void device_cuda_info(vector<DeviceInfo> &devices); + +string device_cuda_capabilities(); + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/cuda/device_cuda.h b/intern/cycles/device/cuda/device_cuda.h deleted file mode 100644 index c3271c3cfcf..00000000000 --- a/intern/cycles/device/cuda/device_cuda.h +++ /dev/null @@ -1,270 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifdef WITH_CUDA - -# include "device/device.h" -# include "device/device_denoising.h" -# include "device/device_split_kernel.h" - -# include "util/util_map.h" -# include "util/util_task.h" - -# ifdef WITH_CUDA_DYNLOAD -# include "cuew.h" -# else -# include "util/util_opengl.h" -# include <cuda.h> -# include <cudaGL.h> -# endif - -CCL_NAMESPACE_BEGIN - -class CUDASplitKernel; - -class CUDADevice : public Device { - - friend class CUDASplitKernelFunction; - friend class CUDASplitKernel; - friend class CUDAContextScope; - - public: - DedicatedTaskPool task_pool; - CUdevice cuDevice; - CUcontext cuContext; - CUmodule cuModule, cuFilterModule; - size_t device_texture_headroom; - size_t device_working_headroom; - bool move_texture_to_host; - size_t map_host_used; - size_t map_host_limit; - int can_map_host; - int pitch_alignment; - int cuDevId; - int cuDevArchitecture; - bool first_error; - CUDASplitKernel *split_kernel; - - struct CUDAMem { - CUDAMem() : texobject(0), array(0), use_mapped_host(false) - { - } - - CUtexObject texobject; - CUarray array; - - /* If true, a mapped host memory in shared_pointer is being used. 
*/ - bool use_mapped_host; - }; - typedef map<device_memory *, CUDAMem> CUDAMemMap; - CUDAMemMap cuda_mem_map; - thread_mutex cuda_mem_map_mutex; - - struct PixelMem { - GLuint cuPBO; - CUgraphicsResource cuPBOresource; - GLuint cuTexId; - int w, h; - }; - map<device_ptr, PixelMem> pixel_mem_map; - - /* Bindless Textures */ - device_vector<TextureInfo> texture_info; - bool need_texture_info; - - /* Kernels */ - struct { - bool loaded; - - CUfunction adaptive_stopping; - CUfunction adaptive_filter_x; - CUfunction adaptive_filter_y; - CUfunction adaptive_scale_samples; - int adaptive_num_threads_per_block; - } functions; - - static bool have_precompiled_kernels(); - - virtual bool show_samples() const override; - - virtual BVHLayoutMask get_bvh_layout_mask() const override; - - void set_error(const string &error) override; - - CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_); - - virtual ~CUDADevice(); - - bool support_device(const DeviceRequestedFeatures & /*requested_features*/); - - bool check_peer_access(Device *peer_device) override; - - bool use_adaptive_compilation(); - - bool use_split_kernel(); - - virtual string compile_kernel_get_common_cflags( - const DeviceRequestedFeatures &requested_features, bool filter = false, bool split = false); - - string compile_kernel(const DeviceRequestedFeatures &requested_features, - const char *name, - const char *base = "cuda", - bool force_ptx = false); - - virtual bool load_kernels(const DeviceRequestedFeatures &requested_features) override; - - void load_functions(); - - void reserve_local_memory(const DeviceRequestedFeatures &requested_features); - - void init_host_memory(); - - void load_texture_info(); - - void move_textures_to_host(size_t size, bool for_texture); - - CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0); - - void generic_copy_to(device_memory &mem); - - void generic_free(device_memory &mem); - - void mem_alloc(device_memory &mem) override; - - void 
mem_copy_to(device_memory &mem) override; - - void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override; - - void mem_zero(device_memory &mem) override; - - void mem_free(device_memory &mem) override; - - device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override; - - virtual void const_copy_to(const char *name, void *host, size_t size) override; - - void global_alloc(device_memory &mem); - - void global_free(device_memory &mem); - - void tex_alloc(device_texture &mem); - - void tex_free(device_texture &mem); - - bool denoising_non_local_means(device_ptr image_ptr, - device_ptr guide_ptr, - device_ptr variance_ptr, - device_ptr out_ptr, - DenoisingTask *task); - - bool denoising_construct_transform(DenoisingTask *task); - - bool denoising_accumulate(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr scale_ptr, - int frame, - DenoisingTask *task); - - bool denoising_solve(device_ptr output_ptr, DenoisingTask *task); - - bool denoising_combine_halves(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr mean_ptr, - device_ptr variance_ptr, - int r, - int4 rect, - DenoisingTask *task); - - bool denoising_divide_shadow(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr sample_variance_ptr, - device_ptr sv_variance_ptr, - device_ptr buffer_variance_ptr, - DenoisingTask *task); - - bool denoising_get_feature(int mean_offset, - int variance_offset, - device_ptr mean_ptr, - device_ptr variance_ptr, - float scale, - DenoisingTask *task); - - bool denoising_write_feature(int out_offset, - device_ptr from_ptr, - device_ptr buffer_ptr, - DenoisingTask *task); - - bool denoising_detect_outliers(device_ptr image_ptr, - device_ptr variance_ptr, - device_ptr depth_ptr, - device_ptr output_ptr, - DenoisingTask *task); - - void denoise(RenderTile &rtile, DenoisingTask &denoising); - - void adaptive_sampling_filter(uint filter_sample, - WorkTile *wtile, - CUdeviceptr d_wtile, - CUstream stream = 0); - void 
adaptive_sampling_post(RenderTile &rtile, - WorkTile *wtile, - CUdeviceptr d_wtile, - CUstream stream = 0); - - void render(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles); - - void film_convert(DeviceTask &task, - device_ptr buffer, - device_ptr rgba_byte, - device_ptr rgba_half); - - void shader(DeviceTask &task); - - CUdeviceptr map_pixels(device_ptr mem); - - void unmap_pixels(device_ptr mem); - - void pixels_alloc(device_memory &mem); - - void pixels_copy_from(device_memory &mem, int y, int w, int h); - - void pixels_free(device_memory &mem); - - void draw_pixels(device_memory &mem, - int y, - int w, - int h, - int width, - int height, - int dx, - int dy, - int dw, - int dh, - bool transparent, - const DeviceDrawParams &draw_params) override; - - void thread_run(DeviceTask &task); - - virtual void task_add(DeviceTask &task) override; - - virtual void task_wait() override; - - virtual void task_cancel() override; -}; - -CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp b/intern/cycles/device/cuda/device_cuda_impl.cpp deleted file mode 100644 index 2d2fcb38705..00000000000 --- a/intern/cycles/device/cuda/device_cuda_impl.cpp +++ /dev/null @@ -1,2714 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifdef WITH_CUDA - -# include <climits> -# include <limits.h> -# include <stdio.h> -# include <stdlib.h> -# include <string.h> - -# include "device/cuda/device_cuda.h" -# include "device/device_intern.h" -# include "device/device_split_kernel.h" - -# include "render/buffers.h" - -# include "kernel/filter/filter_defines.h" - -# include "util/util_debug.h" -# include "util/util_foreach.h" -# include "util/util_logging.h" -# include "util/util_map.h" -# include "util/util_md5.h" -# include "util/util_opengl.h" -# include "util/util_path.h" -# include "util/util_string.h" -# include "util/util_system.h" -# include "util/util_time.h" -# include "util/util_types.h" -# include "util/util_windows.h" - -# include "kernel/split/kernel_split_data_types.h" - -CCL_NAMESPACE_BEGIN - -# ifndef WITH_CUDA_DYNLOAD - -/* Transparently implement some functions, so majority of the file does not need - * to worry about difference between dynamically loaded and linked CUDA at all. - */ - -namespace { - -const char *cuewErrorString(CUresult result) -{ - /* We can only give error code here without major code duplication, that - * should be enough since dynamic loading is only being disabled by folks - * who knows what they're doing anyway. - * - * NOTE: Avoid call from several threads. 
- */ - static string error; - error = string_printf("%d", result); - return error.c_str(); -} - -const char *cuewCompilerPath() -{ - return CYCLES_CUDA_NVCC_EXECUTABLE; -} - -int cuewCompilerVersion() -{ - return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10); -} - -} /* namespace */ -# endif /* WITH_CUDA_DYNLOAD */ - -class CUDADevice; - -class CUDASplitKernel : public DeviceSplitKernel { - CUDADevice *device; - - public: - explicit CUDASplitKernel(CUDADevice *device); - - virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads); - - virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim, - RenderTile &rtile, - int num_global_elements, - device_memory &kernel_globals, - device_memory &kernel_data_, - device_memory &split_data, - device_memory &ray_state, - device_memory &queue_index, - device_memory &use_queues_flag, - device_memory &work_pool_wgs); - - virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name, - const DeviceRequestedFeatures &); - virtual int2 split_kernel_local_size(); - virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask &task); -}; - -/* Utility to push/pop CUDA context. */ -class CUDAContextScope { - public: - CUDAContextScope(CUDADevice *device); - ~CUDAContextScope(); - - private: - CUDADevice *device; -}; - -bool CUDADevice::have_precompiled_kernels() -{ - string cubins_path = path_get("lib"); - return path_exists(cubins_path); -} - -bool CUDADevice::show_samples() const -{ - /* The CUDADevice only processes one tile at a time, so showing samples is fine. 
*/ - return true; -} - -BVHLayoutMask CUDADevice::get_bvh_layout_mask() const -{ - return BVH_LAYOUT_BVH2; -} - -void CUDADevice::set_error(const string &error) -{ - Device::set_error(error); - - if (first_error) { - fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n"); - fprintf(stderr, - "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n"); - first_error = false; - } -} - -# define cuda_assert(stmt) \ - { \ - CUresult result = stmt; \ - if (result != CUDA_SUCCESS) { \ - const char *name = cuewErrorString(result); \ - set_error(string_printf("%s in %s (device_cuda_impl.cpp:%d)", name, #stmt, __LINE__)); \ - } \ - } \ - (void)0 - -CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_) - : Device(info, stats, profiler, background_), texture_info(this, "__texture_info", MEM_GLOBAL) -{ - first_error = true; - background = background_; - - cuDevId = info.num; - cuDevice = 0; - cuContext = 0; - - cuModule = 0; - cuFilterModule = 0; - - split_kernel = NULL; - - need_texture_info = false; - - device_texture_headroom = 0; - device_working_headroom = 0; - move_texture_to_host = false; - map_host_limit = 0; - map_host_used = 0; - can_map_host = 0; - pitch_alignment = 0; - - functions.loaded = false; - - /* Initialize CUDA. */ - CUresult result = cuInit(0); - if (result != CUDA_SUCCESS) { - set_error(string_printf("Failed to initialize CUDA runtime (%s)", cuewErrorString(result))); - return; - } - - /* Setup device and context. */ - result = cuDeviceGet(&cuDevice, cuDevId); - if (result != CUDA_SUCCESS) { - set_error(string_printf("Failed to get CUDA device handle from ordinal (%s)", - cuewErrorString(result))); - return; - } - - /* CU_CTX_MAP_HOST for mapping host memory when out of device memory. - * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render, - * so we can predict which memory to map to host. 
*/ - cuda_assert( - cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice)); - - cuda_assert(cuDeviceGetAttribute( - &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice)); - - unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX; - if (can_map_host) { - ctx_flags |= CU_CTX_MAP_HOST; - init_host_memory(); - } - - /* Create context. */ - if (background) { - result = cuCtxCreate(&cuContext, ctx_flags, cuDevice); - } - else { - result = cuGLCtxCreate(&cuContext, ctx_flags, cuDevice); - - if (result != CUDA_SUCCESS) { - result = cuCtxCreate(&cuContext, ctx_flags, cuDevice); - background = true; - } - } - - if (result != CUDA_SUCCESS) { - set_error(string_printf("Failed to create CUDA context (%s)", cuewErrorString(result))); - return; - } - - int major, minor; - cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); - cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); - cuDevArchitecture = major * 100 + minor * 10; - - /* Pop context set by cuCtxCreate. 
*/ - cuCtxPopCurrent(NULL); -} - -CUDADevice::~CUDADevice() -{ - task_pool.cancel(); - - delete split_kernel; - - texture_info.free(); - - cuda_assert(cuCtxDestroy(cuContext)); -} - -bool CUDADevice::support_device(const DeviceRequestedFeatures & /*requested_features*/) -{ - int major, minor; - cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); - cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); - - /* We only support sm_30 and above */ - if (major < 3) { - set_error(string_printf( - "CUDA backend requires compute capability 3.0 or up, but found %d.%d.", major, minor)); - return false; - } - - return true; -} - -bool CUDADevice::check_peer_access(Device *peer_device) -{ - if (peer_device == this) { - return false; - } - if (peer_device->info.type != DEVICE_CUDA && peer_device->info.type != DEVICE_OPTIX) { - return false; - } - - CUDADevice *const peer_device_cuda = static_cast<CUDADevice *>(peer_device); - - int can_access = 0; - cuda_assert(cuDeviceCanAccessPeer(&can_access, cuDevice, peer_device_cuda->cuDevice)); - if (can_access == 0) { - return false; - } - - // Ensure array access over the link is possible as well (for 3D textures) - cuda_assert(cuDeviceGetP2PAttribute(&can_access, - CU_DEVICE_P2P_ATTRIBUTE_ARRAY_ACCESS_ACCESS_SUPPORTED, - cuDevice, - peer_device_cuda->cuDevice)); - if (can_access == 0) { - return false; - } - - // Enable peer access in both directions - { - const CUDAContextScope scope(this); - CUresult result = cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0); - if (result != CUDA_SUCCESS) { - set_error(string_printf("Failed to enable peer access on CUDA context (%s)", - cuewErrorString(result))); - return false; - } - } - { - const CUDAContextScope scope(peer_device_cuda); - CUresult result = cuCtxEnablePeerAccess(cuContext, 0); - if (result != CUDA_SUCCESS) { - set_error(string_printf("Failed to enable peer access on CUDA context (%s)", - cuewErrorString(result))); - 
return false; - } - } - - return true; -} - -bool CUDADevice::use_adaptive_compilation() -{ - return DebugFlags().cuda.adaptive_compile; -} - -bool CUDADevice::use_split_kernel() -{ - return DebugFlags().cuda.split_kernel; -} - -/* Common NVCC flags which stays the same regardless of shading model, - * kernel sources md5 and only depends on compiler or compilation settings. - */ -string CUDADevice::compile_kernel_get_common_cflags( - const DeviceRequestedFeatures &requested_features, bool filter, bool split) -{ - const int machine = system_cpu_bits(); - const string source_path = path_get("source"); - const string include_path = source_path; - string cflags = string_printf( - "-m%d " - "--ptxas-options=\"-v\" " - "--use_fast_math " - "-DNVCC " - "-I\"%s\"", - machine, - include_path.c_str()); - if (!filter && use_adaptive_compilation()) { - cflags += " " + requested_features.get_build_options(); - } - const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS"); - if (extra_cflags) { - cflags += string(" ") + string(extra_cflags); - } - - if (split) { - cflags += " -D__SPLIT__"; - } - -# ifdef WITH_NANOVDB - cflags += " -DWITH_NANOVDB"; -# endif - - return cflags; -} - -string CUDADevice::compile_kernel(const DeviceRequestedFeatures &requested_features, - const char *name, - const char *base, - bool force_ptx) -{ - /* Compute kernel name. */ - int major, minor; - cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); - cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); - - /* Attempt to use kernel provided with Blender. 
*/ - if (!use_adaptive_compilation()) { - if (!force_ptx) { - const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor)); - VLOG(1) << "Testing for pre-compiled kernel " << cubin << "."; - if (path_exists(cubin)) { - VLOG(1) << "Using precompiled kernel."; - return cubin; - } - } - - /* The driver can JIT-compile PTX generated for older generations, so find the closest one. */ - int ptx_major = major, ptx_minor = minor; - while (ptx_major >= 3) { - const string ptx = path_get( - string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor)); - VLOG(1) << "Testing for pre-compiled kernel " << ptx << "."; - if (path_exists(ptx)) { - VLOG(1) << "Using precompiled kernel."; - return ptx; - } - - if (ptx_minor > 0) { - ptx_minor--; - } - else { - ptx_major--; - ptx_minor = 9; - } - } - } - - /* Try to use locally compiled kernel. */ - string source_path = path_get("source"); - const string source_md5 = path_files_md5_hash(source_path); - - /* We include cflags into md5 so changing cuda toolkit or changing other - * compiler command line arguments makes sure cubin gets re-built. - */ - string common_cflags = compile_kernel_get_common_cflags( - requested_features, strstr(name, "filter") != NULL, strstr(name, "split") != NULL); - const string kernel_md5 = util_md5_string(source_md5 + common_cflags); - - const char *const kernel_ext = force_ptx ? "ptx" : "cubin"; - const char *const kernel_arch = force_ptx ? 
"compute" : "sm"; - const string cubin_file = string_printf( - "cycles_%s_%s_%d%d_%s.%s", name, kernel_arch, major, minor, kernel_md5.c_str(), kernel_ext); - const string cubin = path_cache_get(path_join("kernels", cubin_file)); - VLOG(1) << "Testing for locally compiled kernel " << cubin << "."; - if (path_exists(cubin)) { - VLOG(1) << "Using locally compiled kernel."; - return cubin; - } - -# ifdef _WIN32 - if (!use_adaptive_compilation() && have_precompiled_kernels()) { - if (major < 3) { - set_error( - string_printf("CUDA backend requires compute capability 3.0 or up, but found %d.%d. " - "Your GPU is not supported.", - major, - minor)); - } - else { - set_error( - string_printf("CUDA binary kernel for this graphics card compute " - "capability (%d.%d) not found.", - major, - minor)); - } - return string(); - } -# endif - - /* Compile. */ - const char *const nvcc = cuewCompilerPath(); - if (nvcc == NULL) { - set_error( - "CUDA nvcc compiler not found. " - "Install CUDA toolkit in default location."); - return string(); - } - - const int nvcc_cuda_version = cuewCompilerVersion(); - VLOG(1) << "Found nvcc " << nvcc << ", CUDA version " << nvcc_cuda_version << "."; - if (nvcc_cuda_version < 101) { - printf( - "Unsupported CUDA version %d.%d detected, " - "you need CUDA 10.1 or newer.\n", - nvcc_cuda_version / 10, - nvcc_cuda_version % 10); - return string(); - } - else if (!(nvcc_cuda_version == 101 || nvcc_cuda_version == 102 || nvcc_cuda_version == 111 || - nvcc_cuda_version == 112 || nvcc_cuda_version == 113 || nvcc_cuda_version == 114)) { - printf( - "CUDA version %d.%d detected, build may succeed but only " - "CUDA 10.1 to 11.4 are officially supported.\n", - nvcc_cuda_version / 10, - nvcc_cuda_version % 10); - } - - double starttime = time_dt(); - - path_create_directories(cubin); - - source_path = path_join(path_join(source_path, "kernel"), - path_join("kernels", path_join(base, string_printf("%s.cu", name)))); - - string command = string_printf( - "\"%s\" 
" - "-arch=%s_%d%d " - "--%s \"%s\" " - "-o \"%s\" " - "%s", - nvcc, - kernel_arch, - major, - minor, - kernel_ext, - source_path.c_str(), - cubin.c_str(), - common_cflags.c_str()); - - printf("Compiling CUDA kernel ...\n%s\n", command.c_str()); - -# ifdef _WIN32 - command = "call " + command; -# endif - if (system(command.c_str()) != 0) { - set_error( - "Failed to execute compilation command, " - "see console for details."); - return string(); - } - - /* Verify if compilation succeeded */ - if (!path_exists(cubin)) { - set_error( - "CUDA kernel compilation failed, " - "see console for details."); - return string(); - } - - printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime); - - return cubin; -} - -bool CUDADevice::load_kernels(const DeviceRequestedFeatures &requested_features) -{ - /* TODO(sergey): Support kernels re-load for CUDA devices. - * - * Currently re-loading kernel will invalidate memory pointers, - * causing problems in cuCtxSynchronize. - */ - if (cuFilterModule && cuModule) { - VLOG(1) << "Skipping kernel reload, not currently supported."; - return true; - } - - /* check if cuda init succeeded */ - if (cuContext == 0) - return false; - - /* check if GPU is supported */ - if (!support_device(requested_features)) - return false; - - /* get kernel */ - const char *kernel_name = use_split_kernel() ? 
"kernel_split" : "kernel"; - string cubin = compile_kernel(requested_features, kernel_name); - if (cubin.empty()) - return false; - - const char *filter_name = "filter"; - string filter_cubin = compile_kernel(requested_features, filter_name); - if (filter_cubin.empty()) - return false; - - /* open module */ - CUDAContextScope scope(this); - - string cubin_data; - CUresult result; - - if (path_read_text(cubin, cubin_data)) - result = cuModuleLoadData(&cuModule, cubin_data.c_str()); - else - result = CUDA_ERROR_FILE_NOT_FOUND; - - if (result != CUDA_SUCCESS) - set_error(string_printf( - "Failed to load CUDA kernel from '%s' (%s)", cubin.c_str(), cuewErrorString(result))); - - if (path_read_text(filter_cubin, cubin_data)) - result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str()); - else - result = CUDA_ERROR_FILE_NOT_FOUND; - - if (result != CUDA_SUCCESS) - set_error(string_printf("Failed to load CUDA kernel from '%s' (%s)", - filter_cubin.c_str(), - cuewErrorString(result))); - - if (result == CUDA_SUCCESS) { - reserve_local_memory(requested_features); - } - - load_functions(); - - return (result == CUDA_SUCCESS); -} - -void CUDADevice::load_functions() -{ - /* TODO: load all functions here. 
*/ - if (functions.loaded) { - return; - } - functions.loaded = true; - - cuda_assert(cuModuleGetFunction( - &functions.adaptive_stopping, cuModule, "kernel_cuda_adaptive_stopping")); - cuda_assert(cuModuleGetFunction( - &functions.adaptive_filter_x, cuModule, "kernel_cuda_adaptive_filter_x")); - cuda_assert(cuModuleGetFunction( - &functions.adaptive_filter_y, cuModule, "kernel_cuda_adaptive_filter_y")); - cuda_assert(cuModuleGetFunction( - &functions.adaptive_scale_samples, cuModule, "kernel_cuda_adaptive_scale_samples")); - - cuda_assert(cuFuncSetCacheConfig(functions.adaptive_stopping, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(functions.adaptive_filter_x, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(functions.adaptive_filter_y, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(functions.adaptive_scale_samples, CU_FUNC_CACHE_PREFER_L1)); - - int unused_min_blocks; - cuda_assert(cuOccupancyMaxPotentialBlockSize(&unused_min_blocks, - &functions.adaptive_num_threads_per_block, - functions.adaptive_scale_samples, - NULL, - 0, - 0)); -} - -void CUDADevice::reserve_local_memory(const DeviceRequestedFeatures &requested_features) -{ - if (use_split_kernel()) { - /* Split kernel mostly uses global memory and adaptive compilation, - * difficult to predict how much is needed currently. */ - return; - } - - /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory - * needed for kernel launches, so that we can reliably figure out when - * to allocate scene data in mapped host memory. */ - CUDAContextScope scope(this); - - size_t total = 0, free_before = 0, free_after = 0; - cuMemGetInfo(&free_before, &total); - - /* Get kernel function. 
*/ - CUfunction cuRender; - - if (requested_features.use_baking) { - cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake")); - } - else if (requested_features.use_integrator_branched) { - cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace")); - } - else { - cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace")); - } - - cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1)); - - int min_blocks, num_threads_per_block; - cuda_assert( - cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0)); - - /* Launch kernel, using just 1 block appears sufficient to reserve - * memory for all multiprocessors. It would be good to do this in - * parallel for the multi GPU case still to make it faster. */ - CUdeviceptr d_work_tiles = 0; - uint total_work_size = 0; - - void *args[] = {&d_work_tiles, &total_work_size}; - - cuda_assert(cuLaunchKernel(cuRender, 1, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0)); - - cuda_assert(cuCtxSynchronize()); - - cuMemGetInfo(&free_after, &total); - VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after) - << " bytes. (" << string_human_readable_size(free_before - free_after) << ")"; - -# if 0 - /* For testing mapped host memory, fill up device memory. */ - const size_t keep_mb = 1024; - - while (free_after > keep_mb * 1024 * 1024LL) { - CUdeviceptr tmp; - cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL)); - cuMemGetInfo(&free_after, &total); - } -# endif -} - -void CUDADevice::init_host_memory() -{ - /* Limit amount of host mapped memory, because allocating too much can - * cause system instability. Leave at least half or 4 GB of system - * memory free, whichever is smaller. 
*/ - size_t default_limit = 4 * 1024 * 1024 * 1024LL; - size_t system_ram = system_physical_ram(); - - if (system_ram > 0) { - if (system_ram / 2 > default_limit) { - map_host_limit = system_ram - default_limit; - } - else { - map_host_limit = system_ram / 2; - } - } - else { - VLOG(1) << "Mapped host memory disabled, failed to get system RAM"; - map_host_limit = 0; - } - - /* Amount of device memory to keep is free after texture memory - * and working memory allocations respectively. We set the working - * memory limit headroom lower so that some space is left after all - * texture memory allocations. */ - device_working_headroom = 32 * 1024 * 1024LL; // 32MB - device_texture_headroom = 128 * 1024 * 1024LL; // 128MB - - VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit) - << " bytes. (" << string_human_readable_size(map_host_limit) << ")"; -} - -void CUDADevice::load_texture_info() -{ - if (need_texture_info) { - /* Unset flag before copying, so this does not loop indefinitely if the copy below calls - * into 'move_textures_to_host' (which calls 'load_texture_info' again). */ - need_texture_info = false; - texture_info.copy_to_device(); - } -} - -void CUDADevice::move_textures_to_host(size_t size, bool for_texture) -{ - /* Break out of recursive call, which can happen when moving memory on a multi device. */ - static bool any_device_moving_textures_to_host = false; - if (any_device_moving_textures_to_host) { - return; - } - - /* Signal to reallocate textures in host memory only. */ - move_texture_to_host = true; - - while (size > 0) { - /* Find suitable memory allocation to move. 
*/ - device_memory *max_mem = NULL; - size_t max_size = 0; - bool max_is_image = false; - - thread_scoped_lock lock(cuda_mem_map_mutex); - foreach (CUDAMemMap::value_type &pair, cuda_mem_map) { - device_memory &mem = *pair.first; - CUDAMem *cmem = &pair.second; - - /* Can only move textures allocated on this device (and not those from peer devices). - * And need to ignore memory that is already on the host. */ - if (!mem.is_resident(this) || cmem->use_mapped_host) { - continue; - } - - bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && - (&mem != &texture_info); - bool is_image = is_texture && (mem.data_height > 1); - - /* Can't move this type of memory. */ - if (!is_texture || cmem->array) { - continue; - } - - /* For other textures, only move image textures. */ - if (for_texture && !is_image) { - continue; - } - - /* Try to move largest allocation, prefer moving images. */ - if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) { - max_is_image = is_image; - max_size = mem.device_size; - max_mem = &mem; - } - } - lock.unlock(); - - /* Move to host memory. This part is mutex protected since - * multiple CUDA devices could be moving the memory. The - * first one will do it, and the rest will adopt the pointer. */ - if (max_mem) { - VLOG(1) << "Move memory from device to host: " << max_mem->name; - - static thread_mutex move_mutex; - thread_scoped_lock lock(move_mutex); - - any_device_moving_textures_to_host = true; - - /* Potentially need to call back into multi device, so pointer mapping - * and peer devices are updated. This is also necessary since the device - * pointer may just be a key here, so cannot be accessed and freed directly. 
- * Unfortunately it does mean that memory is reallocated on all other - * devices as well, which is potentially dangerous when still in use (since - * a thread rendering on another devices would only be caught in this mutex - * if it so happens to do an allocation at the same time as well. */ - max_mem->device_copy_to(); - size = (max_size >= size) ? 0 : size - max_size; - - any_device_moving_textures_to_host = false; - } - else { - break; - } - } - - /* Unset flag before texture info is reloaded, since it should stay in device memory. */ - move_texture_to_host = false; - - /* Update texture info array with new pointers. */ - load_texture_info(); -} - -CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding) -{ - CUDAContextScope scope(this); - - CUdeviceptr device_pointer = 0; - size_t size = mem.memory_size() + pitch_padding; - - CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY; - const char *status = ""; - - /* First try allocating in device memory, respecting headroom. We make - * an exception for texture info. It is small and frequently accessed, - * so treat it as working memory. - * - * If there is not enough room for working memory, we will try to move - * textures to host memory, assuming the performance impact would have - * been worse for working memory. */ - bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info); - bool is_image = is_texture && (mem.data_height > 1); - - size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom; - - size_t total = 0, free = 0; - cuMemGetInfo(&free, &total); - - /* Move textures to host memory if needed. */ - if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) { - move_textures_to_host(size + headroom - free, is_texture); - cuMemGetInfo(&free, &total); - } - - /* Allocate in device memory. 
*/ - if (!move_texture_to_host && (size + headroom) < free) { - mem_alloc_result = cuMemAlloc(&device_pointer, size); - if (mem_alloc_result == CUDA_SUCCESS) { - status = " in device memory"; - } - } - - /* Fall back to mapped host memory if needed and possible. */ - - void *shared_pointer = 0; - - if (mem_alloc_result != CUDA_SUCCESS && can_map_host && mem.type != MEM_DEVICE_ONLY) { - if (mem.shared_pointer) { - /* Another device already allocated host memory. */ - mem_alloc_result = CUDA_SUCCESS; - shared_pointer = mem.shared_pointer; - } - else if (map_host_used + size < map_host_limit) { - /* Allocate host memory ourselves. */ - mem_alloc_result = cuMemHostAlloc( - &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED); - - assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) || - (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0)); - } - - if (mem_alloc_result == CUDA_SUCCESS) { - cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0)); - map_host_used += size; - status = " in host memory"; - } - } - - if (mem_alloc_result != CUDA_SUCCESS) { - if (mem.type == MEM_DEVICE_ONLY) { - status = " failed, out of device memory"; - set_error("System is out of GPU memory"); - } - else { - status = " failed, out of device and host memory"; - set_error("System is out of GPU and shared host memory"); - } - } - - if (mem.name) { - VLOG(1) << "Buffer allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")" << status; - } - - mem.device_pointer = (device_ptr)device_pointer; - mem.device_size = size; - stats.mem_alloc(size); - - if (!mem.device_pointer) { - return NULL; - } - - /* Insert into map of allocations. */ - thread_scoped_lock lock(cuda_mem_map_mutex); - CUDAMem *cmem = &cuda_mem_map[&mem]; - if (shared_pointer != 0) { - /* Replace host pointer with our host allocation. 
Only works if - * CUDA memory layout is the same and has no pitch padding. Also - * does not work if we move textures to host during a render, - * since other devices might be using the memory. */ - - if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer && - mem.host_pointer != shared_pointer) { - memcpy(shared_pointer, mem.host_pointer, size); - - /* A Call to device_memory::host_free() should be preceded by - * a call to device_memory::device_free() for host memory - * allocated by a device to be handled properly. Two exceptions - * are here and a call in OptiXDevice::generic_alloc(), where - * the current host memory can be assumed to be allocated by - * device_memory::host_alloc(), not by a device */ - - mem.host_free(); - mem.host_pointer = shared_pointer; - } - mem.shared_pointer = shared_pointer; - mem.shared_counter++; - cmem->use_mapped_host = true; - } - else { - cmem->use_mapped_host = false; - } - - return cmem; -} - -void CUDADevice::generic_copy_to(device_memory &mem) -{ - if (!mem.host_pointer || !mem.device_pointer) { - return; - } - - /* If use_mapped_host of mem is false, the current device only uses device memory allocated by - * cuMemAlloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from - * mem.host_pointer. */ - thread_scoped_lock lock(cuda_mem_map_mutex); - if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) { - const CUDAContextScope scope(this); - cuda_assert( - cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size())); - } -} - -void CUDADevice::generic_free(device_memory &mem) -{ - if (mem.device_pointer) { - CUDAContextScope scope(this); - thread_scoped_lock lock(cuda_mem_map_mutex); - const CUDAMem &cmem = cuda_mem_map[&mem]; - - /* If cmem.use_mapped_host is true, reference counting is used - * to safely free a mapped host memory. 
*/ - - if (cmem.use_mapped_host) { - assert(mem.shared_pointer); - if (mem.shared_pointer) { - assert(mem.shared_counter > 0); - if (--mem.shared_counter == 0) { - if (mem.host_pointer == mem.shared_pointer) { - mem.host_pointer = 0; - } - cuMemFreeHost(mem.shared_pointer); - mem.shared_pointer = 0; - } - } - map_host_used -= mem.device_size; - } - else { - /* Free device memory. */ - cuda_assert(cuMemFree(mem.device_pointer)); - } - - stats.mem_free(mem.device_size); - mem.device_pointer = 0; - mem.device_size = 0; - - cuda_mem_map.erase(cuda_mem_map.find(&mem)); - } -} - -void CUDADevice::mem_alloc(device_memory &mem) -{ - if (mem.type == MEM_PIXELS && !background) { - pixels_alloc(mem); - } - else if (mem.type == MEM_TEXTURE) { - assert(!"mem_alloc not supported for textures."); - } - else if (mem.type == MEM_GLOBAL) { - assert(!"mem_alloc not supported for global memory."); - } - else { - generic_alloc(mem); - } -} - -void CUDADevice::mem_copy_to(device_memory &mem) -{ - if (mem.type == MEM_PIXELS) { - assert(!"mem_copy_to not supported for pixels."); - } - else if (mem.type == MEM_GLOBAL) { - global_free(mem); - global_alloc(mem); - } - else if (mem.type == MEM_TEXTURE) { - tex_free((device_texture &)mem); - tex_alloc((device_texture &)mem); - } - else { - if (!mem.device_pointer) { - generic_alloc(mem); - } - generic_copy_to(mem); - } -} - -void CUDADevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem) -{ - if (mem.type == MEM_PIXELS && !background) { - pixels_copy_from(mem, y, w, h); - } - else if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) { - assert(!"mem_copy_from not supported for textures."); - } - else if (mem.host_pointer) { - const size_t size = elem * w * h; - const size_t offset = elem * y * w; - - if (mem.device_pointer) { - const CUDAContextScope scope(this); - cuda_assert(cuMemcpyDtoH( - (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size)); - } - else { - memset((char *)mem.host_pointer + 
offset, 0, size); - } - } -} - -void CUDADevice::mem_zero(device_memory &mem) -{ - if (!mem.device_pointer) { - mem_alloc(mem); - } - if (!mem.device_pointer) { - return; - } - - /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory - * regardless of mem.host_pointer and mem.shared_pointer. */ - thread_scoped_lock lock(cuda_mem_map_mutex); - if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) { - const CUDAContextScope scope(this); - cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size())); - } - else if (mem.host_pointer) { - memset(mem.host_pointer, 0, mem.memory_size()); - } -} - -void CUDADevice::mem_free(device_memory &mem) -{ - if (mem.type == MEM_PIXELS && !background) { - pixels_free(mem); - } - else if (mem.type == MEM_GLOBAL) { - global_free(mem); - } - else if (mem.type == MEM_TEXTURE) { - tex_free((device_texture &)mem); - } - else { - generic_free(mem); - } -} - -device_ptr CUDADevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) -{ - return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset)); -} - -void CUDADevice::const_copy_to(const char *name, void *host, size_t size) -{ - CUDAContextScope scope(this); - CUdeviceptr mem; - size_t bytes; - - cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name)); - // assert(bytes == size); - cuda_assert(cuMemcpyHtoD(mem, host, size)); -} - -void CUDADevice::global_alloc(device_memory &mem) -{ - if (mem.is_resident(this)) { - generic_alloc(mem); - generic_copy_to(mem); - } - - const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer)); -} - -void CUDADevice::global_free(device_memory &mem) -{ - if (mem.is_resident(this) && mem.device_pointer) { - generic_free(mem); - } -} - -void CUDADevice::tex_alloc(device_texture &mem) -{ - CUDAContextScope scope(this); - - /* General variables for both architectures */ - string bind_name = mem.name; - size_t dsize = 
datatype_size(mem.data_type); - size_t size = mem.memory_size(); - - CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP; - switch (mem.info.extension) { - case EXTENSION_REPEAT: - address_mode = CU_TR_ADDRESS_MODE_WRAP; - break; - case EXTENSION_EXTEND: - address_mode = CU_TR_ADDRESS_MODE_CLAMP; - break; - case EXTENSION_CLIP: - address_mode = CU_TR_ADDRESS_MODE_BORDER; - break; - default: - assert(0); - break; - } - - CUfilter_mode filter_mode; - if (mem.info.interpolation == INTERPOLATION_CLOSEST) { - filter_mode = CU_TR_FILTER_MODE_POINT; - } - else { - filter_mode = CU_TR_FILTER_MODE_LINEAR; - } - - /* Image Texture Storage */ - CUarray_format_enum format; - switch (mem.data_type) { - case TYPE_UCHAR: - format = CU_AD_FORMAT_UNSIGNED_INT8; - break; - case TYPE_UINT16: - format = CU_AD_FORMAT_UNSIGNED_INT16; - break; - case TYPE_UINT: - format = CU_AD_FORMAT_UNSIGNED_INT32; - break; - case TYPE_INT: - format = CU_AD_FORMAT_SIGNED_INT32; - break; - case TYPE_FLOAT: - format = CU_AD_FORMAT_FLOAT; - break; - case TYPE_HALF: - format = CU_AD_FORMAT_HALF; - break; - default: - assert(0); - return; - } - - CUDAMem *cmem = NULL; - CUarray array_3d = NULL; - size_t src_pitch = mem.data_width * dsize * mem.data_elements; - size_t dst_pitch = src_pitch; - - if (!mem.is_resident(this)) { - thread_scoped_lock lock(cuda_mem_map_mutex); - cmem = &cuda_mem_map[&mem]; - cmem->texobject = 0; - - if (mem.data_depth > 1) { - array_3d = (CUarray)mem.device_pointer; - cmem->array = array_3d; - } - else if (mem.data_height > 0) { - dst_pitch = align_up(src_pitch, pitch_alignment); - } - } - else if (mem.data_depth > 1) { - /* 3D texture using array, there is no API for linear memory. 
*/ - CUDA_ARRAY3D_DESCRIPTOR desc; - - desc.Width = mem.data_width; - desc.Height = mem.data_height; - desc.Depth = mem.data_depth; - desc.Format = format; - desc.NumChannels = mem.data_elements; - desc.Flags = 0; - - VLOG(1) << "Array 3D allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - - cuda_assert(cuArray3DCreate(&array_3d, &desc)); - - if (!array_3d) { - return; - } - - CUDA_MEMCPY3D param; - memset(¶m, 0, sizeof(param)); - param.dstMemoryType = CU_MEMORYTYPE_ARRAY; - param.dstArray = array_3d; - param.srcMemoryType = CU_MEMORYTYPE_HOST; - param.srcHost = mem.host_pointer; - param.srcPitch = src_pitch; - param.WidthInBytes = param.srcPitch; - param.Height = mem.data_height; - param.Depth = mem.data_depth; - - cuda_assert(cuMemcpy3D(¶m)); - - mem.device_pointer = (device_ptr)array_3d; - mem.device_size = size; - stats.mem_alloc(size); - - thread_scoped_lock lock(cuda_mem_map_mutex); - cmem = &cuda_mem_map[&mem]; - cmem->texobject = 0; - cmem->array = array_3d; - } - else if (mem.data_height > 0) { - /* 2D texture, using pitch aligned linear memory. */ - dst_pitch = align_up(src_pitch, pitch_alignment); - size_t dst_size = dst_pitch * mem.data_height; - - cmem = generic_alloc(mem, dst_size - mem.memory_size()); - if (!cmem) { - return; - } - - CUDA_MEMCPY2D param; - memset(¶m, 0, sizeof(param)); - param.dstMemoryType = CU_MEMORYTYPE_DEVICE; - param.dstDevice = mem.device_pointer; - param.dstPitch = dst_pitch; - param.srcMemoryType = CU_MEMORYTYPE_HOST; - param.srcHost = mem.host_pointer; - param.srcPitch = src_pitch; - param.WidthInBytes = param.srcPitch; - param.Height = mem.data_height; - - cuda_assert(cuMemcpy2DUnaligned(¶m)); - } - else { - /* 1D texture, using linear memory. 
*/ - cmem = generic_alloc(mem); - if (!cmem) { - return; - } - - cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size)); - } - - /* Resize once */ - const uint slot = mem.slot; - if (slot >= texture_info.size()) { - /* Allocate some slots in advance, to reduce amount - * of re-allocations. */ - texture_info.resize(slot + 128); - } - - /* Set Mapping and tag that we need to (re-)upload to device */ - texture_info[slot] = mem.info; - need_texture_info = true; - - if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT && - mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) { - /* Kepler+, bindless textures. */ - CUDA_RESOURCE_DESC resDesc; - memset(&resDesc, 0, sizeof(resDesc)); - - if (array_3d) { - resDesc.resType = CU_RESOURCE_TYPE_ARRAY; - resDesc.res.array.hArray = array_3d; - resDesc.flags = 0; - } - else if (mem.data_height > 0) { - resDesc.resType = CU_RESOURCE_TYPE_PITCH2D; - resDesc.res.pitch2D.devPtr = mem.device_pointer; - resDesc.res.pitch2D.format = format; - resDesc.res.pitch2D.numChannels = mem.data_elements; - resDesc.res.pitch2D.height = mem.data_height; - resDesc.res.pitch2D.width = mem.data_width; - resDesc.res.pitch2D.pitchInBytes = dst_pitch; - } - else { - resDesc.resType = CU_RESOURCE_TYPE_LINEAR; - resDesc.res.linear.devPtr = mem.device_pointer; - resDesc.res.linear.format = format; - resDesc.res.linear.numChannels = mem.data_elements; - resDesc.res.linear.sizeInBytes = mem.device_size; - } - - CUDA_TEXTURE_DESC texDesc; - memset(&texDesc, 0, sizeof(texDesc)); - texDesc.addressMode[0] = address_mode; - texDesc.addressMode[1] = address_mode; - texDesc.addressMode[2] = address_mode; - texDesc.filterMode = filter_mode; - texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES; - - thread_scoped_lock lock(cuda_mem_map_mutex); - cmem = &cuda_mem_map[&mem]; - - cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL)); - - texture_info[slot].data = (uint64_t)cmem->texobject; - } - else { - texture_info[slot].data = 
(uint64_t)mem.device_pointer; - } -} - -void CUDADevice::tex_free(device_texture &mem) -{ - if (mem.device_pointer) { - CUDAContextScope scope(this); - thread_scoped_lock lock(cuda_mem_map_mutex); - const CUDAMem &cmem = cuda_mem_map[&mem]; - - if (cmem.texobject) { - /* Free bindless texture. */ - cuTexObjectDestroy(cmem.texobject); - } - - if (!mem.is_resident(this)) { - /* Do not free memory here, since it was allocated on a different device. */ - cuda_mem_map.erase(cuda_mem_map.find(&mem)); - } - else if (cmem.array) { - /* Free array. */ - cuArrayDestroy(cmem.array); - stats.mem_free(mem.device_size); - mem.device_pointer = 0; - mem.device_size = 0; - - cuda_mem_map.erase(cuda_mem_map.find(&mem)); - } - else { - lock.unlock(); - generic_free(mem); - } - } -} - -# define CUDA_GET_BLOCKSIZE(func, w, h) \ - int threads_per_block; \ - cuda_assert( \ - cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ - int threads = (int)sqrt((float)threads_per_block); \ - int xblocks = ((w) + threads - 1) / threads; \ - int yblocks = ((h) + threads - 1) / threads; - -# define CUDA_LAUNCH_KERNEL(func, args) \ - cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0)); - -/* Similar as above, but for 1-dimensional blocks. 
*/ -# define CUDA_GET_BLOCKSIZE_1D(func, w, h) \ - int threads_per_block; \ - cuda_assert( \ - cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ - int xblocks = ((w) + threads_per_block - 1) / threads_per_block; \ - int yblocks = h; - -# define CUDA_LAUNCH_KERNEL_1D(func, args) \ - cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads_per_block, 1, 1, 0, 0, args, 0)); - -bool CUDADevice::denoising_non_local_means(device_ptr image_ptr, - device_ptr guide_ptr, - device_ptr variance_ptr, - device_ptr out_ptr, - DenoisingTask *task) -{ - if (have_error()) - return false; - - CUDAContextScope scope(this); - - int stride = task->buffer.stride; - int w = task->buffer.width; - int h = task->buffer.h; - int r = task->nlm_state.r; - int f = task->nlm_state.f; - float a = task->nlm_state.a; - float k_2 = task->nlm_state.k_2; - - int pass_stride = task->buffer.pass_stride; - int num_shifts = (2 * r + 1) * (2 * r + 1); - int channel_offset = task->nlm_state.is_color ? 
task->buffer.pass_stride : 0; - int frame_offset = 0; - - if (have_error()) - return false; - - CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer; - CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts; - CUdeviceptr weightAccum = difference + 2 * sizeof(float) * pass_stride * num_shifts; - CUdeviceptr scale_ptr = 0; - - cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float) * pass_stride)); - cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float) * pass_stride)); - - { - CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput; - cuda_assert(cuModuleGetFunction( - &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); - cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); - cuda_assert(cuModuleGetFunction( - &cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); - cuda_assert(cuModuleGetFunction( - &cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output")); - - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1)); - - CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w * h, num_shifts); - - void *calc_difference_args[] = {&guide_ptr, - &variance_ptr, - &scale_ptr, - &difference, - &w, - &h, - &stride, - &pass_stride, - &r, - &channel_offset, - &frame_offset, - &a, - &k_2}; - void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; - void *calc_weight_args[] = { - &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; - void *update_output_args[] = {&blurDifference, - &image_ptr, - &out_ptr, - &weightAccum, - &w, - &h, - &stride, - &pass_stride, - &channel_offset, - &r, - &f}; - - 
CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args); - } - - { - CUfunction cuNLMNormalize; - cuda_assert( - cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize")); - cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1)); - void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride}; - CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h); - CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args); - cuda_assert(cuCtxSynchronize()); - } - - return !have_error(); -} - -bool CUDADevice::denoising_construct_transform(DenoisingTask *task) -{ - if (have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterConstructTransform; - cuda_assert(cuModuleGetFunction( - &cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform")); - cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED)); - CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, task->storage.w, task->storage.h); - - void *args[] = {&task->buffer.mem.device_pointer, - &task->tile_info_mem.device_pointer, - &task->storage.transform.device_pointer, - &task->storage.rank.device_pointer, - &task->filter_area, - &task->rect, - &task->radius, - &task->pca_threshold, - &task->buffer.pass_stride, - &task->buffer.frame_stride, - &task->buffer.use_time}; - CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); -} - -bool CUDADevice::denoising_accumulate(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr scale_ptr, - int frame, - DenoisingTask *task) -{ - if (have_error()) - return false; - - CUDAContextScope scope(this); - - int r = task->radius; - int f = 4; - float a = 1.0f; - float k_2 = 
task->nlm_k_2; - - int w = task->reconstruction_state.source_w; - int h = task->reconstruction_state.source_h; - int stride = task->buffer.stride; - int frame_offset = frame * task->buffer.frame_stride; - int t = task->tile_info->frames[frame]; - - int pass_stride = task->buffer.pass_stride; - int num_shifts = (2 * r + 1) * (2 * r + 1); - - if (have_error()) - return false; - - CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer; - CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts; - - CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian; - cuda_assert(cuModuleGetFunction( - &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); - cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); - cuda_assert( - cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); - cuda_assert(cuModuleGetFunction( - &cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian")); - - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED)); - - CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, - task->reconstruction_state.source_w * task->reconstruction_state.source_h, - num_shifts); - - void *calc_difference_args[] = {&color_ptr, - &color_variance_ptr, - &scale_ptr, - &difference, - &w, - &h, - &stride, - &pass_stride, - &r, - &pass_stride, - &frame_offset, - &a, - &k_2}; - void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; - void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; - void *construct_gramian_args[] = {&t, - &blurDifference, - 
&task->buffer.mem.device_pointer, - &task->storage.transform.device_pointer, - &task->storage.rank.device_pointer, - &task->storage.XtWX.device_pointer, - &task->storage.XtWY.device_pointer, - &task->reconstruction_state.filter_window, - &w, - &h, - &stride, - &pass_stride, - &r, - &f, - &frame_offset, - &task->buffer.use_time}; - - CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); -} - -bool CUDADevice::denoising_solve(device_ptr output_ptr, DenoisingTask *task) -{ - CUfunction cuFinalize; - cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize")); - cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1)); - void *finalize_args[] = {&output_ptr, - &task->storage.rank.device_pointer, - &task->storage.XtWX.device_pointer, - &task->storage.XtWY.device_pointer, - &task->filter_area, - &task->reconstruction_state.buffer_params.x, - &task->render_buffer.samples}; - CUDA_GET_BLOCKSIZE( - cuFinalize, task->reconstruction_state.source_w, task->reconstruction_state.source_h); - CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); -} - -bool CUDADevice::denoising_combine_halves(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr mean_ptr, - device_ptr variance_ptr, - int r, - int4 rect, - DenoisingTask *task) -{ - if (have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterCombineHalves; - cuda_assert(cuModuleGetFunction( - &cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves")); - cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1)); - CUDA_GET_BLOCKSIZE( - cuFilterCombineHalves, task->rect.z 
- task->rect.x, task->rect.w - task->rect.y); - - void *args[] = {&mean_ptr, &variance_ptr, &a_ptr, &b_ptr, &rect, &r}; - CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); -} - -bool CUDADevice::denoising_divide_shadow(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr sample_variance_ptr, - device_ptr sv_variance_ptr, - device_ptr buffer_variance_ptr, - DenoisingTask *task) -{ - if (have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterDivideShadow; - cuda_assert(cuModuleGetFunction( - &cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow")); - cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1)); - CUDA_GET_BLOCKSIZE( - cuFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y); - - void *args[] = {&task->render_buffer.samples, - &task->tile_info_mem.device_pointer, - &a_ptr, - &b_ptr, - &sample_variance_ptr, - &sv_variance_ptr, - &buffer_variance_ptr, - &task->rect, - &task->render_buffer.pass_stride, - &task->render_buffer.offset}; - CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); -} - -bool CUDADevice::denoising_get_feature(int mean_offset, - int variance_offset, - device_ptr mean_ptr, - device_ptr variance_ptr, - float scale, - DenoisingTask *task) -{ - if (have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterGetFeature; - cuda_assert( - cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature")); - cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1)); - CUDA_GET_BLOCKSIZE(cuFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y); - - void *args[] = {&task->render_buffer.samples, - &task->tile_info_mem.device_pointer, - &mean_offset, - &variance_offset, - &mean_ptr, - &variance_ptr, - &scale, - &task->rect, - 
&task->render_buffer.pass_stride, - &task->render_buffer.offset}; - CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); -} - -bool CUDADevice::denoising_write_feature(int out_offset, - device_ptr from_ptr, - device_ptr buffer_ptr, - DenoisingTask *task) -{ - if (have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterWriteFeature; - cuda_assert(cuModuleGetFunction( - &cuFilterWriteFeature, cuFilterModule, "kernel_cuda_filter_write_feature")); - cuda_assert(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1)); - CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, task->filter_area.z, task->filter_area.w); - - void *args[] = {&task->render_buffer.samples, - &task->reconstruction_state.buffer_params, - &task->filter_area, - &from_ptr, - &buffer_ptr, - &out_offset, - &task->rect}; - CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); -} - -bool CUDADevice::denoising_detect_outliers(device_ptr image_ptr, - device_ptr variance_ptr, - device_ptr depth_ptr, - device_ptr output_ptr, - DenoisingTask *task) -{ - if (have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterDetectOutliers; - cuda_assert(cuModuleGetFunction( - &cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers")); - cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1)); - CUDA_GET_BLOCKSIZE( - cuFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y); - - void *args[] = { - &image_ptr, &variance_ptr, &depth_ptr, &output_ptr, &task->rect, &task->buffer.pass_stride}; - - CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); -} - -void CUDADevice::denoise(RenderTile &rtile, DenoisingTask &denoising) -{ - denoising.functions.construct_transform = function_bind( - &CUDADevice::denoising_construct_transform, 
this, &denoising); - denoising.functions.accumulate = function_bind( - &CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); - denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising); - denoising.functions.divide_shadow = function_bind( - &CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.non_local_means = function_bind( - &CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); - denoising.functions.combine_halves = function_bind( - &CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); - denoising.functions.get_feature = function_bind( - &CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.write_feature = function_bind( - &CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising); - denoising.functions.detect_outliers = function_bind( - &CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); - - denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); - denoising.render_buffer.samples = rtile.sample; - denoising.buffer.gpu_temporary_mem = true; - - denoising.run_denoising(rtile); -} - -void CUDADevice::adaptive_sampling_filter(uint filter_sample, - WorkTile *wtile, - CUdeviceptr d_wtile, - CUstream stream) -{ - const int num_threads_per_block = functions.adaptive_num_threads_per_block; - - /* These are a series of tiny kernels because there is no grid synchronization - * from within a kernel, so multiple kernel launches it is. 
*/ - uint total_work_size = wtile->h * wtile->w; - void *args2[] = {&d_wtile, &filter_sample, &total_work_size}; - uint num_blocks = divide_up(total_work_size, num_threads_per_block); - cuda_assert(cuLaunchKernel(functions.adaptive_stopping, - num_blocks, - 1, - 1, - num_threads_per_block, - 1, - 1, - 0, - stream, - args2, - 0)); - total_work_size = wtile->h; - num_blocks = divide_up(total_work_size, num_threads_per_block); - cuda_assert(cuLaunchKernel(functions.adaptive_filter_x, - num_blocks, - 1, - 1, - num_threads_per_block, - 1, - 1, - 0, - stream, - args2, - 0)); - total_work_size = wtile->w; - num_blocks = divide_up(total_work_size, num_threads_per_block); - cuda_assert(cuLaunchKernel(functions.adaptive_filter_y, - num_blocks, - 1, - 1, - num_threads_per_block, - 1, - 1, - 0, - stream, - args2, - 0)); -} - -void CUDADevice::adaptive_sampling_post(RenderTile &rtile, - WorkTile *wtile, - CUdeviceptr d_wtile, - CUstream stream) -{ - const int num_threads_per_block = functions.adaptive_num_threads_per_block; - uint total_work_size = wtile->h * wtile->w; - - void *args[] = {&d_wtile, &rtile.start_sample, &rtile.sample, &total_work_size}; - uint num_blocks = divide_up(total_work_size, num_threads_per_block); - cuda_assert(cuLaunchKernel(functions.adaptive_scale_samples, - num_blocks, - 1, - 1, - num_threads_per_block, - 1, - 1, - 0, - stream, - args, - 0)); -} - -void CUDADevice::render(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles) -{ - scoped_timer timer(&rtile.buffers->render_time); - - if (have_error()) - return; - - CUDAContextScope scope(this); - CUfunction cuRender; - - /* Get kernel function. 
*/ - if (rtile.task == RenderTile::BAKE) { - cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake")); - } - else if (task.integrator_branched) { - cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace")); - } - else { - cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace")); - } - - if (have_error()) { - return; - } - - cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1)); - - /* Allocate work tile. */ - work_tiles.alloc(1); - - WorkTile *wtile = work_tiles.data(); - wtile->x = rtile.x; - wtile->y = rtile.y; - wtile->w = rtile.w; - wtile->h = rtile.h; - wtile->offset = rtile.offset; - wtile->stride = rtile.stride; - wtile->buffer = (float *)(CUdeviceptr)rtile.buffer; - - /* Prepare work size. More step samples render faster, but for now we - * remain conservative for GPUs connected to a display to avoid driver - * timeouts and display freezing. */ - int min_blocks, num_threads_per_block; - cuda_assert( - cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0)); - if (!info.display_device) { - min_blocks *= 8; - } - - uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h); - - /* Render all samples. */ - int start_sample = rtile.start_sample; - int end_sample = rtile.start_sample + rtile.num_samples; - - for (int sample = start_sample; sample < end_sample;) { - /* Setup and copy work tile to device. 
*/ - wtile->start_sample = sample; - wtile->num_samples = step_samples; - if (task.adaptive_sampling.use) { - wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples); - } - wtile->num_samples = min(wtile->num_samples, end_sample - sample); - work_tiles.copy_to_device(); - - CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer; - uint total_work_size = wtile->w * wtile->h * wtile->num_samples; - uint num_blocks = divide_up(total_work_size, num_threads_per_block); - - /* Launch kernel. */ - void *args[] = {&d_work_tiles, &total_work_size}; - - cuda_assert( - cuLaunchKernel(cuRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0)); - - /* Run the adaptive sampling kernels at selected samples aligned to step samples. */ - uint filter_sample = sample + wtile->num_samples - 1; - if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) { - adaptive_sampling_filter(filter_sample, wtile, d_work_tiles); - } - - cuda_assert(cuCtxSynchronize()); - - /* Update progress. */ - sample += wtile->num_samples; - rtile.sample = sample; - task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples); - - if (task.get_cancel()) { - if (task.need_finish_queue == false) - break; - } - } - - /* Finalize adaptive sampling. */ - if (task.adaptive_sampling.use) { - CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer; - adaptive_sampling_post(rtile, wtile, d_work_tiles); - cuda_assert(cuCtxSynchronize()); - task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples); - } -} - -void CUDADevice::film_convert(DeviceTask &task, - device_ptr buffer, - device_ptr rgba_byte, - device_ptr rgba_half) -{ - if (have_error()) - return; - - CUDAContextScope scope(this); - - CUfunction cuFilmConvert; - CUdeviceptr d_rgba = map_pixels((rgba_byte) ? 
rgba_byte : rgba_half); - CUdeviceptr d_buffer = (CUdeviceptr)buffer; - - /* get kernel function */ - if (rgba_half) { - cuda_assert( - cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float")); - } - else { - cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte")); - } - - float sample_scale = 1.0f / (task.sample + 1); - - /* pass in parameters */ - void *args[] = {&d_rgba, - &d_buffer, - &sample_scale, - &task.x, - &task.y, - &task.w, - &task.h, - &task.offset, - &task.stride}; - - /* launch kernel */ - int threads_per_block; - cuda_assert(cuFuncGetAttribute( - &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert)); - - int xthreads = (int)sqrt(threads_per_block); - int ythreads = (int)sqrt(threads_per_block); - int xblocks = (task.w + xthreads - 1) / xthreads; - int yblocks = (task.h + ythreads - 1) / ythreads; - - cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1)); - - cuda_assert(cuLaunchKernel(cuFilmConvert, - xblocks, - yblocks, - 1, /* blocks */ - xthreads, - ythreads, - 1, /* threads */ - 0, - 0, - args, - 0)); - - unmap_pixels((rgba_byte) ? 
rgba_byte : rgba_half); - - cuda_assert(cuCtxSynchronize()); -} - -void CUDADevice::shader(DeviceTask &task) -{ - if (have_error()) - return; - - CUDAContextScope scope(this); - - CUfunction cuShader; - CUdeviceptr d_input = (CUdeviceptr)task.shader_input; - CUdeviceptr d_output = (CUdeviceptr)task.shader_output; - - /* get kernel function */ - if (task.shader_eval_type == SHADER_EVAL_DISPLACE) { - cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace")); - } - else { - cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background")); - } - - /* do tasks in smaller chunks, so we can cancel it */ - const int shader_chunk_size = 65536; - const int start = task.shader_x; - const int end = task.shader_x + task.shader_w; - int offset = task.offset; - - bool canceled = false; - for (int sample = 0; sample < task.num_samples && !canceled; sample++) { - for (int shader_x = start; shader_x < end; shader_x += shader_chunk_size) { - int shader_w = min(shader_chunk_size, end - shader_x); - - /* pass in parameters */ - void *args[8]; - int arg = 0; - args[arg++] = &d_input; - args[arg++] = &d_output; - args[arg++] = &task.shader_eval_type; - if (task.shader_eval_type >= SHADER_EVAL_BAKE) { - args[arg++] = &task.shader_filter; - } - args[arg++] = &shader_x; - args[arg++] = &shader_w; - args[arg++] = &offset; - args[arg++] = &sample; - - /* launch kernel */ - int threads_per_block; - cuda_assert(cuFuncGetAttribute( - &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader)); - - int xblocks = (shader_w + threads_per_block - 1) / threads_per_block; - - cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuLaunchKernel(cuShader, - xblocks, - 1, - 1, /* blocks */ - threads_per_block, - 1, - 1, /* threads */ - 0, - 0, - args, - 0)); - - cuda_assert(cuCtxSynchronize()); - - if (task.get_cancel()) { - canceled = true; - break; - } - } - - task.update_progress(NULL); - } -} - -CUdeviceptr 
CUDADevice::map_pixels(device_ptr mem) -{ - if (!background) { - PixelMem pmem = pixel_mem_map[mem]; - CUdeviceptr buffer; - - size_t bytes; - cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0)); - cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource)); - - return buffer; - } - - return (CUdeviceptr)mem; -} - -void CUDADevice::unmap_pixels(device_ptr mem) -{ - if (!background) { - PixelMem pmem = pixel_mem_map[mem]; - - cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0)); - } -} - -void CUDADevice::pixels_alloc(device_memory &mem) -{ - PixelMem pmem; - - pmem.w = mem.data_width; - pmem.h = mem.data_height; - - CUDAContextScope scope(this); - - glGenBuffers(1, &pmem.cuPBO); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); - if (mem.data_type == TYPE_HALF) - glBufferData( - GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(GLhalf) * 4, NULL, GL_DYNAMIC_DRAW); - else - glBufferData( - GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(uint8_t) * 4, NULL, GL_DYNAMIC_DRAW); - - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - - glActiveTexture(GL_TEXTURE0); - glGenTextures(1, &pmem.cuTexId); - glBindTexture(GL_TEXTURE_2D, pmem.cuTexId); - if (mem.data_type == TYPE_HALF) - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL); - else - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); - glBindTexture(GL_TEXTURE_2D, 0); - - CUresult result = cuGraphicsGLRegisterBuffer( - &pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE); - - if (result == CUDA_SUCCESS) { - mem.device_pointer = pmem.cuTexId; - pixel_mem_map[mem.device_pointer] = pmem; - - mem.device_size = mem.memory_size(); - stats.mem_alloc(mem.device_size); - - return; - } - else { - /* failed to register buffer, fallback to no interop 
*/ - glDeleteBuffers(1, &pmem.cuPBO); - glDeleteTextures(1, &pmem.cuTexId); - - background = true; - } -} - -void CUDADevice::pixels_copy_from(device_memory &mem, int y, int w, int h) -{ - PixelMem pmem = pixel_mem_map[mem.device_pointer]; - - CUDAContextScope scope(this); - - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); - uchar *pixels = (uchar *)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY); - size_t offset = sizeof(uchar) * 4 * y * w; - memcpy((uchar *)mem.host_pointer + offset, pixels + offset, sizeof(uchar) * 4 * w * h); - glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); -} - -void CUDADevice::pixels_free(device_memory &mem) -{ - if (mem.device_pointer) { - PixelMem pmem = pixel_mem_map[mem.device_pointer]; - - CUDAContextScope scope(this); - - cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource)); - glDeleteBuffers(1, &pmem.cuPBO); - glDeleteTextures(1, &pmem.cuTexId); - - pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer)); - mem.device_pointer = 0; - - stats.mem_free(mem.device_size); - mem.device_size = 0; - } -} - -void CUDADevice::draw_pixels(device_memory &mem, - int y, - int w, - int h, - int width, - int height, - int dx, - int dy, - int dw, - int dh, - bool transparent, - const DeviceDrawParams &draw_params) -{ - assert(mem.type == MEM_PIXELS); - - if (!background) { - const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL); - PixelMem pmem = pixel_mem_map[mem.device_pointer]; - float *vpointer; - - CUDAContextScope scope(this); - - /* for multi devices, this assumes the inefficient method that we allocate - * all pixels on the device even though we only render to a subset */ - size_t offset = 4 * y * w; - - if (mem.data_type == TYPE_HALF) - offset *= sizeof(GLhalf); - else - offset *= sizeof(uint8_t); - - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); - glActiveTexture(GL_TEXTURE0); - glBindTexture(GL_TEXTURE_2D, pmem.cuTexId); - if (mem.data_type == TYPE_HALF) 
{ - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_HALF_FLOAT, (void *)offset); - } - else { - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void *)offset); - } - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - - if (transparent) { - glEnable(GL_BLEND); - glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA); - } - - GLint shader_program; - if (use_fallback_shader) { - if (!bind_fallback_display_space_shader(dw, dh)) { - return; - } - shader_program = fallback_shader_program; - } - else { - draw_params.bind_display_space_shader_cb(); - glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program); - } - - if (!vertex_buffer) { - glGenBuffers(1, &vertex_buffer); - } - - glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer); - /* invalidate old contents - - * avoids stalling if buffer is still waiting in queue to be rendered */ - glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW); - - vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY); - - if (vpointer) { - /* texture coordinate - vertex pair */ - vpointer[0] = 0.0f; - vpointer[1] = 0.0f; - vpointer[2] = dx; - vpointer[3] = dy; - - vpointer[4] = (float)w / (float)pmem.w; - vpointer[5] = 0.0f; - vpointer[6] = (float)width + dx; - vpointer[7] = dy; - - vpointer[8] = (float)w / (float)pmem.w; - vpointer[9] = (float)h / (float)pmem.h; - vpointer[10] = (float)width + dx; - vpointer[11] = (float)height + dy; - - vpointer[12] = 0.0f; - vpointer[13] = (float)h / (float)pmem.h; - vpointer[14] = dx; - vpointer[15] = (float)height + dy; - - glUnmapBuffer(GL_ARRAY_BUFFER); - } - - GLuint vertex_array_object; - GLuint position_attribute, texcoord_attribute; - - glGenVertexArrays(1, &vertex_array_object); - glBindVertexArray(vertex_array_object); - - texcoord_attribute = glGetAttribLocation(shader_program, "texCoord"); - position_attribute = glGetAttribLocation(shader_program, "pos"); - - glEnableVertexAttribArray(texcoord_attribute); - glEnableVertexAttribArray(position_attribute); - - 
glVertexAttribPointer( - texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0); - glVertexAttribPointer(position_attribute, - 2, - GL_FLOAT, - GL_FALSE, - 4 * sizeof(float), - (const GLvoid *)(sizeof(float) * 2)); - - glDrawArrays(GL_TRIANGLE_FAN, 0, 4); - - if (use_fallback_shader) { - glUseProgram(0); - } - else { - draw_params.unbind_display_space_shader_cb(); - } - - if (transparent) { - glDisable(GL_BLEND); - } - - glBindTexture(GL_TEXTURE_2D, 0); - - return; - } - - Device::draw_pixels(mem, y, w, h, width, height, dx, dy, dw, dh, transparent, draw_params); -} - -void CUDADevice::thread_run(DeviceTask &task) -{ - CUDAContextScope scope(this); - - if (task.type == DeviceTask::RENDER) { - DeviceRequestedFeatures requested_features; - if (use_split_kernel()) { - if (split_kernel == NULL) { - split_kernel = new CUDASplitKernel(this); - split_kernel->load_kernels(requested_features); - } - } - - device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY); - - /* keep rendering tiles until done */ - RenderTile tile; - DenoisingTask denoising(this, task); - - while (task.acquire_tile(this, tile, task.tile_types)) { - if (tile.task == RenderTile::PATH_TRACE) { - if (use_split_kernel()) { - device_only_memory<uchar> void_buffer(this, "void_buffer"); - split_kernel->path_trace(task, tile, void_buffer, void_buffer); - } - else { - render(task, tile, work_tiles); - } - } - else if (tile.task == RenderTile::BAKE) { - render(task, tile, work_tiles); - } - else if (tile.task == RenderTile::DENOISE) { - tile.sample = tile.start_sample + tile.num_samples; - - denoise(tile, denoising); - - task.update_progress(&tile, tile.w * tile.h); - } - - task.release_tile(tile); - - if (task.get_cancel()) { - if (task.need_finish_queue == false) - break; - } - } - - work_tiles.free(); - } - else if (task.type == DeviceTask::SHADER) { - shader(task); - - cuda_assert(cuCtxSynchronize()); - } - else if (task.type == DeviceTask::DENOISE_BUFFER) { - 
RenderTile tile; - tile.x = task.x; - tile.y = task.y; - tile.w = task.w; - tile.h = task.h; - tile.buffer = task.buffer; - tile.sample = task.sample + task.num_samples; - tile.num_samples = task.num_samples; - tile.start_sample = task.sample; - tile.offset = task.offset; - tile.stride = task.stride; - tile.buffers = task.buffers; - - DenoisingTask denoising(this, task); - denoise(tile, denoising); - task.update_progress(&tile, tile.w * tile.h); - } -} - -void CUDADevice::task_add(DeviceTask &task) -{ - CUDAContextScope scope(this); - - /* Load texture info. */ - load_texture_info(); - - /* Synchronize all memory copies before executing task. */ - cuda_assert(cuCtxSynchronize()); - - if (task.type == DeviceTask::FILM_CONVERT) { - /* must be done in main thread due to opengl access */ - film_convert(task, task.buffer, task.rgba_byte, task.rgba_half); - } - else { - task_pool.push([=] { - DeviceTask task_copy = task; - thread_run(task_copy); - }); - } -} - -void CUDADevice::task_wait() -{ - task_pool.wait(); -} - -void CUDADevice::task_cancel() -{ - task_pool.cancel(); -} - -/* redefine the cuda_assert macro so it can be used outside of the CUDADevice class - * now that the definition of that class is complete - */ -# undef cuda_assert -# define cuda_assert(stmt) \ - { \ - CUresult result = stmt; \ - if (result != CUDA_SUCCESS) { \ - const char *name = cuewErrorString(result); \ - device->set_error( \ - string_printf("%s in %s (device_cuda_impl.cpp:%d)", name, #stmt, __LINE__)); \ - } \ - } \ - (void)0 - -/* CUDA context scope. 
*/ - -CUDAContextScope::CUDAContextScope(CUDADevice *device) : device(device) -{ - cuda_assert(cuCtxPushCurrent(device->cuContext)); -} - -CUDAContextScope::~CUDAContextScope() -{ - cuda_assert(cuCtxPopCurrent(NULL)); -} - -/* split kernel */ - -class CUDASplitKernelFunction : public SplitKernelFunction { - CUDADevice *device; - CUfunction func; - - public: - CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func) - { - } - - /* enqueue the kernel, returns false if there is an error */ - bool enqueue(const KernelDimensions &dim, device_memory & /*kg*/, device_memory & /*data*/) - { - return enqueue(dim, NULL); - } - - /* enqueue the kernel, returns false if there is an error */ - bool enqueue(const KernelDimensions &dim, void *args[]) - { - if (device->have_error()) - return false; - - CUDAContextScope scope(device); - - /* we ignore dim.local_size for now, as this is faster */ - int threads_per_block; - cuda_assert( - cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); - - int xblocks = (dim.global_size[0] * dim.global_size[1] + threads_per_block - 1) / - threads_per_block; - - cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1)); - - cuda_assert(cuLaunchKernel(func, - xblocks, - 1, - 1, /* blocks */ - threads_per_block, - 1, - 1, /* threads */ - 0, - 0, - args, - 0)); - - return !device->have_error(); - } -}; - -CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device) -{ -} - -uint64_t CUDASplitKernel::state_buffer_size(device_memory & /*kg*/, - device_memory & /*data*/, - size_t num_threads) -{ - CUDAContextScope scope(device); - - device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE); - size_buffer.alloc(1); - size_buffer.zero_to_device(); - - uint threads = num_threads; - CUdeviceptr d_size = (CUdeviceptr)size_buffer.device_pointer; - - struct args_t { - uint *num_threads; - CUdeviceptr *size; - }; - - args_t args = 
{&threads, &d_size}; - - CUfunction state_buffer_size; - cuda_assert( - cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size")); - - cuda_assert(cuLaunchKernel(state_buffer_size, 1, 1, 1, 1, 1, 1, 0, 0, (void **)&args, 0)); - - size_buffer.copy_from_device(0, 1, 1); - size_t size = size_buffer[0]; - size_buffer.free(); - - return size; -} - -bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim, - RenderTile &rtile, - int num_global_elements, - device_memory & /*kernel_globals*/, - device_memory & /*kernel_data*/, - device_memory &split_data, - device_memory &ray_state, - device_memory &queue_index, - device_memory &use_queues_flag, - device_memory &work_pool_wgs) -{ - CUDAContextScope scope(device); - - CUdeviceptr d_split_data = (CUdeviceptr)split_data.device_pointer; - CUdeviceptr d_ray_state = (CUdeviceptr)ray_state.device_pointer; - CUdeviceptr d_queue_index = (CUdeviceptr)queue_index.device_pointer; - CUdeviceptr d_use_queues_flag = (CUdeviceptr)use_queues_flag.device_pointer; - CUdeviceptr d_work_pool_wgs = (CUdeviceptr)work_pool_wgs.device_pointer; - - CUdeviceptr d_buffer = (CUdeviceptr)rtile.buffer; - - int end_sample = rtile.start_sample + rtile.num_samples; - int queue_size = dim.global_size[0] * dim.global_size[1]; - - struct args_t { - CUdeviceptr *split_data_buffer; - int *num_elements; - CUdeviceptr *ray_state; - int *start_sample; - int *end_sample; - int *sx; - int *sy; - int *sw; - int *sh; - int *offset; - int *stride; - CUdeviceptr *queue_index; - int *queuesize; - CUdeviceptr *use_queues_flag; - CUdeviceptr *work_pool_wgs; - int *num_samples; - CUdeviceptr *buffer; - }; - - args_t args = {&d_split_data, - &num_global_elements, - &d_ray_state, - &rtile.start_sample, - &end_sample, - &rtile.x, - &rtile.y, - &rtile.w, - &rtile.h, - &rtile.offset, - &rtile.stride, - &d_queue_index, - &queue_size, - &d_use_queues_flag, - &d_work_pool_wgs, - &rtile.num_samples, - &d_buffer}; - - 
CUfunction data_init; - cuda_assert( - cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init")); - if (device->have_error()) { - return false; - } - - CUDASplitKernelFunction(device, data_init).enqueue(dim, (void **)&args); - - return !device->have_error(); -} - -SplitKernelFunction *CUDASplitKernel::get_split_kernel_function(const string &kernel_name, - const DeviceRequestedFeatures &) -{ - const CUDAContextScope scope(device); - - CUfunction func; - const CUresult result = cuModuleGetFunction( - &func, device->cuModule, (string("kernel_cuda_") + kernel_name).data()); - if (result != CUDA_SUCCESS) { - device->set_error(string_printf("Could not find kernel \"kernel_cuda_%s\" in module (%s)", - kernel_name.data(), - cuewErrorString(result))); - return NULL; - } - - return new CUDASplitKernelFunction(device, func); -} - -int2 CUDASplitKernel::split_kernel_local_size() -{ - return make_int2(32, 1); -} - -int2 CUDASplitKernel::split_kernel_global_size(device_memory &kg, - device_memory &data, - DeviceTask & /*task*/) -{ - CUDAContextScope scope(device); - size_t free; - size_t total; - - cuda_assert(cuMemGetInfo(&free, &total)); - - VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(free) - << " bytes. 
(" << string_human_readable_size(free) << ")."; - - size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2); - size_t side = round_down((int)sqrt(num_elements), 32); - int2 global_size = make_int2(side, round_down(num_elements / side, 16)); - VLOG(1) << "Global size: " << global_size << "."; - return global_size; -} - -CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/device/cuda/device_impl.cpp b/intern/cycles/device/cuda/device_impl.cpp new file mode 100644 index 00000000000..37fab8f8293 --- /dev/null +++ b/intern/cycles/device/cuda/device_impl.cpp @@ -0,0 +1,1370 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef WITH_CUDA + +# include <climits> +# include <limits.h> +# include <stdio.h> +# include <stdlib.h> +# include <string.h> + +# include "device/cuda/device_impl.h" + +# include "render/buffers.h" + +# include "util/util_debug.h" +# include "util/util_foreach.h" +# include "util/util_logging.h" +# include "util/util_map.h" +# include "util/util_md5.h" +# include "util/util_opengl.h" +# include "util/util_path.h" +# include "util/util_string.h" +# include "util/util_system.h" +# include "util/util_time.h" +# include "util/util_types.h" +# include "util/util_windows.h" + +CCL_NAMESPACE_BEGIN + +class CUDADevice; + +bool CUDADevice::have_precompiled_kernels() +{ + string cubins_path = path_get("lib"); + return path_exists(cubins_path); +} + +bool CUDADevice::show_samples() const +{ + /* The CUDADevice only processes one tile at a time, so showing samples is fine. */ + return true; +} + +BVHLayoutMask CUDADevice::get_bvh_layout_mask() const +{ + return BVH_LAYOUT_BVH2; +} + +void CUDADevice::set_error(const string &error) +{ + Device::set_error(error); + + if (first_error) { + fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n"); + fprintf(stderr, + "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n"); + first_error = false; + } +} + +CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler) + : Device(info, stats, profiler), texture_info(this, "__texture_info", MEM_GLOBAL) +{ + first_error = true; + + cuDevId = info.num; + cuDevice = 0; + cuContext = 0; + + cuModule = 0; + + need_texture_info = false; + + device_texture_headroom = 0; + device_working_headroom = 0; + move_texture_to_host = false; + map_host_limit = 0; + map_host_used = 0; + can_map_host = 0; + pitch_alignment = 0; + + /* Initialize CUDA. 
*/ + CUresult result = cuInit(0); + if (result != CUDA_SUCCESS) { + set_error(string_printf("Failed to initialize CUDA runtime (%s)", cuewErrorString(result))); + return; + } + + /* Setup device and context. */ + result = cuDeviceGet(&cuDevice, cuDevId); + if (result != CUDA_SUCCESS) { + set_error(string_printf("Failed to get CUDA device handle from ordinal (%s)", + cuewErrorString(result))); + return; + } + + /* CU_CTX_MAP_HOST for mapping host memory when out of device memory. + * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render, + * so we can predict which memory to map to host. */ + cuda_assert( + cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice)); + + cuda_assert(cuDeviceGetAttribute( + &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice)); + + unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX; + if (can_map_host) { + ctx_flags |= CU_CTX_MAP_HOST; + init_host_memory(); + } + + /* Create context. */ + result = cuCtxCreate(&cuContext, ctx_flags, cuDevice); + + if (result != CUDA_SUCCESS) { + set_error(string_printf("Failed to create CUDA context (%s)", cuewErrorString(result))); + return; + } + + int major, minor; + cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); + cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); + cuDevArchitecture = major * 100 + minor * 10; + + /* Pop context set by cuCtxCreate. 
*/ + cuCtxPopCurrent(NULL); +} + +CUDADevice::~CUDADevice() +{ + texture_info.free(); + + cuda_assert(cuCtxDestroy(cuContext)); +} + +bool CUDADevice::support_device(const uint /*kernel_features*/) +{ + int major, minor; + cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); + cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); + + /* We only support sm_30 and above */ + if (major < 3) { + set_error(string_printf( + "CUDA backend requires compute capability 3.0 or up, but found %d.%d.", major, minor)); + return false; + } + + return true; +} + +bool CUDADevice::check_peer_access(Device *peer_device) +{ + if (peer_device == this) { + return false; + } + if (peer_device->info.type != DEVICE_CUDA && peer_device->info.type != DEVICE_OPTIX) { + return false; + } + + CUDADevice *const peer_device_cuda = static_cast<CUDADevice *>(peer_device); + + int can_access = 0; + cuda_assert(cuDeviceCanAccessPeer(&can_access, cuDevice, peer_device_cuda->cuDevice)); + if (can_access == 0) { + return false; + } + + // Ensure array access over the link is possible as well (for 3D textures) + cuda_assert(cuDeviceGetP2PAttribute(&can_access, + CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED, + cuDevice, + peer_device_cuda->cuDevice)); + if (can_access == 0) { + return false; + } + + // Enable peer access in both directions + { + const CUDAContextScope scope(this); + CUresult result = cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0); + if (result != CUDA_SUCCESS) { + set_error(string_printf("Failed to enable peer access on CUDA context (%s)", + cuewErrorString(result))); + return false; + } + } + { + const CUDAContextScope scope(peer_device_cuda); + CUresult result = cuCtxEnablePeerAccess(cuContext, 0); + if (result != CUDA_SUCCESS) { + set_error(string_printf("Failed to enable peer access on CUDA context (%s)", + cuewErrorString(result))); + return false; + } + } + + return true; +} + +bool 
CUDADevice::use_adaptive_compilation() +{ + return DebugFlags().cuda.adaptive_compile; +} + +/* Common NVCC flags which stays the same regardless of shading model, + * kernel sources md5 and only depends on compiler or compilation settings. + */ +string CUDADevice::compile_kernel_get_common_cflags(const uint kernel_features) +{ + const int machine = system_cpu_bits(); + const string source_path = path_get("source"); + const string include_path = source_path; + string cflags = string_printf( + "-m%d " + "--ptxas-options=\"-v\" " + "--use_fast_math " + "-DNVCC " + "-I\"%s\"", + machine, + include_path.c_str()); + if (use_adaptive_compilation()) { + cflags += " -D__KERNEL_FEATURES__=" + to_string(kernel_features); + } + const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS"); + if (extra_cflags) { + cflags += string(" ") + string(extra_cflags); + } + +# ifdef WITH_NANOVDB + cflags += " -DWITH_NANOVDB"; +# endif + + return cflags; +} + +string CUDADevice::compile_kernel(const uint kernel_features, + const char *name, + const char *base, + bool force_ptx) +{ + /* Compute kernel name. */ + int major, minor; + cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); + cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); + + /* Attempt to use kernel provided with Blender. */ + if (!use_adaptive_compilation()) { + if (!force_ptx) { + const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor)); + VLOG(1) << "Testing for pre-compiled kernel " << cubin << "."; + if (path_exists(cubin)) { + VLOG(1) << "Using precompiled kernel."; + return cubin; + } + } + + /* The driver can JIT-compile PTX generated for older generations, so find the closest one. 
*/ + int ptx_major = major, ptx_minor = minor; + while (ptx_major >= 3) { + const string ptx = path_get( + string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor)); + VLOG(1) << "Testing for pre-compiled kernel " << ptx << "."; + if (path_exists(ptx)) { + VLOG(1) << "Using precompiled kernel."; + return ptx; + } + + if (ptx_minor > 0) { + ptx_minor--; + } + else { + ptx_major--; + ptx_minor = 9; + } + } + } + + /* Try to use locally compiled kernel. */ + string source_path = path_get("source"); + const string source_md5 = path_files_md5_hash(source_path); + + /* We include cflags into md5 so changing cuda toolkit or changing other + * compiler command line arguments makes sure cubin gets re-built. + */ + string common_cflags = compile_kernel_get_common_cflags(kernel_features); + const string kernel_md5 = util_md5_string(source_md5 + common_cflags); + + const char *const kernel_ext = force_ptx ? "ptx" : "cubin"; + const char *const kernel_arch = force_ptx ? "compute" : "sm"; + const string cubin_file = string_printf( + "cycles_%s_%s_%d%d_%s.%s", name, kernel_arch, major, minor, kernel_md5.c_str(), kernel_ext); + const string cubin = path_cache_get(path_join("kernels", cubin_file)); + VLOG(1) << "Testing for locally compiled kernel " << cubin << "."; + if (path_exists(cubin)) { + VLOG(1) << "Using locally compiled kernel."; + return cubin; + } + +# ifdef _WIN32 + if (!use_adaptive_compilation() && have_precompiled_kernels()) { + if (major < 3) { + set_error( + string_printf("CUDA backend requires compute capability 3.0 or up, but found %d.%d. " + "Your GPU is not supported.", + major, + minor)); + } + else { + set_error( + string_printf("CUDA binary kernel for this graphics card compute " + "capability (%d.%d) not found.", + major, + minor)); + } + return string(); + } +# endif + + /* Compile. */ + const char *const nvcc = cuewCompilerPath(); + if (nvcc == NULL) { + set_error( + "CUDA nvcc compiler not found. 
" + "Install CUDA toolkit in default location."); + return string(); + } + + const int nvcc_cuda_version = cuewCompilerVersion(); + VLOG(1) << "Found nvcc " << nvcc << ", CUDA version " << nvcc_cuda_version << "."; + if (nvcc_cuda_version < 101) { + printf( + "Unsupported CUDA version %d.%d detected, " + "you need CUDA 10.1 or newer.\n", + nvcc_cuda_version / 10, + nvcc_cuda_version % 10); + return string(); + } + else if (!(nvcc_cuda_version == 101 || nvcc_cuda_version == 102 || nvcc_cuda_version == 111 || + nvcc_cuda_version == 112 || nvcc_cuda_version == 113 || nvcc_cuda_version == 114)) { + printf( + "CUDA version %d.%d detected, build may succeed but only " + "CUDA 10.1 to 11.4 are officially supported.\n", + nvcc_cuda_version / 10, + nvcc_cuda_version % 10); + } + + double starttime = time_dt(); + + path_create_directories(cubin); + + source_path = path_join(path_join(source_path, "kernel"), + path_join("device", path_join(base, string_printf("%s.cu", name)))); + + string command = string_printf( + "\"%s\" " + "-arch=%s_%d%d " + "--%s \"%s\" " + "-o \"%s\" " + "%s", + nvcc, + kernel_arch, + major, + minor, + kernel_ext, + source_path.c_str(), + cubin.c_str(), + common_cflags.c_str()); + + printf("Compiling CUDA kernel ...\n%s\n", command.c_str()); + +# ifdef _WIN32 + command = "call " + command; +# endif + if (system(command.c_str()) != 0) { + set_error( + "Failed to execute compilation command, " + "see console for details."); + return string(); + } + + /* Verify if compilation succeeded */ + if (!path_exists(cubin)) { + set_error( + "CUDA kernel compilation failed, " + "see console for details."); + return string(); + } + + printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime); + + return cubin; +} + +bool CUDADevice::load_kernels(const uint kernel_features) +{ + /* TODO(sergey): Support kernels re-load for CUDA devices. + * + * Currently re-loading kernel will invalidate memory pointers, + * causing problems in cuCtxSynchronize. 
+ */ + if (cuModule) { + VLOG(1) << "Skipping kernel reload, not currently supported."; + return true; + } + + /* check if cuda init succeeded */ + if (cuContext == 0) + return false; + + /* check if GPU is supported */ + if (!support_device(kernel_features)) + return false; + + /* get kernel */ + const char *kernel_name = "kernel"; + string cubin = compile_kernel(kernel_features, kernel_name); + if (cubin.empty()) + return false; + + /* open module */ + CUDAContextScope scope(this); + + string cubin_data; + CUresult result; + + if (path_read_text(cubin, cubin_data)) + result = cuModuleLoadData(&cuModule, cubin_data.c_str()); + else + result = CUDA_ERROR_FILE_NOT_FOUND; + + if (result != CUDA_SUCCESS) + set_error(string_printf( + "Failed to load CUDA kernel from '%s' (%s)", cubin.c_str(), cuewErrorString(result))); + + if (result == CUDA_SUCCESS) { + kernels.load(this); + reserve_local_memory(kernel_features); + } + + return (result == CUDA_SUCCESS); +} + +void CUDADevice::reserve_local_memory(const uint /* kernel_features */) +{ + /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory + * needed for kernel launches, so that we can reliably figure out when + * to allocate scene data in mapped host memory. */ + size_t total = 0, free_before = 0, free_after = 0; + + { + CUDAContextScope scope(this); + cuMemGetInfo(&free_before, &total); + } + + { + /* Use the biggest kernel for estimation. */ + const DeviceKernel test_kernel = DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE; + + /* Launch kernel, using just 1 block appears sufficient to reserve memory for all + * multiprocessors. It would be good to do this in parallel for the multi GPU case + * still to make it faster. 
*/ + CUDADeviceQueue queue(this); + + void *d_path_index = nullptr; + void *d_render_buffer = nullptr; + int d_work_size = 0; + void *args[] = {&d_path_index, &d_render_buffer, &d_work_size}; + + queue.init_execution(); + queue.enqueue(test_kernel, 1, args); + queue.synchronize(); + } + + { + CUDAContextScope scope(this); + cuMemGetInfo(&free_after, &total); + } + + VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after) + << " bytes. (" << string_human_readable_size(free_before - free_after) << ")"; + +# if 0 + /* For testing mapped host memory, fill up device memory. */ + const size_t keep_mb = 1024; + + while (free_after > keep_mb * 1024 * 1024LL) { + CUdeviceptr tmp; + cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL)); + cuMemGetInfo(&free_after, &total); + } +# endif +} + +void CUDADevice::init_host_memory() +{ + /* Limit amount of host mapped memory, because allocating too much can + * cause system instability. Leave at least half or 4 GB of system + * memory free, whichever is smaller. */ + size_t default_limit = 4 * 1024 * 1024 * 1024LL; + size_t system_ram = system_physical_ram(); + + if (system_ram > 0) { + if (system_ram / 2 > default_limit) { + map_host_limit = system_ram - default_limit; + } + else { + map_host_limit = system_ram / 2; + } + } + else { + VLOG(1) << "Mapped host memory disabled, failed to get system RAM"; + map_host_limit = 0; + } + + /* Amount of device memory to keep is free after texture memory + * and working memory allocations respectively. We set the working + * memory limit headroom lower so that some space is left after all + * texture memory allocations. */ + device_working_headroom = 32 * 1024 * 1024LL; // 32MB + device_texture_headroom = 128 * 1024 * 1024LL; // 128MB + + VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit) + << " bytes. 
(" << string_human_readable_size(map_host_limit) << ")"; +} + +void CUDADevice::load_texture_info() +{ + if (need_texture_info) { + /* Unset flag before copying, so this does not loop indefinitely if the copy below calls + * into 'move_textures_to_host' (which calls 'load_texture_info' again). */ + need_texture_info = false; + texture_info.copy_to_device(); + } +} + +void CUDADevice::move_textures_to_host(size_t size, bool for_texture) +{ + /* Break out of recursive call, which can happen when moving memory on a multi device. */ + static bool any_device_moving_textures_to_host = false; + if (any_device_moving_textures_to_host) { + return; + } + + /* Signal to reallocate textures in host memory only. */ + move_texture_to_host = true; + + while (size > 0) { + /* Find suitable memory allocation to move. */ + device_memory *max_mem = NULL; + size_t max_size = 0; + bool max_is_image = false; + + thread_scoped_lock lock(cuda_mem_map_mutex); + foreach (CUDAMemMap::value_type &pair, cuda_mem_map) { + device_memory &mem = *pair.first; + CUDAMem *cmem = &pair.second; + + /* Can only move textures allocated on this device (and not those from peer devices). + * And need to ignore memory that is already on the host. */ + if (!mem.is_resident(this) || cmem->use_mapped_host) { + continue; + } + + bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && + (&mem != &texture_info); + bool is_image = is_texture && (mem.data_height > 1); + + /* Can't move this type of memory. */ + if (!is_texture || cmem->array) { + continue; + } + + /* For other textures, only move image textures. */ + if (for_texture && !is_image) { + continue; + } + + /* Try to move largest allocation, prefer moving images. */ + if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) { + max_is_image = is_image; + max_size = mem.device_size; + max_mem = &mem; + } + } + lock.unlock(); + + /* Move to host memory. 
This part is mutex protected since + * multiple CUDA devices could be moving the memory. The + * first one will do it, and the rest will adopt the pointer. */ + if (max_mem) { + VLOG(1) << "Move memory from device to host: " << max_mem->name; + + static thread_mutex move_mutex; + thread_scoped_lock lock(move_mutex); + + any_device_moving_textures_to_host = true; + + /* Potentially need to call back into multi device, so pointer mapping + * and peer devices are updated. This is also necessary since the device + * pointer may just be a key here, so cannot be accessed and freed directly. + * Unfortunately it does mean that memory is reallocated on all other + * devices as well, which is potentially dangerous when still in use (since + * a thread rendering on another devices would only be caught in this mutex + * if it so happens to do an allocation at the same time as well. */ + max_mem->device_copy_to(); + size = (max_size >= size) ? 0 : size - max_size; + + any_device_moving_textures_to_host = false; + } + else { + break; + } + } + + /* Unset flag before texture info is reloaded, since it should stay in device memory. */ + move_texture_to_host = false; + + /* Update texture info array with new pointers. */ + load_texture_info(); +} + +CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding) +{ + CUDAContextScope scope(this); + + CUdeviceptr device_pointer = 0; + size_t size = mem.memory_size() + pitch_padding; + + CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY; + const char *status = ""; + + /* First try allocating in device memory, respecting headroom. We make + * an exception for texture info. It is small and frequently accessed, + * so treat it as working memory. + * + * If there is not enough room for working memory, we will try to move + * textures to host memory, assuming the performance impact would have + * been worse for working memory. 
*/ + bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info); + bool is_image = is_texture && (mem.data_height > 1); + + size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom; + + size_t total = 0, free = 0; + cuMemGetInfo(&free, &total); + + /* Move textures to host memory if needed. */ + if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) { + move_textures_to_host(size + headroom - free, is_texture); + cuMemGetInfo(&free, &total); + } + + /* Allocate in device memory. */ + if (!move_texture_to_host && (size + headroom) < free) { + mem_alloc_result = cuMemAlloc(&device_pointer, size); + if (mem_alloc_result == CUDA_SUCCESS) { + status = " in device memory"; + } + } + + /* Fall back to mapped host memory if needed and possible. */ + + void *shared_pointer = 0; + + if (mem_alloc_result != CUDA_SUCCESS && can_map_host) { + if (mem.shared_pointer) { + /* Another device already allocated host memory. */ + mem_alloc_result = CUDA_SUCCESS; + shared_pointer = mem.shared_pointer; + } + else if (map_host_used + size < map_host_limit) { + /* Allocate host memory ourselves. */ + mem_alloc_result = cuMemHostAlloc( + &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED); + + assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) || + (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0)); + } + + if (mem_alloc_result == CUDA_SUCCESS) { + cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0)); + map_host_used += size; + status = " in host memory"; + } + } + + if (mem_alloc_result != CUDA_SUCCESS) { + status = " failed, out of device and host memory"; + set_error("System is out of GPU and shared host memory"); + } + + if (mem.name) { + VLOG(1) << "Buffer allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. 
(" + << string_human_readable_size(mem.memory_size()) << ")" << status; + } + + mem.device_pointer = (device_ptr)device_pointer; + mem.device_size = size; + stats.mem_alloc(size); + + if (!mem.device_pointer) { + return NULL; + } + + /* Insert into map of allocations. */ + thread_scoped_lock lock(cuda_mem_map_mutex); + CUDAMem *cmem = &cuda_mem_map[&mem]; + if (shared_pointer != 0) { + /* Replace host pointer with our host allocation. Only works if + * CUDA memory layout is the same and has no pitch padding. Also + * does not work if we move textures to host during a render, + * since other devices might be using the memory. */ + + if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer && + mem.host_pointer != shared_pointer) { + memcpy(shared_pointer, mem.host_pointer, size); + + /* A Call to device_memory::host_free() should be preceded by + * a call to device_memory::device_free() for host memory + * allocated by a device to be handled properly. Two exceptions + * are here and a call in OptiXDevice::generic_alloc(), where + * the current host memory can be assumed to be allocated by + * device_memory::host_alloc(), not by a device */ + + mem.host_free(); + mem.host_pointer = shared_pointer; + } + mem.shared_pointer = shared_pointer; + mem.shared_counter++; + cmem->use_mapped_host = true; + } + else { + cmem->use_mapped_host = false; + } + + return cmem; +} + +void CUDADevice::generic_copy_to(device_memory &mem) +{ + if (!mem.host_pointer || !mem.device_pointer) { + return; + } + + /* If use_mapped_host of mem is false, the current device only uses device memory allocated by + * cuMemAlloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from + * mem.host_pointer. 
*/ + thread_scoped_lock lock(cuda_mem_map_mutex); + if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) { + const CUDAContextScope scope(this); + cuda_assert( + cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size())); + } +} + +void CUDADevice::generic_free(device_memory &mem) +{ + if (mem.device_pointer) { + CUDAContextScope scope(this); + thread_scoped_lock lock(cuda_mem_map_mutex); + const CUDAMem &cmem = cuda_mem_map[&mem]; + + /* If cmem.use_mapped_host is true, reference counting is used + * to safely free a mapped host memory. */ + + if (cmem.use_mapped_host) { + assert(mem.shared_pointer); + if (mem.shared_pointer) { + assert(mem.shared_counter > 0); + if (--mem.shared_counter == 0) { + if (mem.host_pointer == mem.shared_pointer) { + mem.host_pointer = 0; + } + cuMemFreeHost(mem.shared_pointer); + mem.shared_pointer = 0; + } + } + map_host_used -= mem.device_size; + } + else { + /* Free device memory. */ + cuda_assert(cuMemFree(mem.device_pointer)); + } + + stats.mem_free(mem.device_size); + mem.device_pointer = 0; + mem.device_size = 0; + + cuda_mem_map.erase(cuda_mem_map.find(&mem)); + } +} + +void CUDADevice::mem_alloc(device_memory &mem) +{ + if (mem.type == MEM_TEXTURE) { + assert(!"mem_alloc not supported for textures."); + } + else if (mem.type == MEM_GLOBAL) { + assert(!"mem_alloc not supported for global memory."); + } + else { + generic_alloc(mem); + } +} + +void CUDADevice::mem_copy_to(device_memory &mem) +{ + if (mem.type == MEM_GLOBAL) { + global_free(mem); + global_alloc(mem); + } + else if (mem.type == MEM_TEXTURE) { + tex_free((device_texture &)mem); + tex_alloc((device_texture &)mem); + } + else { + if (!mem.device_pointer) { + generic_alloc(mem); + } + generic_copy_to(mem); + } +} + +void CUDADevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem) +{ + if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) { + assert(!"mem_copy_from not supported for textures."); + 
} + else if (mem.host_pointer) { + const size_t size = elem * w * h; + const size_t offset = elem * y * w; + + if (mem.device_pointer) { + const CUDAContextScope scope(this); + cuda_assert(cuMemcpyDtoH( + (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size)); + } + else { + memset((char *)mem.host_pointer + offset, 0, size); + } + } +} + +void CUDADevice::mem_zero(device_memory &mem) +{ + if (!mem.device_pointer) { + mem_alloc(mem); + } + if (!mem.device_pointer) { + return; + } + + /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory + * regardless of mem.host_pointer and mem.shared_pointer. */ + thread_scoped_lock lock(cuda_mem_map_mutex); + if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) { + const CUDAContextScope scope(this); + cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size())); + } + else if (mem.host_pointer) { + memset(mem.host_pointer, 0, mem.memory_size()); + } +} + +void CUDADevice::mem_free(device_memory &mem) +{ + if (mem.type == MEM_GLOBAL) { + global_free(mem); + } + else if (mem.type == MEM_TEXTURE) { + tex_free((device_texture &)mem); + } + else { + generic_free(mem); + } +} + +device_ptr CUDADevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) +{ + return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset)); +} + +void CUDADevice::const_copy_to(const char *name, void *host, size_t size) +{ + CUDAContextScope scope(this); + CUdeviceptr mem; + size_t bytes; + + cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name)); + // assert(bytes == size); + cuda_assert(cuMemcpyHtoD(mem, host, size)); +} + +void CUDADevice::global_alloc(device_memory &mem) +{ + if (mem.is_resident(this)) { + generic_alloc(mem); + generic_copy_to(mem); + } + + const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer)); +} + +void CUDADevice::global_free(device_memory &mem) +{ + if 
(mem.is_resident(this) && mem.device_pointer) { + generic_free(mem); + } +} + +void CUDADevice::tex_alloc(device_texture &mem) +{ + CUDAContextScope scope(this); + + /* General variables for both architectures */ + string bind_name = mem.name; + size_t dsize = datatype_size(mem.data_type); + size_t size = mem.memory_size(); + + CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP; + switch (mem.info.extension) { + case EXTENSION_REPEAT: + address_mode = CU_TR_ADDRESS_MODE_WRAP; + break; + case EXTENSION_EXTEND: + address_mode = CU_TR_ADDRESS_MODE_CLAMP; + break; + case EXTENSION_CLIP: + address_mode = CU_TR_ADDRESS_MODE_BORDER; + break; + default: + assert(0); + break; + } + + CUfilter_mode filter_mode; + if (mem.info.interpolation == INTERPOLATION_CLOSEST) { + filter_mode = CU_TR_FILTER_MODE_POINT; + } + else { + filter_mode = CU_TR_FILTER_MODE_LINEAR; + } + + /* Image Texture Storage */ + CUarray_format_enum format; + switch (mem.data_type) { + case TYPE_UCHAR: + format = CU_AD_FORMAT_UNSIGNED_INT8; + break; + case TYPE_UINT16: + format = CU_AD_FORMAT_UNSIGNED_INT16; + break; + case TYPE_UINT: + format = CU_AD_FORMAT_UNSIGNED_INT32; + break; + case TYPE_INT: + format = CU_AD_FORMAT_SIGNED_INT32; + break; + case TYPE_FLOAT: + format = CU_AD_FORMAT_FLOAT; + break; + case TYPE_HALF: + format = CU_AD_FORMAT_HALF; + break; + default: + assert(0); + return; + } + + CUDAMem *cmem = NULL; + CUarray array_3d = NULL; + size_t src_pitch = mem.data_width * dsize * mem.data_elements; + size_t dst_pitch = src_pitch; + + if (!mem.is_resident(this)) { + thread_scoped_lock lock(cuda_mem_map_mutex); + cmem = &cuda_mem_map[&mem]; + cmem->texobject = 0; + + if (mem.data_depth > 1) { + array_3d = (CUarray)mem.device_pointer; + cmem->array = array_3d; + } + else if (mem.data_height > 0) { + dst_pitch = align_up(src_pitch, pitch_alignment); + } + } + else if (mem.data_depth > 1) { + /* 3D texture using array, there is no API for linear memory. 
*/ + CUDA_ARRAY3D_DESCRIPTOR desc; + + desc.Width = mem.data_width; + desc.Height = mem.data_height; + desc.Depth = mem.data_depth; + desc.Format = format; + desc.NumChannels = mem.data_elements; + desc.Flags = 0; + + VLOG(1) << "Array 3D allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + + cuda_assert(cuArray3DCreate(&array_3d, &desc)); + + if (!array_3d) { + return; + } + + CUDA_MEMCPY3D param; + memset(¶m, 0, sizeof(param)); + param.dstMemoryType = CU_MEMORYTYPE_ARRAY; + param.dstArray = array_3d; + param.srcMemoryType = CU_MEMORYTYPE_HOST; + param.srcHost = mem.host_pointer; + param.srcPitch = src_pitch; + param.WidthInBytes = param.srcPitch; + param.Height = mem.data_height; + param.Depth = mem.data_depth; + + cuda_assert(cuMemcpy3D(¶m)); + + mem.device_pointer = (device_ptr)array_3d; + mem.device_size = size; + stats.mem_alloc(size); + + thread_scoped_lock lock(cuda_mem_map_mutex); + cmem = &cuda_mem_map[&mem]; + cmem->texobject = 0; + cmem->array = array_3d; + } + else if (mem.data_height > 0) { + /* 2D texture, using pitch aligned linear memory. */ + dst_pitch = align_up(src_pitch, pitch_alignment); + size_t dst_size = dst_pitch * mem.data_height; + + cmem = generic_alloc(mem, dst_size - mem.memory_size()); + if (!cmem) { + return; + } + + CUDA_MEMCPY2D param; + memset(¶m, 0, sizeof(param)); + param.dstMemoryType = CU_MEMORYTYPE_DEVICE; + param.dstDevice = mem.device_pointer; + param.dstPitch = dst_pitch; + param.srcMemoryType = CU_MEMORYTYPE_HOST; + param.srcHost = mem.host_pointer; + param.srcPitch = src_pitch; + param.WidthInBytes = param.srcPitch; + param.Height = mem.data_height; + + cuda_assert(cuMemcpy2DUnaligned(¶m)); + } + else { + /* 1D texture, using linear memory. 
*/ + cmem = generic_alloc(mem); + if (!cmem) { + return; + } + + cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size)); + } + + /* Resize once */ + const uint slot = mem.slot; + if (slot >= texture_info.size()) { + /* Allocate some slots in advance, to reduce amount + * of re-allocations. */ + texture_info.resize(slot + 128); + } + + /* Set Mapping and tag that we need to (re-)upload to device */ + texture_info[slot] = mem.info; + need_texture_info = true; + + if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT && + mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) { + /* Kepler+, bindless textures. */ + CUDA_RESOURCE_DESC resDesc; + memset(&resDesc, 0, sizeof(resDesc)); + + if (array_3d) { + resDesc.resType = CU_RESOURCE_TYPE_ARRAY; + resDesc.res.array.hArray = array_3d; + resDesc.flags = 0; + } + else if (mem.data_height > 0) { + resDesc.resType = CU_RESOURCE_TYPE_PITCH2D; + resDesc.res.pitch2D.devPtr = mem.device_pointer; + resDesc.res.pitch2D.format = format; + resDesc.res.pitch2D.numChannels = mem.data_elements; + resDesc.res.pitch2D.height = mem.data_height; + resDesc.res.pitch2D.width = mem.data_width; + resDesc.res.pitch2D.pitchInBytes = dst_pitch; + } + else { + resDesc.resType = CU_RESOURCE_TYPE_LINEAR; + resDesc.res.linear.devPtr = mem.device_pointer; + resDesc.res.linear.format = format; + resDesc.res.linear.numChannels = mem.data_elements; + resDesc.res.linear.sizeInBytes = mem.device_size; + } + + CUDA_TEXTURE_DESC texDesc; + memset(&texDesc, 0, sizeof(texDesc)); + texDesc.addressMode[0] = address_mode; + texDesc.addressMode[1] = address_mode; + texDesc.addressMode[2] = address_mode; + texDesc.filterMode = filter_mode; + texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES; + + thread_scoped_lock lock(cuda_mem_map_mutex); + cmem = &cuda_mem_map[&mem]; + + cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL)); + + texture_info[slot].data = (uint64_t)cmem->texobject; + } + else { + texture_info[slot].data = 
(uint64_t)mem.device_pointer; + } +} + +void CUDADevice::tex_free(device_texture &mem) +{ + if (mem.device_pointer) { + CUDAContextScope scope(this); + thread_scoped_lock lock(cuda_mem_map_mutex); + const CUDAMem &cmem = cuda_mem_map[&mem]; + + if (cmem.texobject) { + /* Free bindless texture. */ + cuTexObjectDestroy(cmem.texobject); + } + + if (!mem.is_resident(this)) { + /* Do not free memory here, since it was allocated on a different device. */ + cuda_mem_map.erase(cuda_mem_map.find(&mem)); + } + else if (cmem.array) { + /* Free array. */ + cuArrayDestroy(cmem.array); + stats.mem_free(mem.device_size); + mem.device_pointer = 0; + mem.device_size = 0; + + cuda_mem_map.erase(cuda_mem_map.find(&mem)); + } + else { + lock.unlock(); + generic_free(mem); + } + } +} + +# if 0 +void CUDADevice::render(DeviceTask &task, + RenderTile &rtile, + device_vector<KernelWorkTile> &work_tiles) +{ + scoped_timer timer(&rtile.buffers->render_time); + + if (have_error()) + return; + + CUDAContextScope scope(this); + CUfunction cuRender; + + /* Get kernel function. */ + if (rtile.task == RenderTile::BAKE) { + cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake")); + } + else { + cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace")); + } + + if (have_error()) { + return; + } + + cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1)); + + /* Allocate work tile. */ + work_tiles.alloc(1); + + KernelWorkTile *wtile = work_tiles.data(); + wtile->x = rtile.x; + wtile->y = rtile.y; + wtile->w = rtile.w; + wtile->h = rtile.h; + wtile->offset = rtile.offset; + wtile->stride = rtile.stride; + wtile->buffer = (float *)(CUdeviceptr)rtile.buffer; + + /* Prepare work size. More step samples render faster, but for now we + * remain conservative for GPUs connected to a display to avoid driver + * timeouts and display freezing. 
*/ + int min_blocks, num_threads_per_block; + cuda_assert( + cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0)); + if (!info.display_device) { + min_blocks *= 8; + } + + uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h); + + /* Render all samples. */ + uint start_sample = rtile.start_sample; + uint end_sample = rtile.start_sample + rtile.num_samples; + + for (int sample = start_sample; sample < end_sample;) { + /* Setup and copy work tile to device. */ + wtile->start_sample = sample; + wtile->num_samples = step_samples; + if (task.adaptive_sampling.use) { + wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples); + } + wtile->num_samples = min(wtile->num_samples, end_sample - sample); + work_tiles.copy_to_device(); + + CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer; + uint total_work_size = wtile->w * wtile->h * wtile->num_samples; + uint num_blocks = divide_up(total_work_size, num_threads_per_block); + + /* Launch kernel. */ + void *args[] = {&d_work_tiles, &total_work_size}; + + cuda_assert( + cuLaunchKernel(cuRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0)); + + /* Run the adaptive sampling kernels at selected samples aligned to step samples. */ + uint filter_sample = sample + wtile->num_samples - 1; + if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) { + adaptive_sampling_filter(filter_sample, wtile, d_work_tiles); + } + + cuda_assert(cuCtxSynchronize()); + + /* Update progress. */ + sample += wtile->num_samples; + rtile.sample = sample; + task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples); + + if (task.get_cancel()) { + if (task.need_finish_queue == false) + break; + } + } + + /* Finalize adaptive sampling. 
*/ + if (task.adaptive_sampling.use) { + CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer; + adaptive_sampling_post(rtile, wtile, d_work_tiles); + cuda_assert(cuCtxSynchronize()); + task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples); + } +} + +void CUDADevice::thread_run(DeviceTask &task) +{ + CUDAContextScope scope(this); + + if (task.type == DeviceTask::RENDER) { + device_vector<KernelWorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY); + + /* keep rendering tiles until done */ + RenderTile tile; + DenoisingTask denoising(this, task); + + while (task.acquire_tile(this, tile, task.tile_types)) { + if (tile.task == RenderTile::PATH_TRACE) { + render(task, tile, work_tiles); + } + else if (tile.task == RenderTile::BAKE) { + render(task, tile, work_tiles); + } + + task.release_tile(tile); + + if (task.get_cancel()) { + if (task.need_finish_queue == false) + break; + } + } + + work_tiles.free(); + } +} +# endif + +unique_ptr<DeviceQueue> CUDADevice::gpu_queue_create() +{ + return make_unique<CUDADeviceQueue>(this); +} + +bool CUDADevice::should_use_graphics_interop() +{ + /* Check whether this device is part of OpenGL context. + * + * Using CUDA device for graphics interoperability which is not part of the OpenGL context is + * possible, but from the empiric measurements it can be considerably slower than using naive + * pixels copy. 
*/ + + CUDAContextScope scope(this); + + int num_all_devices = 0; + cuda_assert(cuDeviceGetCount(&num_all_devices)); + + if (num_all_devices == 0) { + return false; + } + + vector<CUdevice> gl_devices(num_all_devices); + uint num_gl_devices; + cuGLGetDevices(&num_gl_devices, gl_devices.data(), num_all_devices, CU_GL_DEVICE_LIST_ALL); + + for (CUdevice gl_device : gl_devices) { + if (gl_device == cuDevice) { + return true; + } + } + + return false; +} + +int CUDADevice::get_num_multiprocessors() +{ + return get_device_default_attribute(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 0); +} + +int CUDADevice::get_max_num_threads_per_multiprocessor() +{ + return get_device_default_attribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, 0); +} + +bool CUDADevice::get_device_attribute(CUdevice_attribute attribute, int *value) +{ + CUDAContextScope scope(this); + + return cuDeviceGetAttribute(value, attribute, cuDevice) == CUDA_SUCCESS; +} + +int CUDADevice::get_device_default_attribute(CUdevice_attribute attribute, int default_value) +{ + int value = 0; + if (!get_device_attribute(attribute, &value)) { + return default_value; + } + return value; +} + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/device/cuda/device_impl.h b/intern/cycles/device/cuda/device_impl.h new file mode 100644 index 00000000000..6b27db54ab4 --- /dev/null +++ b/intern/cycles/device/cuda/device_impl.h @@ -0,0 +1,155 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_CUDA + +# include "device/cuda/kernel.h" +# include "device/cuda/queue.h" +# include "device/cuda/util.h" +# include "device/device.h" + +# include "util/util_map.h" + +# ifdef WITH_CUDA_DYNLOAD +# include "cuew.h" +# else +# include "util/util_opengl.h" +# include <cuda.h> +# include <cudaGL.h> +# endif + +CCL_NAMESPACE_BEGIN + +class DeviceQueue; + +class CUDADevice : public Device { + + friend class CUDAContextScope; + + public: + CUdevice cuDevice; + CUcontext cuContext; + CUmodule cuModule; + size_t device_texture_headroom; + size_t device_working_headroom; + bool move_texture_to_host; + size_t map_host_used; + size_t map_host_limit; + int can_map_host; + int pitch_alignment; + int cuDevId; + int cuDevArchitecture; + bool first_error; + + struct CUDAMem { + CUDAMem() : texobject(0), array(0), use_mapped_host(false) + { + } + + CUtexObject texobject; + CUarray array; + + /* If true, a mapped host memory in shared_pointer is being used. 
*/ + bool use_mapped_host; + }; + typedef map<device_memory *, CUDAMem> CUDAMemMap; + CUDAMemMap cuda_mem_map; + thread_mutex cuda_mem_map_mutex; + + /* Bindless Textures */ + device_vector<TextureInfo> texture_info; + bool need_texture_info; + + CUDADeviceKernels kernels; + + static bool have_precompiled_kernels(); + + virtual bool show_samples() const override; + + virtual BVHLayoutMask get_bvh_layout_mask() const override; + + void set_error(const string &error) override; + + CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler); + + virtual ~CUDADevice(); + + bool support_device(const uint /*kernel_features*/); + + bool check_peer_access(Device *peer_device) override; + + bool use_adaptive_compilation(); + + virtual string compile_kernel_get_common_cflags(const uint kernel_features); + + string compile_kernel(const uint kernel_features, + const char *name, + const char *base = "cuda", + bool force_ptx = false); + + virtual bool load_kernels(const uint kernel_features) override; + + void reserve_local_memory(const uint kernel_features); + + void init_host_memory(); + + void load_texture_info(); + + void move_textures_to_host(size_t size, bool for_texture); + + CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0); + + void generic_copy_to(device_memory &mem); + + void generic_free(device_memory &mem); + + void mem_alloc(device_memory &mem) override; + + void mem_copy_to(device_memory &mem) override; + + void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override; + + void mem_zero(device_memory &mem) override; + + void mem_free(device_memory &mem) override; + + device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override; + + virtual void const_copy_to(const char *name, void *host, size_t size) override; + + void global_alloc(device_memory &mem); + + void global_free(device_memory &mem); + + void tex_alloc(device_texture &mem); + + void tex_free(device_texture &mem); + + virtual bool 
should_use_graphics_interop() override; + + virtual unique_ptr<DeviceQueue> gpu_queue_create() override; + + int get_num_multiprocessors(); + int get_max_num_threads_per_multiprocessor(); + + protected: + bool get_device_attribute(CUdevice_attribute attribute, int *value); + int get_device_default_attribute(CUdevice_attribute attribute, int default_value); +}; + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/device/cuda/graphics_interop.cpp b/intern/cycles/device/cuda/graphics_interop.cpp new file mode 100644 index 00000000000..e8ca8b90eae --- /dev/null +++ b/intern/cycles/device/cuda/graphics_interop.cpp @@ -0,0 +1,102 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef WITH_CUDA + +# include "device/cuda/graphics_interop.h" + +# include "device/cuda/device_impl.h" +# include "device/cuda/util.h" + +CCL_NAMESPACE_BEGIN + +CUDADeviceGraphicsInterop::CUDADeviceGraphicsInterop(CUDADeviceQueue *queue) + : queue_(queue), device_(static_cast<CUDADevice *>(queue->device)) +{ +} + +CUDADeviceGraphicsInterop::~CUDADeviceGraphicsInterop() +{ + CUDAContextScope scope(device_); + + if (cu_graphics_resource_) { + cuda_device_assert(device_, cuGraphicsUnregisterResource(cu_graphics_resource_)); + } +} + +void CUDADeviceGraphicsInterop::set_destination( + const DeviceGraphicsInteropDestination &destination) +{ + const int64_t new_buffer_area = int64_t(destination.buffer_width) * destination.buffer_height; + + need_clear_ = destination.need_clear; + + if (opengl_pbo_id_ == destination.opengl_pbo_id && buffer_area_ == new_buffer_area) { + return; + } + + CUDAContextScope scope(device_); + + if (cu_graphics_resource_) { + cuda_device_assert(device_, cuGraphicsUnregisterResource(cu_graphics_resource_)); + } + + const CUresult result = cuGraphicsGLRegisterBuffer( + &cu_graphics_resource_, destination.opengl_pbo_id, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE); + if (result != CUDA_SUCCESS) { + LOG(ERROR) << "Error registering OpenGL buffer: " << cuewErrorString(result); + } + + opengl_pbo_id_ = destination.opengl_pbo_id; + buffer_area_ = new_buffer_area; +} + +device_ptr CUDADeviceGraphicsInterop::map() +{ + if (!cu_graphics_resource_) { + return 0; + } + + CUDAContextScope scope(device_); + + CUdeviceptr cu_buffer; + size_t bytes; + + cuda_device_assert(device_, cuGraphicsMapResources(1, &cu_graphics_resource_, queue_->stream())); + cuda_device_assert( + device_, cuGraphicsResourceGetMappedPointer(&cu_buffer, &bytes, cu_graphics_resource_)); + + if (need_clear_) { + cuda_device_assert( + device_, cuMemsetD8Async(static_cast<CUdeviceptr>(cu_buffer), 0, bytes, queue_->stream())); + + need_clear_ = false; + } + + return 
static_cast<device_ptr>(cu_buffer); +} + +void CUDADeviceGraphicsInterop::unmap() +{ + CUDAContextScope scope(device_); + + cuda_device_assert(device_, + cuGraphicsUnmapResources(1, &cu_graphics_resource_, queue_->stream())); +} + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/device/cuda/graphics_interop.h b/intern/cycles/device/cuda/graphics_interop.h new file mode 100644 index 00000000000..8a70c8aa71d --- /dev/null +++ b/intern/cycles/device/cuda/graphics_interop.h @@ -0,0 +1,66 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef WITH_CUDA + +# include "device/device_graphics_interop.h" + +# ifdef WITH_CUDA_DYNLOAD +# include "cuew.h" +# else +# include <cuda.h> +# endif + +CCL_NAMESPACE_BEGIN + +class CUDADevice; +class CUDADeviceQueue; + +class CUDADeviceGraphicsInterop : public DeviceGraphicsInterop { + public: + explicit CUDADeviceGraphicsInterop(CUDADeviceQueue *queue); + + CUDADeviceGraphicsInterop(const CUDADeviceGraphicsInterop &other) = delete; + CUDADeviceGraphicsInterop(CUDADeviceGraphicsInterop &&other) noexcept = delete; + + ~CUDADeviceGraphicsInterop(); + + CUDADeviceGraphicsInterop &operator=(const CUDADeviceGraphicsInterop &other) = delete; + CUDADeviceGraphicsInterop &operator=(CUDADeviceGraphicsInterop &&other) = delete; + + virtual void set_destination(const DeviceGraphicsInteropDestination &destination) override; + + virtual device_ptr map() override; + virtual void unmap() override; + + protected: + CUDADeviceQueue *queue_ = nullptr; + CUDADevice *device_ = nullptr; + + /* OpenGL PBO which is currently registered as the destination for the CUDA buffer. */ + uint opengl_pbo_id_ = 0; + /* Buffer area in pixels of the corresponding PBO. */ + int64_t buffer_area_ = 0; + + /* The destination was requested to be cleared. */ + bool need_clear_ = false; + + CUgraphicsResource cu_graphics_resource_ = nullptr; +}; + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/device/cuda/kernel.cpp b/intern/cycles/device/cuda/kernel.cpp new file mode 100644 index 00000000000..0ed20ddf8e6 --- /dev/null +++ b/intern/cycles/device/cuda/kernel.cpp @@ -0,0 +1,69 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_CUDA + +# include "device/cuda/kernel.h" +# include "device/cuda/device_impl.h" + +CCL_NAMESPACE_BEGIN + +void CUDADeviceKernels::load(CUDADevice *device) +{ + CUmodule cuModule = device->cuModule; + + for (int i = 0; i < (int)DEVICE_KERNEL_NUM; i++) { + CUDADeviceKernel &kernel = kernels_[i]; + + /* No megakernel used for GPU. */ + if (i == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) { + continue; + } + + const std::string function_name = std::string("kernel_gpu_") + + device_kernel_as_string((DeviceKernel)i); + cuda_device_assert(device, + cuModuleGetFunction(&kernel.function, cuModule, function_name.c_str())); + + if (kernel.function) { + cuda_device_assert(device, cuFuncSetCacheConfig(kernel.function, CU_FUNC_CACHE_PREFER_L1)); + + cuda_device_assert( + device, + cuOccupancyMaxPotentialBlockSize( + &kernel.min_blocks, &kernel.num_threads_per_block, kernel.function, NULL, 0, 0)); + } + else { + LOG(ERROR) << "Unable to load kernel " << function_name; + } + } + + loaded = true; +} + +const CUDADeviceKernel &CUDADeviceKernels::get(DeviceKernel kernel) const +{ + return kernels_[(int)kernel]; +} + +bool CUDADeviceKernels::available(DeviceKernel kernel) const +{ + return kernels_[(int)kernel].function != nullptr; +} + +CCL_NAMESPACE_END + +#endif /* WITH_CUDA*/ diff --git a/intern/cycles/device/cuda/kernel.h b/intern/cycles/device/cuda/kernel.h new file mode 100644 index 00000000000..b489547a350 --- /dev/null +++ b/intern/cycles/device/cuda/kernel.h @@ -0,0 +1,56 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under 
the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef WITH_CUDA + +# include "device/device_kernel.h" + +# ifdef WITH_CUDA_DYNLOAD +# include "cuew.h" +# else +# include <cuda.h> +# endif + +CCL_NAMESPACE_BEGIN + +class CUDADevice; + +/* CUDA kernel and associate occupancy information. */ +class CUDADeviceKernel { + public: + CUfunction function = nullptr; + + int num_threads_per_block = 0; + int min_blocks = 0; +}; + +/* Cache of CUDA kernels for each DeviceKernel. */ +class CUDADeviceKernels { + public: + void load(CUDADevice *device); + const CUDADeviceKernel &get(DeviceKernel kernel) const; + bool available(DeviceKernel kernel) const; + + protected: + CUDADeviceKernel kernels_[DEVICE_KERNEL_NUM]; + bool loaded = false; +}; + +CCL_NAMESPACE_END + +#endif /* WITH_CUDA */ diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp new file mode 100644 index 00000000000..b7f86c10553 --- /dev/null +++ b/intern/cycles/device/cuda/queue.cpp @@ -0,0 +1,220 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_CUDA + +# include "device/cuda/queue.h" + +# include "device/cuda/device_impl.h" +# include "device/cuda/graphics_interop.h" +# include "device/cuda/kernel.h" + +CCL_NAMESPACE_BEGIN + +/* CUDADeviceQueue */ + +CUDADeviceQueue::CUDADeviceQueue(CUDADevice *device) + : DeviceQueue(device), cuda_device_(device), cuda_stream_(nullptr) +{ + const CUDAContextScope scope(cuda_device_); + cuda_device_assert(cuda_device_, cuStreamCreate(&cuda_stream_, CU_STREAM_NON_BLOCKING)); +} + +CUDADeviceQueue::~CUDADeviceQueue() +{ + const CUDAContextScope scope(cuda_device_); + cuStreamDestroy(cuda_stream_); +} + +int CUDADeviceQueue::num_concurrent_states(const size_t state_size) const +{ + int num_states = max(cuda_device_->get_num_multiprocessors() * + cuda_device_->get_max_num_threads_per_multiprocessor() * 16, + 1048576); + + const char *factor_str = getenv("CYCLES_CONCURRENT_STATES_FACTOR"); + if (factor_str) { + num_states = max((int)(num_states * atof(factor_str)), 1024); + } + + VLOG(3) << "GPU queue concurrent states: " << num_states << ", using up to " + << string_human_readable_size(num_states * state_size); + + return num_states; +} + +int CUDADeviceQueue::num_concurrent_busy_states() const +{ + const int max_num_threads = cuda_device_->get_num_multiprocessors() * + cuda_device_->get_max_num_threads_per_multiprocessor(); + + if (max_num_threads == 0) { + return 65536; + } + + return 4 * max_num_threads; +} + +void CUDADeviceQueue::init_execution() +{ + /* Synchronize all textures and memory copies before executing task. 
*/ + CUDAContextScope scope(cuda_device_); + cuda_device_->load_texture_info(); + cuda_device_assert(cuda_device_, cuCtxSynchronize()); + + debug_init_execution(); +} + +bool CUDADeviceQueue::kernel_available(DeviceKernel kernel) const +{ + return cuda_device_->kernels.available(kernel); +} + +bool CUDADeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[]) +{ + if (cuda_device_->have_error()) { + return false; + } + + debug_enqueue(kernel, work_size); + + const CUDAContextScope scope(cuda_device_); + const CUDADeviceKernel &cuda_kernel = cuda_device_->kernels.get(kernel); + + /* Compute kernel launch parameters. */ + const int num_threads_per_block = cuda_kernel.num_threads_per_block; + const int num_blocks = divide_up(work_size, num_threads_per_block); + + int shared_mem_bytes = 0; + + switch (kernel) { + case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY: + /* See parall_active_index.h for why this amount of shared memory is needed. */ + shared_mem_bytes = (num_threads_per_block + 1) * sizeof(int); + break; + + default: + break; + } + + /* Launch kernel. 
*/ + cuda_device_assert(cuda_device_, + cuLaunchKernel(cuda_kernel.function, + num_blocks, + 1, + 1, + num_threads_per_block, + 1, + 1, + shared_mem_bytes, + cuda_stream_, + args, + 0)); + + return !(cuda_device_->have_error()); +} + +bool CUDADeviceQueue::synchronize() +{ + if (cuda_device_->have_error()) { + return false; + } + + const CUDAContextScope scope(cuda_device_); + cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_)); + debug_synchronize(); + + return !(cuda_device_->have_error()); +} + +void CUDADeviceQueue::zero_to_device(device_memory &mem) +{ + assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE); + + if (mem.memory_size() == 0) { + return; + } + + /* Allocate on demand. */ + if (mem.device_pointer == 0) { + cuda_device_->mem_alloc(mem); + } + + /* Zero memory on device. */ + assert(mem.device_pointer != 0); + + const CUDAContextScope scope(cuda_device_); + cuda_device_assert( + cuda_device_, + cuMemsetD8Async((CUdeviceptr)mem.device_pointer, 0, mem.memory_size(), cuda_stream_)); +} + +void CUDADeviceQueue::copy_to_device(device_memory &mem) +{ + assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE); + + if (mem.memory_size() == 0) { + return; + } + + /* Allocate on demand. */ + if (mem.device_pointer == 0) { + cuda_device_->mem_alloc(mem); + } + + assert(mem.device_pointer != 0); + assert(mem.host_pointer != nullptr); + + /* Copy memory to device. */ + const CUDAContextScope scope(cuda_device_); + cuda_device_assert( + cuda_device_, + cuMemcpyHtoDAsync( + (CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size(), cuda_stream_)); +} + +void CUDADeviceQueue::copy_from_device(device_memory &mem) +{ + assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE); + + if (mem.memory_size() == 0) { + return; + } + + assert(mem.device_pointer != 0); + assert(mem.host_pointer != nullptr); + + /* Copy memory from device. 
*/ + const CUDAContextScope scope(cuda_device_); + cuda_device_assert( + cuda_device_, + cuMemcpyDtoHAsync( + mem.host_pointer, (CUdeviceptr)mem.device_pointer, mem.memory_size(), cuda_stream_)); +} + +unique_ptr<DeviceGraphicsInterop> CUDADeviceQueue::graphics_interop_create() +{ + return make_unique<CUDADeviceGraphicsInterop>(this); +} + +CCL_NAMESPACE_END + +#endif /* WITH_CUDA */ diff --git a/intern/cycles/device/cuda/queue.h b/intern/cycles/device/cuda/queue.h new file mode 100644 index 00000000000..62e3aa3d6c2 --- /dev/null +++ b/intern/cycles/device/cuda/queue.h @@ -0,0 +1,67 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef WITH_CUDA + +# include "device/device_kernel.h" +# include "device/device_memory.h" +# include "device/device_queue.h" + +# include "device/cuda/util.h" + +CCL_NAMESPACE_BEGIN + +class CUDADevice; +class device_memory; + +/* Base class for CUDA queues. 
*/ +class CUDADeviceQueue : public DeviceQueue { + public: + CUDADeviceQueue(CUDADevice *device); + ~CUDADeviceQueue(); + + virtual int num_concurrent_states(const size_t state_size) const override; + virtual int num_concurrent_busy_states() const override; + + virtual void init_execution() override; + + virtual bool kernel_available(DeviceKernel kernel) const override; + + virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) override; + + virtual bool synchronize() override; + + virtual void zero_to_device(device_memory &mem) override; + virtual void copy_to_device(device_memory &mem) override; + virtual void copy_from_device(device_memory &mem) override; + + virtual CUstream stream() + { + return cuda_stream_; + } + + virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create() override; + + protected: + CUDADevice *cuda_device_; + CUstream cuda_stream_; +}; + +CCL_NAMESPACE_END + +#endif /* WITH_CUDA */ diff --git a/intern/cycles/device/cuda/util.cpp b/intern/cycles/device/cuda/util.cpp new file mode 100644 index 00000000000..8f657cc10fe --- /dev/null +++ b/intern/cycles/device/cuda/util.cpp @@ -0,0 +1,61 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef WITH_CUDA + +# include "device/cuda/util.h" +# include "device/cuda/device_impl.h" + +CCL_NAMESPACE_BEGIN + +CUDAContextScope::CUDAContextScope(CUDADevice *device) : device(device) +{ + cuda_device_assert(device, cuCtxPushCurrent(device->cuContext)); +} + +CUDAContextScope::~CUDAContextScope() +{ + cuda_device_assert(device, cuCtxPopCurrent(NULL)); +} + +# ifndef WITH_CUDA_DYNLOAD +const char *cuewErrorString(CUresult result) +{ + /* We can only give error code here without major code duplication, that + * should be enough since dynamic loading is only being disabled by folks + * who knows what they're doing anyway. + * + * NOTE: Avoid call from several threads. + */ + static string error; + error = string_printf("%d", result); + return error.c_str(); +} + +const char *cuewCompilerPath() +{ + return CYCLES_CUDA_NVCC_EXECUTABLE; +} + +int cuewCompilerVersion() +{ + return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10); +} +# endif + +CCL_NAMESPACE_END + +#endif /* WITH_CUDA */ diff --git a/intern/cycles/device/cuda/util.h b/intern/cycles/device/cuda/util.h new file mode 100644 index 00000000000..a0898094c08 --- /dev/null +++ b/intern/cycles/device/cuda/util.h @@ -0,0 +1,65 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#ifdef WITH_CUDA + +# ifdef WITH_CUDA_DYNLOAD +# include "cuew.h" +# else +# include <cuda.h> +# endif + +CCL_NAMESPACE_BEGIN + +class CUDADevice; + +/* Utility to push/pop CUDA context. */ +class CUDAContextScope { + public: + CUDAContextScope(CUDADevice *device); + ~CUDAContextScope(); + + private: + CUDADevice *device; +}; + +/* Utility for checking return values of CUDA function calls. */ +# define cuda_device_assert(cuda_device, stmt) \ + { \ + CUresult result = stmt; \ + if (result != CUDA_SUCCESS) { \ + const char *name = cuewErrorString(result); \ + cuda_device->set_error( \ + string_printf("%s in %s (%s:%d)", name, #stmt, __FILE__, __LINE__)); \ + } \ + } \ + (void)0 + +# define cuda_assert(stmt) cuda_device_assert(this, stmt) + +# ifndef WITH_CUDA_DYNLOAD +/* Transparently implement some functions, so majority of the file does not need + * to worry about difference between dynamically loaded and linked CUDA at all. */ +const char *cuewErrorString(CUresult result); +const char *cuewCompilerPath(); +int cuewCompilerVersion(); +# endif /* WITH_CUDA_DYNLOAD */ + +CCL_NAMESPACE_END + +#endif /* WITH_CUDA */ diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index ed53fbb54ae..6ccedcf54ef 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -20,7 +20,13 @@ #include "bvh/bvh2.h" #include "device/device.h" -#include "device/device_intern.h" +#include "device/device_queue.h" + +#include "device/cpu/device.h" +#include "device/cuda/device.h" +#include "device/dummy/device.h" +#include "device/multi/device.h" +#include "device/optix/device.h" #include "util/util_foreach.h" #include "util/util_half.h" @@ -38,332 +44,15 @@ CCL_NAMESPACE_BEGIN bool Device::need_types_update = true; bool Device::need_devices_update = true; thread_mutex Device::device_mutex; -vector<DeviceInfo> Device::opencl_devices; vector<DeviceInfo> Device::cuda_devices; vector<DeviceInfo> Device::optix_devices; 
vector<DeviceInfo> Device::cpu_devices; -vector<DeviceInfo> Device::network_devices; uint Device::devices_initialized_mask = 0; -/* Device Requested Features */ - -std::ostream &operator<<(std::ostream &os, const DeviceRequestedFeatures &requested_features) -{ - os << "Experimental features: " << (requested_features.experimental ? "On" : "Off") << std::endl; - os << "Max nodes group: " << requested_features.max_nodes_group << std::endl; - /* TODO(sergey): Decode bitflag into list of names. */ - os << "Nodes features: " << requested_features.nodes_features << std::endl; - os << "Use Hair: " << string_from_bool(requested_features.use_hair) << std::endl; - os << "Use Object Motion: " << string_from_bool(requested_features.use_object_motion) - << std::endl; - os << "Use Camera Motion: " << string_from_bool(requested_features.use_camera_motion) - << std::endl; - os << "Use Baking: " << string_from_bool(requested_features.use_baking) << std::endl; - os << "Use Subsurface: " << string_from_bool(requested_features.use_subsurface) << std::endl; - os << "Use Volume: " << string_from_bool(requested_features.use_volume) << std::endl; - os << "Use Branched Integrator: " << string_from_bool(requested_features.use_integrator_branched) - << std::endl; - os << "Use Patch Evaluation: " << string_from_bool(requested_features.use_patch_evaluation) - << std::endl; - os << "Use Transparent Shadows: " << string_from_bool(requested_features.use_transparent) - << std::endl; - os << "Use Principled BSDF: " << string_from_bool(requested_features.use_principled) - << std::endl; - os << "Use Denoising: " << string_from_bool(requested_features.use_denoising) << std::endl; - os << "Use Displacement: " << string_from_bool(requested_features.use_true_displacement) - << std::endl; - os << "Use Background Light: " << string_from_bool(requested_features.use_background_light) - << std::endl; - return os; -} - /* Device */ Device::~Device() noexcept(false) { - if (!background) { - if (vertex_buffer != 
0) { - glDeleteBuffers(1, &vertex_buffer); - } - if (fallback_shader_program != 0) { - glDeleteProgram(fallback_shader_program); - } - } -} - -/* TODO move shaders to standalone .glsl file. */ -const char *FALLBACK_VERTEX_SHADER = - "#version 330\n" - "uniform vec2 fullscreen;\n" - "in vec2 texCoord;\n" - "in vec2 pos;\n" - "out vec2 texCoord_interp;\n" - "\n" - "vec2 normalize_coordinates()\n" - "{\n" - " return (vec2(2.0) * (pos / fullscreen)) - vec2(1.0);\n" - "}\n" - "\n" - "void main()\n" - "{\n" - " gl_Position = vec4(normalize_coordinates(), 0.0, 1.0);\n" - " texCoord_interp = texCoord;\n" - "}\n\0"; - -const char *FALLBACK_FRAGMENT_SHADER = - "#version 330\n" - "uniform sampler2D image_texture;\n" - "in vec2 texCoord_interp;\n" - "out vec4 fragColor;\n" - "\n" - "void main()\n" - "{\n" - " fragColor = texture(image_texture, texCoord_interp);\n" - "}\n\0"; - -static void shader_print_errors(const char *task, const char *log, const char *code) -{ - LOG(ERROR) << "Shader: " << task << " error:"; - LOG(ERROR) << "===== shader string ===="; - - stringstream stream(code); - string partial; - - int line = 1; - while (getline(stream, partial, '\n')) { - if (line < 10) { - LOG(ERROR) << " " << line << " " << partial; - } - else { - LOG(ERROR) << line << " " << partial; - } - line++; - } - LOG(ERROR) << log; -} - -static int bind_fallback_shader(void) -{ - GLint status; - GLchar log[5000]; - GLsizei length = 0; - GLuint program = 0; - - struct Shader { - const char *source; - GLenum type; - } shaders[2] = {{FALLBACK_VERTEX_SHADER, GL_VERTEX_SHADER}, - {FALLBACK_FRAGMENT_SHADER, GL_FRAGMENT_SHADER}}; - - program = glCreateProgram(); - - for (int i = 0; i < 2; i++) { - GLuint shader = glCreateShader(shaders[i].type); - - string source_str = shaders[i].source; - const char *c_str = source_str.c_str(); - - glShaderSource(shader, 1, &c_str, NULL); - glCompileShader(shader); - - glGetShaderiv(shader, GL_COMPILE_STATUS, &status); - - if (!status) { - 
glGetShaderInfoLog(shader, sizeof(log), &length, log); - shader_print_errors("compile", log, c_str); - return 0; - } - - glAttachShader(program, shader); - } - - /* Link output. */ - glBindFragDataLocation(program, 0, "fragColor"); - - /* Link and error check. */ - glLinkProgram(program); - - glGetProgramiv(program, GL_LINK_STATUS, &status); - if (!status) { - glGetShaderInfoLog(program, sizeof(log), &length, log); - shader_print_errors("linking", log, FALLBACK_VERTEX_SHADER); - shader_print_errors("linking", log, FALLBACK_FRAGMENT_SHADER); - return 0; - } - - return program; -} - -bool Device::bind_fallback_display_space_shader(const float width, const float height) -{ - if (fallback_status == FALLBACK_SHADER_STATUS_ERROR) { - return false; - } - - if (fallback_status == FALLBACK_SHADER_STATUS_NONE) { - fallback_shader_program = bind_fallback_shader(); - fallback_status = FALLBACK_SHADER_STATUS_ERROR; - - if (fallback_shader_program == 0) { - return false; - } - - glUseProgram(fallback_shader_program); - image_texture_location = glGetUniformLocation(fallback_shader_program, "image_texture"); - if (image_texture_location < 0) { - LOG(ERROR) << "Shader doesn't contain the 'image_texture' uniform."; - return false; - } - - fullscreen_location = glGetUniformLocation(fallback_shader_program, "fullscreen"); - if (fullscreen_location < 0) { - LOG(ERROR) << "Shader doesn't contain the 'fullscreen' uniform."; - return false; - } - - fallback_status = FALLBACK_SHADER_STATUS_SUCCESS; - } - - /* Run this every time. 
*/ - glUseProgram(fallback_shader_program); - glUniform1i(image_texture_location, 0); - glUniform2f(fullscreen_location, width, height); - return true; -} - -void Device::draw_pixels(device_memory &rgba, - int y, - int w, - int h, - int width, - int height, - int dx, - int dy, - int dw, - int dh, - bool transparent, - const DeviceDrawParams &draw_params) -{ - const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL); - - assert(rgba.type == MEM_PIXELS); - mem_copy_from(rgba, y, w, h, rgba.memory_elements_size(1)); - - GLuint texid; - glActiveTexture(GL_TEXTURE0); - glGenTextures(1, &texid); - glBindTexture(GL_TEXTURE_2D, texid); - - if (rgba.data_type == TYPE_HALF) { - GLhalf *data_pointer = (GLhalf *)rgba.host_pointer; - data_pointer += 4 * y * w; - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, w, h, 0, GL_RGBA, GL_HALF_FLOAT, data_pointer); - } - else { - uint8_t *data_pointer = (uint8_t *)rgba.host_pointer; - data_pointer += 4 * y * w; - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, w, h, 0, GL_RGBA, GL_UNSIGNED_BYTE, data_pointer); - } - - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); - - if (transparent) { - glEnable(GL_BLEND); - glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA); - } - - GLint shader_program; - if (use_fallback_shader) { - if (!bind_fallback_display_space_shader(dw, dh)) { - return; - } - shader_program = fallback_shader_program; - } - else { - draw_params.bind_display_space_shader_cb(); - glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program); - } - - if (!vertex_buffer) { - glGenBuffers(1, &vertex_buffer); - } - - glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer); - /* invalidate old contents - avoids stalling if buffer is still waiting in queue to be rendered - */ - glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW); - - float *vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY); - - if (vpointer) { - /* texture 
coordinate - vertex pair */ - vpointer[0] = 0.0f; - vpointer[1] = 0.0f; - vpointer[2] = dx; - vpointer[3] = dy; - - vpointer[4] = 1.0f; - vpointer[5] = 0.0f; - vpointer[6] = (float)width + dx; - vpointer[7] = dy; - - vpointer[8] = 1.0f; - vpointer[9] = 1.0f; - vpointer[10] = (float)width + dx; - vpointer[11] = (float)height + dy; - - vpointer[12] = 0.0f; - vpointer[13] = 1.0f; - vpointer[14] = dx; - vpointer[15] = (float)height + dy; - - if (vertex_buffer) { - glUnmapBuffer(GL_ARRAY_BUFFER); - } - } - - GLuint vertex_array_object; - GLuint position_attribute, texcoord_attribute; - - glGenVertexArrays(1, &vertex_array_object); - glBindVertexArray(vertex_array_object); - - texcoord_attribute = glGetAttribLocation(shader_program, "texCoord"); - position_attribute = glGetAttribLocation(shader_program, "pos"); - - glEnableVertexAttribArray(texcoord_attribute); - glEnableVertexAttribArray(position_attribute); - - glVertexAttribPointer( - texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0); - glVertexAttribPointer(position_attribute, - 2, - GL_FLOAT, - GL_FALSE, - 4 * sizeof(float), - (const GLvoid *)(sizeof(float) * 2)); - - glDrawArrays(GL_TRIANGLE_FAN, 0, 4); - - if (vertex_buffer) { - glBindBuffer(GL_ARRAY_BUFFER, 0); - } - - if (use_fallback_shader) { - glUseProgram(0); - } - else { - draw_params.unbind_display_space_shader_cb(); - } - - glDeleteVertexArrays(1, &vertex_array_object); - glBindTexture(GL_TEXTURE_2D, 0); - glDeleteTextures(1, &texid); - - if (transparent) { - glDisable(GL_BLEND); - } } void Device::build_bvh(BVH *bvh, Progress &progress, bool refit) @@ -379,14 +68,14 @@ void Device::build_bvh(BVH *bvh, Progress &progress, bool refit) } } -Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) +Device *Device::create(const DeviceInfo &info, Stats &stats, Profiler &profiler) { #ifdef WITH_MULTI if (!info.multi_devices.empty()) { /* Always create a multi device when info contains multiple 
devices. * This is done so that the type can still be e.g. DEVICE_CPU to indicate * that it is a homogeneous collection of devices, which simplifies checks. */ - return device_multi_create(info, stats, profiler, background); + return device_multi_create(info, stats, profiler); } #endif @@ -394,29 +83,18 @@ Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool switch (info.type) { case DEVICE_CPU: - device = device_cpu_create(info, stats, profiler, background); + device = device_cpu_create(info, stats, profiler); break; #ifdef WITH_CUDA case DEVICE_CUDA: if (device_cuda_init()) - device = device_cuda_create(info, stats, profiler, background); + device = device_cuda_create(info, stats, profiler); break; #endif #ifdef WITH_OPTIX case DEVICE_OPTIX: if (device_optix_init()) - device = device_optix_create(info, stats, profiler, background); - break; -#endif -#ifdef WITH_NETWORK - case DEVICE_NETWORK: - device = device_network_create(info, stats, profiler, "127.0.0.1"); - break; -#endif -#ifdef WITH_OPENCL - case DEVICE_OPENCL: - if (device_opencl_init()) - device = device_opencl_create(info, stats, profiler, background); + device = device_optix_create(info, stats, profiler); break; #endif default: @@ -424,7 +102,7 @@ Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool } if (device == NULL) { - device = device_dummy_create(info, stats, profiler, background); + device = device_dummy_create(info, stats, profiler); } return device; @@ -438,10 +116,6 @@ DeviceType Device::type_from_string(const char *name) return DEVICE_CUDA; else if (strcmp(name, "OPTIX") == 0) return DEVICE_OPTIX; - else if (strcmp(name, "OPENCL") == 0) - return DEVICE_OPENCL; - else if (strcmp(name, "NETWORK") == 0) - return DEVICE_NETWORK; else if (strcmp(name, "MULTI") == 0) return DEVICE_MULTI; @@ -456,10 +130,6 @@ string Device::string_from_type(DeviceType type) return "CUDA"; else if (type == DEVICE_OPTIX) return "OPTIX"; - else if (type == 
DEVICE_OPENCL) - return "OPENCL"; - else if (type == DEVICE_NETWORK) - return "NETWORK"; else if (type == DEVICE_MULTI) return "MULTI"; @@ -476,12 +146,6 @@ vector<DeviceType> Device::available_types() #ifdef WITH_OPTIX types.push_back(DEVICE_OPTIX); #endif -#ifdef WITH_OPENCL - types.push_back(DEVICE_OPENCL); -#endif -#ifdef WITH_NETWORK - types.push_back(DEVICE_NETWORK); -#endif return types; } @@ -493,20 +157,6 @@ vector<DeviceInfo> Device::available_devices(uint mask) thread_scoped_lock lock(device_mutex); vector<DeviceInfo> devices; -#ifdef WITH_OPENCL - if (mask & DEVICE_MASK_OPENCL) { - if (!(devices_initialized_mask & DEVICE_MASK_OPENCL)) { - if (device_opencl_init()) { - device_opencl_info(opencl_devices); - } - devices_initialized_mask |= DEVICE_MASK_OPENCL; - } - foreach (DeviceInfo &info, opencl_devices) { - devices.push_back(info); - } - } -#endif - #if defined(WITH_CUDA) || defined(WITH_OPTIX) if (mask & (DEVICE_MASK_CUDA | DEVICE_MASK_OPTIX)) { if (!(devices_initialized_mask & DEVICE_MASK_CUDA)) { @@ -547,18 +197,6 @@ vector<DeviceInfo> Device::available_devices(uint mask) } } -#ifdef WITH_NETWORK - if (mask & DEVICE_MASK_NETWORK) { - if (!(devices_initialized_mask & DEVICE_MASK_NETWORK)) { - device_network_info(network_devices); - devices_initialized_mask |= DEVICE_MASK_NETWORK; - } - foreach (DeviceInfo &info, network_devices) { - devices.push_back(info); - } - } -#endif - return devices; } @@ -580,15 +218,6 @@ string Device::device_capabilities(uint mask) capabilities += device_cpu_capabilities() + "\n"; } -#ifdef WITH_OPENCL - if (mask & DEVICE_MASK_OPENCL) { - if (device_opencl_init()) { - capabilities += "\nOpenCL device capabilities:\n"; - capabilities += device_opencl_capabilities(); - } - } -#endif - #ifdef WITH_CUDA if (mask & DEVICE_MASK_CUDA) { if (device_cuda_init()) { @@ -613,16 +242,13 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices, } DeviceInfo info; - info.type = subdevices.front().type; + info.type = 
DEVICE_NONE; info.id = "MULTI"; info.description = "Multi Device"; info.num = 0; info.has_half_images = true; info.has_nanovdb = true; - info.has_volume_decoupled = true; - info.has_branched_path = true; - info.has_adaptive_stop_per_sample = true; info.has_osl = true; info.has_profiling = true; info.has_peer_memory = false; @@ -660,16 +286,16 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices, info.id += device.id; /* Set device type to MULTI if subdevices are not of a common type. */ - if (device.type != info.type) { + if (info.type == DEVICE_NONE) { + info.type = device.type; + } + else if (device.type != info.type) { info.type = DEVICE_MULTI; } /* Accumulate device info. */ info.has_half_images &= device.has_half_images; info.has_nanovdb &= device.has_nanovdb; - info.has_volume_decoupled &= device.has_volume_decoupled; - info.has_branched_path &= device.has_branched_path; - info.has_adaptive_stop_per_sample &= device.has_adaptive_stop_per_sample; info.has_osl &= device.has_osl; info.has_profiling &= device.has_profiling; info.has_peer_memory |= device.has_peer_memory; @@ -689,60 +315,32 @@ void Device::free_memory() devices_initialized_mask = 0; cuda_devices.free_memory(); optix_devices.free_memory(); - opencl_devices.free_memory(); cpu_devices.free_memory(); - network_devices.free_memory(); } -/* DeviceInfo */ - -void DeviceInfo::add_denoising_devices(DenoiserType denoiser_type) +unique_ptr<DeviceQueue> Device::gpu_queue_create() { - assert(denoising_devices.empty()); - - if (denoiser_type == DENOISER_OPTIX && type != DEVICE_OPTIX) { - vector<DeviceInfo> optix_devices = Device::available_devices(DEVICE_MASK_OPTIX); - if (!optix_devices.empty()) { - /* Convert to a special multi device with separate denoising devices. */ - if (multi_devices.empty()) { - multi_devices.push_back(*this); - } - - /* Try to use the same physical devices for denoising. 
*/ - for (const DeviceInfo &cuda_device : multi_devices) { - if (cuda_device.type == DEVICE_CUDA) { - for (const DeviceInfo &optix_device : optix_devices) { - if (cuda_device.num == optix_device.num) { - id += optix_device.id; - denoising_devices.push_back(optix_device); - break; - } - } - } - } - - if (denoising_devices.empty()) { - /* Simply use the first available OptiX device. */ - const DeviceInfo optix_device = optix_devices.front(); - id += optix_device.id; /* Uniquely identify this special multi device. */ - denoising_devices.push_back(optix_device); - } + LOG(FATAL) << "Device does not support queues."; + return nullptr; +} - denoisers = denoiser_type; - } - } - else if (denoiser_type == DENOISER_OPENIMAGEDENOISE && type != DEVICE_CPU) { - /* Convert to a special multi device with separate denoising devices. */ - if (multi_devices.empty()) { - multi_devices.push_back(*this); - } +const CPUKernels *Device::get_cpu_kernels() const +{ + LOG(FATAL) << "Device does not support CPU kernels."; + return nullptr; +} - /* Add CPU denoising devices. 
*/ - DeviceInfo cpu_device = Device::available_devices(DEVICE_MASK_CPU).front(); - denoising_devices.push_back(cpu_device); +void Device::get_cpu_kernel_thread_globals( + vector<CPUKernelThreadGlobals> & /*kernel_thread_globals*/) +{ + LOG(FATAL) << "Device does not support CPU kernels."; +} - denoisers = denoiser_type; - } +void *Device::get_cpu_osl_memory() +{ + return nullptr; } +/* DeviceInfo */ + CCL_NAMESPACE_END diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index ecf79bcdfa6..02b6edb56d0 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -21,31 +21,34 @@ #include "bvh/bvh_params.h" +#include "device/device_denoise.h" #include "device/device_memory.h" -#include "device/device_task.h" +#include "util/util_function.h" #include "util/util_list.h" +#include "util/util_logging.h" #include "util/util_stats.h" #include "util/util_string.h" #include "util/util_texture.h" #include "util/util_thread.h" #include "util/util_types.h" +#include "util/util_unique_ptr.h" #include "util/util_vector.h" CCL_NAMESPACE_BEGIN class BVH; +class DeviceQueue; class Progress; -class RenderTile; +class CPUKernels; +class CPUKernelThreadGlobals; /* Device Types */ enum DeviceType { DEVICE_NONE = 0, DEVICE_CPU, - DEVICE_OPENCL, DEVICE_CUDA, - DEVICE_NETWORK, DEVICE_MULTI, DEVICE_OPTIX, DEVICE_DUMMY, @@ -53,20 +56,11 @@ enum DeviceType { enum DeviceTypeMask { DEVICE_MASK_CPU = (1 << DEVICE_CPU), - DEVICE_MASK_OPENCL = (1 << DEVICE_OPENCL), DEVICE_MASK_CUDA = (1 << DEVICE_CUDA), DEVICE_MASK_OPTIX = (1 << DEVICE_OPTIX), - DEVICE_MASK_NETWORK = (1 << DEVICE_NETWORK), DEVICE_MASK_ALL = ~0 }; -enum DeviceKernelStatus { - DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE, - DEVICE_KERNEL_USING_FEATURE_KERNEL, - DEVICE_KERNEL_FEATURE_KERNEL_INVALID, - DEVICE_KERNEL_UNKNOWN, -}; - #define DEVICE_MASK(type) (DeviceTypeMask)(1 << type) class DeviceInfo { @@ -75,20 +69,16 @@ class DeviceInfo { string description; string id; /* used for user 
preferences, should stay fixed with changing hardware config */ int num; - bool display_device; /* GPU is used as a display device. */ - bool has_half_images; /* Support half-float textures. */ - bool has_nanovdb; /* Support NanoVDB volumes. */ - bool has_volume_decoupled; /* Decoupled volume shading. */ - bool has_branched_path; /* Supports branched path tracing. */ - bool has_adaptive_stop_per_sample; /* Per-sample adaptive sampling stopping. */ - bool has_osl; /* Support Open Shading Language. */ - bool use_split_kernel; /* Use split or mega kernel. */ - bool has_profiling; /* Supports runtime collection of profiling info. */ - bool has_peer_memory; /* GPU has P2P access to memory of another GPU. */ - DenoiserTypeMask denoisers; /* Supported denoiser types. */ + bool display_device; /* GPU is used as a display device. */ + bool has_nanovdb; /* Support NanoVDB volumes. */ + bool has_half_images; /* Support half-float textures. */ + bool has_osl; /* Support Open Shading Language. */ + bool has_profiling; /* Supports runtime collection of profiling info. */ + bool has_peer_memory; /* GPU has P2P access to memory of another GPU. */ + bool has_gpu_queue; /* Device supports GPU queue. */ + DenoiserTypeMask denoisers; /* Supported denoiser types. */ int cpu_threads; vector<DeviceInfo> multi_devices; - vector<DeviceInfo> denoising_devices; string error_msg; DeviceInfo() @@ -100,227 +90,35 @@ class DeviceInfo { display_device = false; has_half_images = false; has_nanovdb = false; - has_volume_decoupled = false; - has_branched_path = true; - has_adaptive_stop_per_sample = false; has_osl = false; - use_split_kernel = false; has_profiling = false; has_peer_memory = false; + has_gpu_queue = false; denoisers = DENOISER_NONE; } - bool operator==(const DeviceInfo &info) + bool operator==(const DeviceInfo &info) const { /* Multiple Devices with the same ID would be very bad. 
*/ assert(id != info.id || (type == info.type && num == info.num && description == info.description)); return id == info.id; } - - /* Add additional devices needed for the specified denoiser. */ - void add_denoising_devices(DenoiserType denoiser_type); -}; - -class DeviceRequestedFeatures { - public: - /* Use experimental feature set. */ - bool experimental; - - /* Selective nodes compilation. */ - - /* Identifier of a node group up to which all the nodes needs to be - * compiled in. Nodes from higher group indices will be ignores. - */ - int max_nodes_group; - - /* Features bitfield indicating which features from the requested group - * will be compiled in. Nodes which corresponds to features which are not - * in this bitfield will be ignored even if they're in the requested group. - */ - int nodes_features; - - /* BVH/sampling kernel features. */ - bool use_hair; - bool use_hair_thick; - bool use_object_motion; - bool use_camera_motion; - - /* Denotes whether baking functionality is needed. */ - bool use_baking; - - /* Use subsurface scattering materials. */ - bool use_subsurface; - - /* Use volume materials. */ - bool use_volume; - - /* Use branched integrator. */ - bool use_integrator_branched; - - /* Use OpenSubdiv patch evaluation */ - bool use_patch_evaluation; - - /* Use Transparent shadows */ - bool use_transparent; - - /* Use various shadow tricks, such as shadow catcher. */ - bool use_shadow_tricks; - - /* Per-uber shader usage flags. */ - bool use_principled; - - /* Denoising features. */ - bool use_denoising; - - /* Use raytracing in shaders. */ - bool use_shader_raytrace; - - /* Use true displacement */ - bool use_true_displacement; - - /* Use background lights */ - bool use_background_light; - - DeviceRequestedFeatures() - { - /* TODO(sergey): Find more meaningful defaults. 
*/ - max_nodes_group = 0; - nodes_features = 0; - use_hair = false; - use_hair_thick = false; - use_object_motion = false; - use_camera_motion = false; - use_baking = false; - use_subsurface = false; - use_volume = false; - use_integrator_branched = false; - use_patch_evaluation = false; - use_transparent = false; - use_shadow_tricks = false; - use_principled = false; - use_denoising = false; - use_shader_raytrace = false; - use_true_displacement = false; - use_background_light = false; - } - - bool modified(const DeviceRequestedFeatures &requested_features) - { - return !(max_nodes_group == requested_features.max_nodes_group && - nodes_features == requested_features.nodes_features && - use_hair == requested_features.use_hair && - use_hair_thick == requested_features.use_hair_thick && - use_object_motion == requested_features.use_object_motion && - use_camera_motion == requested_features.use_camera_motion && - use_baking == requested_features.use_baking && - use_subsurface == requested_features.use_subsurface && - use_volume == requested_features.use_volume && - use_integrator_branched == requested_features.use_integrator_branched && - use_patch_evaluation == requested_features.use_patch_evaluation && - use_transparent == requested_features.use_transparent && - use_shadow_tricks == requested_features.use_shadow_tricks && - use_principled == requested_features.use_principled && - use_denoising == requested_features.use_denoising && - use_shader_raytrace == requested_features.use_shader_raytrace && - use_true_displacement == requested_features.use_true_displacement && - use_background_light == requested_features.use_background_light); - } - - /* Convert the requested features structure to a build options, - * which could then be passed to compilers. 
- */ - string get_build_options() const - { - string build_options = ""; - if (experimental) { - build_options += "-D__KERNEL_EXPERIMENTAL__ "; - } - build_options += "-D__NODES_MAX_GROUP__=" + string_printf("%d", max_nodes_group); - build_options += " -D__NODES_FEATURES__=" + string_printf("%d", nodes_features); - if (!use_hair) { - build_options += " -D__NO_HAIR__"; - } - if (!use_object_motion) { - build_options += " -D__NO_OBJECT_MOTION__"; - } - if (!use_camera_motion) { - build_options += " -D__NO_CAMERA_MOTION__"; - } - if (!use_baking) { - build_options += " -D__NO_BAKING__"; - } - if (!use_volume) { - build_options += " -D__NO_VOLUME__"; - } - if (!use_subsurface) { - build_options += " -D__NO_SUBSURFACE__"; - } - if (!use_integrator_branched) { - build_options += " -D__NO_BRANCHED_PATH__"; - } - if (!use_patch_evaluation) { - build_options += " -D__NO_PATCH_EVAL__"; - } - if (!use_transparent && !use_volume) { - build_options += " -D__NO_TRANSPARENT__"; - } - if (!use_shadow_tricks) { - build_options += " -D__NO_SHADOW_TRICKS__"; - } - if (!use_principled) { - build_options += " -D__NO_PRINCIPLED__"; - } - if (!use_denoising) { - build_options += " -D__NO_DENOISING__"; - } - if (!use_shader_raytrace) { - build_options += " -D__NO_SHADER_RAYTRACE__"; - } - return build_options; - } }; -std::ostream &operator<<(std::ostream &os, const DeviceRequestedFeatures &requested_features); - /* Device */ -struct DeviceDrawParams { - function<void()> bind_display_space_shader_cb; - function<void()> unbind_display_space_shader_cb; -}; - class Device { friend class device_sub_ptr; protected: - enum { - FALLBACK_SHADER_STATUS_NONE = 0, - FALLBACK_SHADER_STATUS_ERROR, - FALLBACK_SHADER_STATUS_SUCCESS, - }; - - Device(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background) - : background(background), - vertex_buffer(0), - fallback_status(FALLBACK_SHADER_STATUS_NONE), - fallback_shader_program(0), - info(info_), - stats(stats_), - profiler(profiler_) + 
Device(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_) + : info(info_), stats(stats_), profiler(profiler_) { } - bool background; string error_msg; - /* used for real time display */ - unsigned int vertex_buffer; - int fallback_status, fallback_shader_program; - int image_texture_location, fullscreen_location; - - bool bind_fallback_display_space_shader(const float width, const float height); - virtual device_ptr mem_alloc_sub_ptr(device_memory & /*mem*/, int /*offset*/, int /*size*/) { /* Only required for devices that implement denoising. */ @@ -361,67 +159,31 @@ class Device { Stats &stats; Profiler &profiler; - /* memory alignment */ - virtual int mem_sub_ptr_alignment() - { - return MIN_ALIGNMENT_CPU_DATA_TYPES; - } - /* constant memory */ virtual void const_copy_to(const char *name, void *host, size_t size) = 0; - /* open shading language, only for CPU device */ - virtual void *osl_memory() - { - return NULL; - } - /* load/compile kernels, must be called before adding tasks */ - virtual bool load_kernels(const DeviceRequestedFeatures & /*requested_features*/) + virtual bool load_kernels(uint /*kernel_features*/) { return true; } - /* Wait for device to become available to upload data and receive tasks - * This method is used by the OpenCL device to load the - * optimized kernels or when not (yet) available load the - * generic kernels (only during foreground rendering) */ - virtual bool wait_for_availability(const DeviceRequestedFeatures & /*requested_features*/) - { - return true; - } - /* Check if there are 'better' kernels available to be used - * We can switch over to these kernels - * This method is used to determine if we can switch the preview kernels - * to regular kernels */ - virtual DeviceKernelStatus get_active_kernel_switch_state() - { - return DEVICE_KERNEL_USING_FEATURE_KERNEL; - } + /* GPU device only functions. + * These may not be used on CPU or multi-devices. 
*/ - /* tasks */ - virtual int get_split_task_count(DeviceTask &) - { - return 1; - } + /* Create new queue for executing kernels in. */ + virtual unique_ptr<DeviceQueue> gpu_queue_create(); + + /* CPU device only functions. + * These may not be used on GPU or multi-devices. */ - virtual void task_add(DeviceTask &task) = 0; - virtual void task_wait() = 0; - virtual void task_cancel() = 0; - - /* opengl drawing */ - virtual void draw_pixels(device_memory &mem, - int y, - int w, - int h, - int width, - int height, - int dx, - int dy, - int dw, - int dh, - bool transparent, - const DeviceDrawParams &draw_params); + /* Get CPU kernel functions for native instruction set. */ + virtual const CPUKernels *get_cpu_kernels() const; + /* Get kernel globals to pass to kernels. */ + virtual void get_cpu_kernel_thread_globals( + vector<CPUKernelThreadGlobals> & /*kernel_thread_globals*/); + /* Get OpenShadingLanguage memory buffer. */ + virtual void *get_cpu_osl_memory(); /* acceleration structure building */ virtual void build_bvh(BVH *bvh, Progress &progress, bool refit); @@ -429,25 +191,11 @@ class Device { /* OptiX specific destructor. */ virtual void release_optix_bvh(BVH * /*bvh*/){}; -#ifdef WITH_NETWORK - /* networking */ - void server_run(); -#endif - /* multi device */ - virtual void map_tile(Device * /*sub_device*/, RenderTile & /*tile*/) - { - } virtual int device_number(Device * /*sub_device*/) { return 0; } - virtual void map_neighbor_tiles(Device * /*sub_device*/, RenderTileNeighbors & /*neighbors*/) - { - } - virtual void unmap_neighbor_tiles(Device * /*sub_device*/, RenderTileNeighbors & /*neighbors*/) - { - } virtual bool is_resident(device_ptr /*key*/, Device *sub_device) { @@ -460,11 +208,47 @@ class Device { return false; } + /* Graphics resources interoperability. + * + * The interoperability comes here by the meaning that the device is capable of computing result + * directly into an OpenGL (or other graphics library) buffer. 
*/ + + /* Check display si to be updated using graphics interoperability. + * The interoperability can not be used is it is not supported by the device. But the device + * might also force disable the interoperability if it detects that it will be slower than + * copying pixels from the render buffer. */ + virtual bool should_use_graphics_interop() + { + return false; + } + + /* Buffer denoising. */ + + /* Returns true if task is fully handled. */ + virtual bool denoise_buffer(const DeviceDenoiseTask & /*task*/) + { + LOG(ERROR) << "Request buffer denoising from a device which does not support it."; + return false; + } + + virtual DeviceQueue *get_denoise_queue() + { + LOG(ERROR) << "Request denoising queue from a device which does not support it."; + return nullptr; + } + + /* Sub-devices */ + + /* Run given callback for every individual device which will be handling rendering. + * For the single device the callback is called for the device itself. For the multi-device the + * callback is only called for the sub-devices. 
*/ + virtual void foreach_device(const function<void(Device *)> &callback) + { + callback(this); + } + /* static */ - static Device *create(DeviceInfo &info, - Stats &stats, - Profiler &profiler, - bool background = true); + static Device *create(const DeviceInfo &info, Stats &stats, Profiler &profiler); static DeviceType type_from_string(const char *name); static string string_from_type(DeviceType type); @@ -499,9 +283,7 @@ class Device { static thread_mutex device_mutex; static vector<DeviceInfo> cuda_devices; static vector<DeviceInfo> optix_devices; - static vector<DeviceInfo> opencl_devices; static vector<DeviceInfo> cpu_devices; - static vector<DeviceInfo> network_devices; static uint devices_initialized_mask; }; diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp deleted file mode 100644 index 4a6e77d6eaa..00000000000 --- a/intern/cycles/device/device_cpu.cpp +++ /dev/null @@ -1,1680 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <stdlib.h> -#include <string.h> - -/* So ImathMath is included before our kernel_cpu_compat. 
*/ -#ifdef WITH_OSL -/* So no context pollution happens from indirectly included windows.h */ -# include "util/util_windows.h" -# include <OSL/oslexec.h> -#endif - -#ifdef WITH_EMBREE -# include <embree3/rtcore.h> -#endif - -#include "device/device.h" -#include "device/device_denoising.h" -#include "device/device_intern.h" -#include "device/device_split_kernel.h" - -// clang-format off -#include "kernel/kernel.h" -#include "kernel/kernel_compat_cpu.h" -#include "kernel/kernel_types.h" -#include "kernel/split/kernel_split_data.h" -#include "kernel/kernel_globals.h" -#include "kernel/kernel_adaptive_sampling.h" - -#include "kernel/filter/filter.h" - -#include "kernel/osl/osl_shader.h" -#include "kernel/osl/osl_globals.h" -// clang-format on - -#include "bvh/bvh_embree.h" - -#include "render/buffers.h" -#include "render/coverage.h" - -#include "util/util_debug.h" -#include "util/util_foreach.h" -#include "util/util_function.h" -#include "util/util_logging.h" -#include "util/util_map.h" -#include "util/util_opengl.h" -#include "util/util_openimagedenoise.h" -#include "util/util_optimization.h" -#include "util/util_progress.h" -#include "util/util_system.h" -#include "util/util_task.h" -#include "util/util_thread.h" - -CCL_NAMESPACE_BEGIN - -class CPUDevice; - -/* Has to be outside of the class to be shared across template instantiations. */ -static const char *logged_architecture = ""; - -template<typename F> class KernelFunctions { - public: - KernelFunctions() - { - kernel = (F)NULL; - } - - KernelFunctions( - F kernel_default, F kernel_sse2, F kernel_sse3, F kernel_sse41, F kernel_avx, F kernel_avx2) - { - const char *architecture_name = "default"; - kernel = kernel_default; - - /* Silence potential warnings about unused variables - * when compiling without some architectures. 
*/ - (void)kernel_sse2; - (void)kernel_sse3; - (void)kernel_sse41; - (void)kernel_avx; - (void)kernel_avx2; -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if (DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) { - architecture_name = "AVX2"; - kernel = kernel_avx2; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if (DebugFlags().cpu.has_avx() && system_cpu_support_avx()) { - architecture_name = "AVX"; - kernel = kernel_avx; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if (DebugFlags().cpu.has_sse41() && system_cpu_support_sse41()) { - architecture_name = "SSE4.1"; - kernel = kernel_sse41; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if (DebugFlags().cpu.has_sse3() && system_cpu_support_sse3()) { - architecture_name = "SSE3"; - kernel = kernel_sse3; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) { - architecture_name = "SSE2"; - kernel = kernel_sse2; - } -#else - { - /* Dummy to prevent the architecture if below become - * conditional when WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - * is not defined. 
*/ - } -#endif - - if (strcmp(architecture_name, logged_architecture) != 0) { - VLOG(1) << "Will be using " << architecture_name << " kernels."; - logged_architecture = architecture_name; - } - } - - inline F operator()() const - { - assert(kernel); - return kernel; - } - - protected: - F kernel; -}; - -class CPUSplitKernel : public DeviceSplitKernel { - CPUDevice *device; - - public: - explicit CPUSplitKernel(CPUDevice *device); - - virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim, - RenderTile &rtile, - int num_global_elements, - device_memory &kernel_globals, - device_memory &kernel_data_, - device_memory &split_data, - device_memory &ray_state, - device_memory &queue_index, - device_memory &use_queues_flag, - device_memory &work_pool_wgs); - - virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name, - const DeviceRequestedFeatures &); - virtual int2 split_kernel_local_size(); - virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask &task); - virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads); -}; - -class CPUDevice : public Device { - public: - TaskPool task_pool; - KernelGlobals kernel_globals; - - device_vector<TextureInfo> texture_info; - bool need_texture_info; - -#ifdef WITH_OSL - OSLGlobals osl_globals; -#endif -#ifdef WITH_OPENIMAGEDENOISE - oidn::DeviceRef oidn_device; - oidn::FilterRef oidn_filter; -#endif - thread_spin_lock oidn_task_lock; -#ifdef WITH_EMBREE - RTCScene embree_scene = NULL; - RTCDevice embree_device; -#endif - - bool use_split_kernel; - - DeviceRequestedFeatures requested_features; - - KernelFunctions<void (*)(KernelGlobals *, float *, int, int, int, int, int)> path_trace_kernel; - KernelFunctions<void (*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> - convert_to_half_float_kernel; - KernelFunctions<void (*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> - 
convert_to_byte_kernel; - KernelFunctions<void (*)(KernelGlobals *, uint4 *, float4 *, int, int, int, int, int)> - shader_kernel; - KernelFunctions<void (*)(KernelGlobals *, float *, int, int, int, int, int)> bake_kernel; - - KernelFunctions<void (*)( - int, TileInfo *, int, int, float *, float *, float *, float *, float *, int *, int, int)> - filter_divide_shadow_kernel; - KernelFunctions<void (*)( - int, TileInfo *, int, int, int, int, float *, float *, float, int *, int, int)> - filter_get_feature_kernel; - KernelFunctions<void (*)(int, int, int, int *, float *, float *, int, int *)> - filter_write_feature_kernel; - KernelFunctions<void (*)(int, int, float *, float *, float *, float *, int *, int)> - filter_detect_outliers_kernel; - KernelFunctions<void (*)(int, int, float *, float *, float *, float *, int *, int)> - filter_combine_halves_kernel; - - KernelFunctions<void (*)( - int, int, float *, float *, float *, float *, int *, int, int, int, float, float)> - filter_nlm_calc_difference_kernel; - KernelFunctions<void (*)(float *, float *, int *, int, int)> filter_nlm_blur_kernel; - KernelFunctions<void (*)(float *, float *, int *, int, int)> filter_nlm_calc_weight_kernel; - KernelFunctions<void (*)( - int, int, float *, float *, float *, float *, float *, int *, int, int, int)> - filter_nlm_update_output_kernel; - KernelFunctions<void (*)(float *, float *, int *, int)> filter_nlm_normalize_kernel; - - KernelFunctions<void (*)( - float *, TileInfo *, int, int, int, float *, int *, int *, int, int, bool, int, float)> - filter_construct_transform_kernel; - KernelFunctions<void (*)(int, - int, - int, - float *, - float *, - float *, - int *, - float *, - float3 *, - int *, - int *, - int, - int, - int, - int, - bool)> - filter_nlm_construct_gramian_kernel; - KernelFunctions<void (*)(int, int, int, float *, int *, float *, float3 *, int *, int)> - filter_finalize_kernel; - - KernelFunctions<void (*)(KernelGlobals *, - ccl_constant KernelData *, - ccl_global void *, 
- int, - ccl_global char *, - int, - int, - int, - int, - int, - int, - int, - int, - ccl_global int *, - int, - ccl_global char *, - ccl_global unsigned int *, - unsigned int, - ccl_global float *)> - data_init_kernel; - unordered_map<string, KernelFunctions<void (*)(KernelGlobals *, KernelData *)>> split_kernels; - -#define KERNEL_FUNCTIONS(name) \ - KERNEL_NAME_EVAL(cpu, name), KERNEL_NAME_EVAL(cpu_sse2, name), \ - KERNEL_NAME_EVAL(cpu_sse3, name), KERNEL_NAME_EVAL(cpu_sse41, name), \ - KERNEL_NAME_EVAL(cpu_avx, name), KERNEL_NAME_EVAL(cpu_avx2, name) - - CPUDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_) - : Device(info_, stats_, profiler_, background_), - texture_info(this, "__texture_info", MEM_GLOBAL), -#define REGISTER_KERNEL(name) name##_kernel(KERNEL_FUNCTIONS(name)) - REGISTER_KERNEL(path_trace), - REGISTER_KERNEL(convert_to_half_float), - REGISTER_KERNEL(convert_to_byte), - REGISTER_KERNEL(shader), - REGISTER_KERNEL(bake), - REGISTER_KERNEL(filter_divide_shadow), - REGISTER_KERNEL(filter_get_feature), - REGISTER_KERNEL(filter_write_feature), - REGISTER_KERNEL(filter_detect_outliers), - REGISTER_KERNEL(filter_combine_halves), - REGISTER_KERNEL(filter_nlm_calc_difference), - REGISTER_KERNEL(filter_nlm_blur), - REGISTER_KERNEL(filter_nlm_calc_weight), - REGISTER_KERNEL(filter_nlm_update_output), - REGISTER_KERNEL(filter_nlm_normalize), - REGISTER_KERNEL(filter_construct_transform), - REGISTER_KERNEL(filter_nlm_construct_gramian), - REGISTER_KERNEL(filter_finalize), - REGISTER_KERNEL(data_init) -#undef REGISTER_KERNEL - { - if (info.cpu_threads == 0) { - info.cpu_threads = TaskScheduler::num_threads(); - } - -#ifdef WITH_OSL - kernel_globals.osl = &osl_globals; -#endif -#ifdef WITH_EMBREE - embree_device = rtcNewDevice("verbose=0"); -#endif - use_split_kernel = DebugFlags().cpu.split_kernel; - if (use_split_kernel) { - VLOG(1) << "Will be using split kernel."; - } - need_texture_info = false; - -#define 
REGISTER_SPLIT_KERNEL(name) \ - split_kernels[#name] = KernelFunctions<void (*)(KernelGlobals *, KernelData *)>( \ - KERNEL_FUNCTIONS(name)) - REGISTER_SPLIT_KERNEL(path_init); - REGISTER_SPLIT_KERNEL(scene_intersect); - REGISTER_SPLIT_KERNEL(lamp_emission); - REGISTER_SPLIT_KERNEL(do_volume); - REGISTER_SPLIT_KERNEL(queue_enqueue); - REGISTER_SPLIT_KERNEL(indirect_background); - REGISTER_SPLIT_KERNEL(shader_setup); - REGISTER_SPLIT_KERNEL(shader_sort); - REGISTER_SPLIT_KERNEL(shader_eval); - REGISTER_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao); - REGISTER_SPLIT_KERNEL(subsurface_scatter); - REGISTER_SPLIT_KERNEL(direct_lighting); - REGISTER_SPLIT_KERNEL(shadow_blocked_ao); - REGISTER_SPLIT_KERNEL(shadow_blocked_dl); - REGISTER_SPLIT_KERNEL(enqueue_inactive); - REGISTER_SPLIT_KERNEL(next_iteration_setup); - REGISTER_SPLIT_KERNEL(indirect_subsurface); - REGISTER_SPLIT_KERNEL(buffer_update); - REGISTER_SPLIT_KERNEL(adaptive_stopping); - REGISTER_SPLIT_KERNEL(adaptive_filter_x); - REGISTER_SPLIT_KERNEL(adaptive_filter_y); - REGISTER_SPLIT_KERNEL(adaptive_adjust_samples); -#undef REGISTER_SPLIT_KERNEL -#undef KERNEL_FUNCTIONS - } - - ~CPUDevice() - { -#ifdef WITH_EMBREE - rtcReleaseDevice(embree_device); -#endif - task_pool.cancel(); - texture_info.free(); - } - - virtual bool show_samples() const override - { - return (info.cpu_threads == 1); - } - - virtual BVHLayoutMask get_bvh_layout_mask() const override - { - BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2; -#ifdef WITH_EMBREE - bvh_layout_mask |= BVH_LAYOUT_EMBREE; -#endif /* WITH_EMBREE */ - return bvh_layout_mask; - } - - void load_texture_info() - { - if (need_texture_info) { - texture_info.copy_to_device(); - need_texture_info = false; - } - } - - virtual void mem_alloc(device_memory &mem) override - { - if (mem.type == MEM_TEXTURE) { - assert(!"mem_alloc not supported for textures."); - } - else if (mem.type == MEM_GLOBAL) { - assert(!"mem_alloc not supported for global memory."); - } - else 
{ - if (mem.name) { - VLOG(1) << "Buffer allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - } - - if (mem.type == MEM_DEVICE_ONLY || !mem.host_pointer) { - size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES; - void *data = util_aligned_malloc(mem.memory_size(), alignment); - mem.device_pointer = (device_ptr)data; - } - else { - mem.device_pointer = (device_ptr)mem.host_pointer; - } - - mem.device_size = mem.memory_size(); - stats.mem_alloc(mem.device_size); - } - } - - virtual void mem_copy_to(device_memory &mem) override - { - if (mem.type == MEM_GLOBAL) { - global_free(mem); - global_alloc(mem); - } - else if (mem.type == MEM_TEXTURE) { - tex_free((device_texture &)mem); - tex_alloc((device_texture &)mem); - } - else if (mem.type == MEM_PIXELS) { - assert(!"mem_copy_to not supported for pixels."); - } - else { - if (!mem.device_pointer) { - mem_alloc(mem); - } - - /* copy is no-op */ - } - } - - virtual void mem_copy_from( - device_memory & /*mem*/, int /*y*/, int /*w*/, int /*h*/, int /*elem*/) override - { - /* no-op */ - } - - virtual void mem_zero(device_memory &mem) override - { - if (!mem.device_pointer) { - mem_alloc(mem); - } - - if (mem.device_pointer) { - memset((void *)mem.device_pointer, 0, mem.memory_size()); - } - } - - virtual void mem_free(device_memory &mem) override - { - if (mem.type == MEM_GLOBAL) { - global_free(mem); - } - else if (mem.type == MEM_TEXTURE) { - tex_free((device_texture &)mem); - } - else if (mem.device_pointer) { - if (mem.type == MEM_DEVICE_ONLY || !mem.host_pointer) { - util_aligned_free((void *)mem.device_pointer); - } - mem.device_pointer = 0; - stats.mem_free(mem.device_size); - mem.device_size = 0; - } - } - - virtual device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override - { - return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset)); - } - - virtual void 
const_copy_to(const char *name, void *host, size_t size) override - { -#if WITH_EMBREE - if (strcmp(name, "__data") == 0) { - assert(size <= sizeof(KernelData)); - - // Update scene handle (since it is different for each device on multi devices) - KernelData *const data = (KernelData *)host; - data->bvh.scene = embree_scene; - } -#endif - kernel_const_copy(&kernel_globals, name, host, size); - } - - void global_alloc(device_memory &mem) - { - VLOG(1) << "Global memory allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - - kernel_global_memory_copy(&kernel_globals, mem.name, mem.host_pointer, mem.data_size); - - mem.device_pointer = (device_ptr)mem.host_pointer; - mem.device_size = mem.memory_size(); - stats.mem_alloc(mem.device_size); - } - - void global_free(device_memory &mem) - { - if (mem.device_pointer) { - mem.device_pointer = 0; - stats.mem_free(mem.device_size); - mem.device_size = 0; - } - } - - void tex_alloc(device_texture &mem) - { - VLOG(1) << "Texture allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - - mem.device_pointer = (device_ptr)mem.host_pointer; - mem.device_size = mem.memory_size(); - stats.mem_alloc(mem.device_size); - - const uint slot = mem.slot; - if (slot >= texture_info.size()) { - /* Allocate some slots in advance, to reduce amount of re-allocations. 
*/ - texture_info.resize(slot + 128); - } - - texture_info[slot] = mem.info; - texture_info[slot].data = (uint64_t)mem.host_pointer; - need_texture_info = true; - } - - void tex_free(device_texture &mem) - { - if (mem.device_pointer) { - mem.device_pointer = 0; - stats.mem_free(mem.device_size); - mem.device_size = 0; - need_texture_info = true; - } - } - - virtual void *osl_memory() override - { -#ifdef WITH_OSL - return &osl_globals; -#else - return NULL; -#endif - } - - void build_bvh(BVH *bvh, Progress &progress, bool refit) override - { -#ifdef WITH_EMBREE - if (bvh->params.bvh_layout == BVH_LAYOUT_EMBREE || - bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE) { - BVHEmbree *const bvh_embree = static_cast<BVHEmbree *>(bvh); - if (refit) { - bvh_embree->refit(progress); - } - else { - bvh_embree->build(progress, &stats, embree_device); - } - - if (bvh->params.top_level) { - embree_scene = bvh_embree->scene; - } - } - else -#endif - Device::build_bvh(bvh, progress, refit); - } - - void thread_run(DeviceTask &task) - { - if (task.type == DeviceTask::RENDER) - thread_render(task); - else if (task.type == DeviceTask::SHADER) - thread_shader(task); - else if (task.type == DeviceTask::FILM_CONVERT) - thread_film_convert(task); - else if (task.type == DeviceTask::DENOISE_BUFFER) - thread_denoise(task); - } - - bool denoising_non_local_means(device_ptr image_ptr, - device_ptr guide_ptr, - device_ptr variance_ptr, - device_ptr out_ptr, - DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_NON_LOCAL_MEANS); - - int4 rect = task->rect; - int r = task->nlm_state.r; - int f = task->nlm_state.f; - float a = task->nlm_state.a; - float k_2 = task->nlm_state.k_2; - - int w = align_up(rect.z - rect.x, 4); - int h = rect.w - rect.y; - int stride = task->buffer.stride; - int channel_offset = task->nlm_state.is_color ? 
task->buffer.pass_stride : 0; - - float *temporary_mem = (float *)task->buffer.temporary_mem.device_pointer; - float *blurDifference = temporary_mem; - float *difference = temporary_mem + task->buffer.pass_stride; - float *weightAccum = temporary_mem + 2 * task->buffer.pass_stride; - - memset(weightAccum, 0, sizeof(float) * w * h); - memset((float *)out_ptr, 0, sizeof(float) * w * h); - - for (int i = 0; i < (2 * r + 1) * (2 * r + 1); i++) { - int dy = i / (2 * r + 1) - r; - int dx = i % (2 * r + 1) - r; - - int local_rect[4] = { - max(0, -dx), max(0, -dy), rect.z - rect.x - max(0, dx), rect.w - rect.y - max(0, dy)}; - filter_nlm_calc_difference_kernel()(dx, - dy, - (float *)guide_ptr, - (float *)variance_ptr, - NULL, - difference, - local_rect, - w, - channel_offset, - 0, - a, - k_2); - - filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f); - filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f); - filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f); - - filter_nlm_update_output_kernel()(dx, - dy, - blurDifference, - (float *)image_ptr, - difference, - (float *)out_ptr, - weightAccum, - local_rect, - channel_offset, - stride, - f); - } - - int local_rect[4] = {0, 0, rect.z - rect.x, rect.w - rect.y}; - filter_nlm_normalize_kernel()((float *)out_ptr, weightAccum, local_rect, w); - - return true; - } - - bool denoising_construct_transform(DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_CONSTRUCT_TRANSFORM); - - for (int y = 0; y < task->filter_area.w; y++) { - for (int x = 0; x < task->filter_area.z; x++) { - filter_construct_transform_kernel()((float *)task->buffer.mem.device_pointer, - task->tile_info, - x + task->filter_area.x, - y + task->filter_area.y, - y * task->filter_area.z + x, - (float *)task->storage.transform.device_pointer, - (int *)task->storage.rank.device_pointer, - &task->rect.x, - task->buffer.pass_stride, - task->buffer.frame_stride, - 
task->buffer.use_time, - task->radius, - task->pca_threshold); - } - } - return true; - } - - bool denoising_accumulate(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr scale_ptr, - int frame, - DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_RECONSTRUCT); - - float *temporary_mem = (float *)task->buffer.temporary_mem.device_pointer; - float *difference = temporary_mem; - float *blurDifference = temporary_mem + task->buffer.pass_stride; - - int r = task->radius; - int frame_offset = frame * task->buffer.frame_stride; - for (int i = 0; i < (2 * r + 1) * (2 * r + 1); i++) { - int dy = i / (2 * r + 1) - r; - int dx = i % (2 * r + 1) - r; - - int local_rect[4] = {max(0, -dx), - max(0, -dy), - task->reconstruction_state.source_w - max(0, dx), - task->reconstruction_state.source_h - max(0, dy)}; - filter_nlm_calc_difference_kernel()(dx, - dy, - (float *)color_ptr, - (float *)color_variance_ptr, - (float *)scale_ptr, - difference, - local_rect, - task->buffer.stride, - task->buffer.pass_stride, - frame_offset, - 1.0f, - task->nlm_k_2); - filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4); - filter_nlm_calc_weight_kernel()( - blurDifference, difference, local_rect, task->buffer.stride, 4); - filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4); - filter_nlm_construct_gramian_kernel()(dx, - dy, - task->tile_info->frames[frame], - blurDifference, - (float *)task->buffer.mem.device_pointer, - (float *)task->storage.transform.device_pointer, - (int *)task->storage.rank.device_pointer, - (float *)task->storage.XtWX.device_pointer, - (float3 *)task->storage.XtWY.device_pointer, - local_rect, - &task->reconstruction_state.filter_window.x, - task->buffer.stride, - 4, - task->buffer.pass_stride, - frame_offset, - task->buffer.use_time); - } - - return true; - } - - bool denoising_solve(device_ptr output_ptr, DenoisingTask *task) - { - for (int y = 0; 
y < task->filter_area.w; y++) { - for (int x = 0; x < task->filter_area.z; x++) { - filter_finalize_kernel()(x, - y, - y * task->filter_area.z + x, - (float *)output_ptr, - (int *)task->storage.rank.device_pointer, - (float *)task->storage.XtWX.device_pointer, - (float3 *)task->storage.XtWY.device_pointer, - &task->reconstruction_state.buffer_params.x, - task->render_buffer.samples); - } - } - return true; - } - - bool denoising_combine_halves(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr mean_ptr, - device_ptr variance_ptr, - int r, - int4 rect, - DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_COMBINE_HALVES); - - for (int y = rect.y; y < rect.w; y++) { - for (int x = rect.x; x < rect.z; x++) { - filter_combine_halves_kernel()(x, - y, - (float *)mean_ptr, - (float *)variance_ptr, - (float *)a_ptr, - (float *)b_ptr, - &rect.x, - r); - } - } - return true; - } - - bool denoising_divide_shadow(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr sample_variance_ptr, - device_ptr sv_variance_ptr, - device_ptr buffer_variance_ptr, - DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DIVIDE_SHADOW); - - for (int y = task->rect.y; y < task->rect.w; y++) { - for (int x = task->rect.x; x < task->rect.z; x++) { - filter_divide_shadow_kernel()(task->render_buffer.samples, - task->tile_info, - x, - y, - (float *)a_ptr, - (float *)b_ptr, - (float *)sample_variance_ptr, - (float *)sv_variance_ptr, - (float *)buffer_variance_ptr, - &task->rect.x, - task->render_buffer.pass_stride, - task->render_buffer.offset); - } - } - return true; - } - - bool denoising_get_feature(int mean_offset, - int variance_offset, - device_ptr mean_ptr, - device_ptr variance_ptr, - float scale, - DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_GET_FEATURE); - - for (int y = task->rect.y; y < task->rect.w; y++) { - for (int x = task->rect.x; x < task->rect.z; x++) { - 
filter_get_feature_kernel()(task->render_buffer.samples, - task->tile_info, - mean_offset, - variance_offset, - x, - y, - (float *)mean_ptr, - (float *)variance_ptr, - scale, - &task->rect.x, - task->render_buffer.pass_stride, - task->render_buffer.offset); - } - } - return true; - } - - bool denoising_write_feature(int out_offset, - device_ptr from_ptr, - device_ptr buffer_ptr, - DenoisingTask *task) - { - for (int y = 0; y < task->filter_area.w; y++) { - for (int x = 0; x < task->filter_area.z; x++) { - filter_write_feature_kernel()(task->render_buffer.samples, - x + task->filter_area.x, - y + task->filter_area.y, - &task->reconstruction_state.buffer_params.x, - (float *)from_ptr, - (float *)buffer_ptr, - out_offset, - &task->rect.x); - } - } - return true; - } - - bool denoising_detect_outliers(device_ptr image_ptr, - device_ptr variance_ptr, - device_ptr depth_ptr, - device_ptr output_ptr, - DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DETECT_OUTLIERS); - - for (int y = task->rect.y; y < task->rect.w; y++) { - for (int x = task->rect.x; x < task->rect.z; x++) { - filter_detect_outliers_kernel()(x, - y, - (float *)image_ptr, - (float *)variance_ptr, - (float *)depth_ptr, - (float *)output_ptr, - &task->rect.x, - task->buffer.pass_stride); - } - } - return true; - } - - bool adaptive_sampling_filter(KernelGlobals *kg, RenderTile &tile, int sample) - { - WorkTile wtile; - wtile.x = tile.x; - wtile.y = tile.y; - wtile.w = tile.w; - wtile.h = tile.h; - wtile.offset = tile.offset; - wtile.stride = tile.stride; - wtile.buffer = (float *)tile.buffer; - - /* For CPU we do adaptive stopping per sample so we can stop earlier, but - * for combined CPU + GPU rendering we match the GPU and do it per tile - * after a given number of sample steps. 
*/ - if (!kernel_data.integrator.adaptive_stop_per_sample) { - for (int y = wtile.y; y < wtile.y + wtile.h; ++y) { - for (int x = wtile.x; x < wtile.x + wtile.w; ++x) { - const int index = wtile.offset + x + y * wtile.stride; - float *buffer = wtile.buffer + index * kernel_data.film.pass_stride; - kernel_do_adaptive_stopping(kg, buffer, sample); - } - } - } - - bool any = false; - for (int y = wtile.y; y < wtile.y + wtile.h; ++y) { - any |= kernel_do_adaptive_filter_x(kg, y, &wtile); - } - for (int x = wtile.x; x < wtile.x + wtile.w; ++x) { - any |= kernel_do_adaptive_filter_y(kg, x, &wtile); - } - return (!any); - } - - void adaptive_sampling_post(const RenderTile &tile, KernelGlobals *kg) - { - float *render_buffer = (float *)tile.buffer; - for (int y = tile.y; y < tile.y + tile.h; y++) { - for (int x = tile.x; x < tile.x + tile.w; x++) { - int index = tile.offset + x + y * tile.stride; - ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride; - if (buffer[kernel_data.film.pass_sample_count] < 0.0f) { - buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count]; - float sample_multiplier = tile.sample / buffer[kernel_data.film.pass_sample_count]; - if (sample_multiplier != 1.0f) { - kernel_adaptive_post_adjust(kg, buffer, sample_multiplier); - } - } - else { - kernel_adaptive_post_adjust(kg, buffer, tile.sample / (tile.sample - 1.0f)); - } - } - } - } - - void render(DeviceTask &task, RenderTile &tile, KernelGlobals *kg) - { - const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE; - - scoped_timer timer(&tile.buffers->render_time); - - Coverage coverage(kg, tile); - if (use_coverage) { - coverage.init_path_trace(); - } - - float *render_buffer = (float *)tile.buffer; - int start_sample = tile.start_sample; - int end_sample = tile.start_sample + tile.num_samples; - - /* Needed for Embree. 
*/ - SIMD_SET_FLUSH_TO_ZERO; - - for (int sample = start_sample; sample < end_sample; sample++) { - if (task.get_cancel() || TaskPool::canceled()) { - if (task.need_finish_queue == false) - break; - } - - if (tile.stealing_state == RenderTile::CAN_BE_STOLEN && task.get_tile_stolen()) { - tile.stealing_state = RenderTile::WAS_STOLEN; - break; - } - - if (tile.task == RenderTile::PATH_TRACE) { - for (int y = tile.y; y < tile.y + tile.h; y++) { - for (int x = tile.x; x < tile.x + tile.w; x++) { - if (use_coverage) { - coverage.init_pixel(x, y); - } - path_trace_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride); - } - } - } - else { - for (int y = tile.y; y < tile.y + tile.h; y++) { - for (int x = tile.x; x < tile.x + tile.w; x++) { - bake_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride); - } - } - } - tile.sample = sample + 1; - - if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(sample)) { - const bool stop = adaptive_sampling_filter(kg, tile, sample); - if (stop) { - const int num_progress_samples = end_sample - sample; - tile.sample = end_sample; - task.update_progress(&tile, tile.w * tile.h * num_progress_samples); - break; - } - } - - task.update_progress(&tile, tile.w * tile.h); - } - if (use_coverage) { - coverage.finalize(); - } - - if (task.adaptive_sampling.use && (tile.stealing_state != RenderTile::WAS_STOLEN)) { - adaptive_sampling_post(tile, kg); - } - } - - void denoise_openimagedenoise_buffer(DeviceTask &task, - float *buffer, - const size_t offset, - const size_t stride, - const size_t x, - const size_t y, - const size_t w, - const size_t h, - const float scale) - { -#ifdef WITH_OPENIMAGEDENOISE - assert(openimagedenoise_supported()); - - /* Only one at a time, since OpenImageDenoise itself is multithreaded for full - * buffers, and for tiled rendering because creating multiple devices and filters - * is slow and memory hungry as well. 
- * - * TODO: optimize tiled rendering case, by batching together denoising of many - * tiles somehow? */ - static thread_mutex mutex; - thread_scoped_lock lock(mutex); - - /* Create device and filter, cached for reuse. */ - if (!oidn_device) { - oidn_device = oidn::newDevice(); - oidn_device.commit(); - } - if (!oidn_filter) { - oidn_filter = oidn_device.newFilter("RT"); - oidn_filter.set("hdr", true); - oidn_filter.set("srgb", false); - } - - /* Set images with appropriate stride for our interleaved pass storage. */ - struct { - const char *name; - const int offset; - const bool scale; - const bool use; - array<float> scaled_buffer; - } passes[] = {{"color", task.pass_denoising_data + DENOISING_PASS_COLOR, false, true}, - {"albedo", - task.pass_denoising_data + DENOISING_PASS_ALBEDO, - true, - task.denoising.input_passes >= DENOISER_INPUT_RGB_ALBEDO}, - {"normal", - task.pass_denoising_data + DENOISING_PASS_NORMAL, - true, - task.denoising.input_passes >= DENOISER_INPUT_RGB_ALBEDO_NORMAL}, - {"output", 0, false, true}, - { NULL, - 0 }}; - - for (int i = 0; passes[i].name; i++) { - if (!passes[i].use) { - continue; - } - - const int64_t pixel_offset = offset + x + y * stride; - const int64_t buffer_offset = (pixel_offset * task.pass_stride + passes[i].offset); - const int64_t pixel_stride = task.pass_stride; - const int64_t row_stride = stride * pixel_stride; - - if (passes[i].scale && scale != 1.0f) { - /* Normalize albedo and normal passes as they are scaled by the number of samples. - * For the color passes OIDN will perform auto-exposure making it unnecessary. 
*/ - array<float> &scaled_buffer = passes[i].scaled_buffer; - scaled_buffer.resize(w * h * 3); - - for (int y = 0; y < h; y++) { - const float *pass_row = buffer + buffer_offset + y * row_stride; - float *scaled_row = scaled_buffer.data() + y * w * 3; - - for (int x = 0; x < w; x++) { - scaled_row[x * 3 + 0] = pass_row[x * pixel_stride + 0] * scale; - scaled_row[x * 3 + 1] = pass_row[x * pixel_stride + 1] * scale; - scaled_row[x * 3 + 2] = pass_row[x * pixel_stride + 2] * scale; - } - } - - oidn_filter.setImage( - passes[i].name, scaled_buffer.data(), oidn::Format::Float3, w, h, 0, 0, 0); - } - else { - oidn_filter.setImage(passes[i].name, - buffer + buffer_offset, - oidn::Format::Float3, - w, - h, - 0, - pixel_stride * sizeof(float), - row_stride * sizeof(float)); - } - } - - /* Execute filter. */ - oidn_filter.commit(); - oidn_filter.execute(); -#else - (void)task; - (void)buffer; - (void)offset; - (void)stride; - (void)x; - (void)y; - (void)w; - (void)h; - (void)scale; -#endif - } - - void denoise_openimagedenoise(DeviceTask &task, RenderTile &rtile) - { - if (task.type == DeviceTask::DENOISE_BUFFER) { - /* Copy pixels from compute device to CPU (no-op for CPU device). */ - rtile.buffers->buffer.copy_from_device(); - - denoise_openimagedenoise_buffer(task, - (float *)rtile.buffer, - rtile.offset, - rtile.stride, - rtile.x, - rtile.y, - rtile.w, - rtile.h, - 1.0f / rtile.sample); - - /* todo: it may be possible to avoid this copy, but we have to ensure that - * when other code copies data from the device it doesn't overwrite the - * denoiser buffers. */ - rtile.buffers->buffer.copy_to_device(); - } - else { - /* Per-tile denoising. */ - rtile.sample = rtile.start_sample + rtile.num_samples; - const float scale = 1.0f / rtile.sample; - const float invscale = rtile.sample; - const size_t pass_stride = task.pass_stride; - - /* Map neighboring tiles into one buffer for denoising. 
*/ - RenderTileNeighbors neighbors(rtile); - task.map_neighbor_tiles(neighbors, this); - RenderTile ¢er_tile = neighbors.tiles[RenderTileNeighbors::CENTER]; - rtile = center_tile; - - /* Calculate size of the tile to denoise (including overlap). The overlap - * size was chosen empirically. OpenImageDenoise specifies an overlap size - * of 128 but this is significantly bigger than typical tile size. */ - const int4 rect = rect_clip(rect_expand(center_tile.bounds(), 64), neighbors.bounds()); - const int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y); - - /* Adjacent tiles are in separate memory regions, copy into single buffer. */ - array<float> merged(rect_size.x * rect_size.y * task.pass_stride); - - for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { - RenderTile &ntile = neighbors.tiles[i]; - if (!ntile.buffer) { - continue; - } - - const int xmin = max(ntile.x, rect.x); - const int ymin = max(ntile.y, rect.y); - const int xmax = min(ntile.x + ntile.w, rect.z); - const int ymax = min(ntile.y + ntile.h, rect.w); - - const size_t tile_offset = ntile.offset + xmin + ymin * ntile.stride; - const float *tile_buffer = (float *)ntile.buffer + tile_offset * pass_stride; - - const size_t merged_stride = rect_size.x; - const size_t merged_offset = (xmin - rect.x) + (ymin - rect.y) * merged_stride; - float *merged_buffer = merged.data() + merged_offset * pass_stride; - - for (int y = ymin; y < ymax; y++) { - for (int x = 0; x < pass_stride * (xmax - xmin); x++) { - merged_buffer[x] = tile_buffer[x] * scale; - } - tile_buffer += ntile.stride * pass_stride; - merged_buffer += merged_stride * pass_stride; - } - } - - /* Denoise */ - denoise_openimagedenoise_buffer( - task, merged.data(), 0, rect_size.x, 0, 0, rect_size.x, rect_size.y, 1.0f); - - /* Copy back result from merged buffer. 
*/ - RenderTile &ntile = neighbors.target; - if (ntile.buffer) { - const int xmin = max(ntile.x, rect.x); - const int ymin = max(ntile.y, rect.y); - const int xmax = min(ntile.x + ntile.w, rect.z); - const int ymax = min(ntile.y + ntile.h, rect.w); - - const size_t tile_offset = ntile.offset + xmin + ymin * ntile.stride; - float *tile_buffer = (float *)ntile.buffer + tile_offset * pass_stride; - - const size_t merged_stride = rect_size.x; - const size_t merged_offset = (xmin - rect.x) + (ymin - rect.y) * merged_stride; - const float *merged_buffer = merged.data() + merged_offset * pass_stride; - - for (int y = ymin; y < ymax; y++) { - for (int x = 0; x < pass_stride * (xmax - xmin); x += pass_stride) { - tile_buffer[x + 0] = merged_buffer[x + 0] * invscale; - tile_buffer[x + 1] = merged_buffer[x + 1] * invscale; - tile_buffer[x + 2] = merged_buffer[x + 2] * invscale; - } - tile_buffer += ntile.stride * pass_stride; - merged_buffer += merged_stride * pass_stride; - } - } - - task.unmap_neighbor_tiles(neighbors, this); - } - } - - void denoise_nlm(DenoisingTask &denoising, RenderTile &tile) - { - ProfilingHelper profiling(denoising.profiler, PROFILING_DENOISING); - - tile.sample = tile.start_sample + tile.num_samples; - - denoising.functions.construct_transform = function_bind( - &CPUDevice::denoising_construct_transform, this, &denoising); - denoising.functions.accumulate = function_bind( - &CPUDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); - denoising.functions.solve = function_bind(&CPUDevice::denoising_solve, this, _1, &denoising); - denoising.functions.divide_shadow = function_bind( - &CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.non_local_means = function_bind( - &CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); - denoising.functions.combine_halves = function_bind( - &CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); - 
denoising.functions.get_feature = function_bind( - &CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.write_feature = function_bind( - &CPUDevice::denoising_write_feature, this, _1, _2, _3, &denoising); - denoising.functions.detect_outliers = function_bind( - &CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); - - denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h); - denoising.render_buffer.samples = tile.sample; - denoising.buffer.gpu_temporary_mem = false; - - denoising.run_denoising(tile); - } - - void thread_render(DeviceTask &task) - { - if (TaskPool::canceled()) { - if (task.need_finish_queue == false) - return; - } - - /* allocate buffer for kernel globals */ - device_only_memory<KernelGlobals> kgbuffer(this, "kernel_globals"); - kgbuffer.alloc_to_device(1); - - KernelGlobals *kg = new ((void *)kgbuffer.device_pointer) - KernelGlobals(thread_kernel_globals_init()); - - profiler.add_state(&kg->profiler); - - CPUSplitKernel *split_kernel = NULL; - if (use_split_kernel) { - split_kernel = new CPUSplitKernel(this); - if (!split_kernel->load_kernels(requested_features)) { - thread_kernel_globals_free((KernelGlobals *)kgbuffer.device_pointer); - kgbuffer.free(); - delete split_kernel; - return; - } - } - - /* NLM denoiser. */ - DenoisingTask *denoising = NULL; - - /* OpenImageDenoise: we can only denoise with one thread at a time, so to - * avoid waiting with mutex locks in the denoiser, we let only a single - * thread acquire denoising tiles. 
*/ - uint tile_types = task.tile_types; - bool hold_denoise_lock = false; - if ((tile_types & RenderTile::DENOISE) && task.denoising.type == DENOISER_OPENIMAGEDENOISE) { - if (!oidn_task_lock.try_lock()) { - tile_types &= ~RenderTile::DENOISE; - hold_denoise_lock = true; - } - } - - RenderTile tile; - while (task.acquire_tile(this, tile, tile_types)) { - if (tile.task == RenderTile::PATH_TRACE) { - if (use_split_kernel) { - device_only_memory<uchar> void_buffer(this, "void_buffer"); - split_kernel->path_trace(task, tile, kgbuffer, void_buffer); - } - else { - render(task, tile, kg); - } - } - else if (tile.task == RenderTile::BAKE) { - render(task, tile, kg); - } - else if (tile.task == RenderTile::DENOISE) { - if (task.denoising.type == DENOISER_OPENIMAGEDENOISE) { - denoise_openimagedenoise(task, tile); - } - else if (task.denoising.type == DENOISER_NLM) { - if (denoising == NULL) { - denoising = new DenoisingTask(this, task); - denoising->profiler = &kg->profiler; - } - denoise_nlm(*denoising, tile); - } - task.update_progress(&tile, tile.w * tile.h); - } - - task.release_tile(tile); - - if (TaskPool::canceled()) { - if (task.need_finish_queue == false) - break; - } - } - - if (hold_denoise_lock) { - oidn_task_lock.unlock(); - } - - profiler.remove_state(&kg->profiler); - - thread_kernel_globals_free((KernelGlobals *)kgbuffer.device_pointer); - kg->~KernelGlobals(); - kgbuffer.free(); - delete split_kernel; - delete denoising; - } - - void thread_denoise(DeviceTask &task) - { - RenderTile tile; - tile.x = task.x; - tile.y = task.y; - tile.w = task.w; - tile.h = task.h; - tile.buffer = task.buffer; - tile.sample = task.sample + task.num_samples; - tile.num_samples = task.num_samples; - tile.start_sample = task.sample; - tile.offset = task.offset; - tile.stride = task.stride; - tile.buffers = task.buffers; - - if (task.denoising.type == DENOISER_OPENIMAGEDENOISE) { - denoise_openimagedenoise(task, tile); - } - else { - DenoisingTask denoising(this, task); - - 
ProfilingState denoising_profiler_state; - profiler.add_state(&denoising_profiler_state); - denoising.profiler = &denoising_profiler_state; - - denoise_nlm(denoising, tile); - - profiler.remove_state(&denoising_profiler_state); - } - - task.update_progress(&tile, tile.w * tile.h); - } - - void thread_film_convert(DeviceTask &task) - { - float sample_scale = 1.0f / (task.sample + 1); - - if (task.rgba_half) { - for (int y = task.y; y < task.y + task.h; y++) - for (int x = task.x; x < task.x + task.w; x++) - convert_to_half_float_kernel()(&kernel_globals, - (uchar4 *)task.rgba_half, - (float *)task.buffer, - sample_scale, - x, - y, - task.offset, - task.stride); - } - else { - for (int y = task.y; y < task.y + task.h; y++) - for (int x = task.x; x < task.x + task.w; x++) - convert_to_byte_kernel()(&kernel_globals, - (uchar4 *)task.rgba_byte, - (float *)task.buffer, - sample_scale, - x, - y, - task.offset, - task.stride); - } - } - - void thread_shader(DeviceTask &task) - { - KernelGlobals *kg = new KernelGlobals(thread_kernel_globals_init()); - - for (int sample = 0; sample < task.num_samples; sample++) { - for (int x = task.shader_x; x < task.shader_x + task.shader_w; x++) - shader_kernel()(kg, - (uint4 *)task.shader_input, - (float4 *)task.shader_output, - task.shader_eval_type, - task.shader_filter, - x, - task.offset, - sample); - - if (task.get_cancel() || TaskPool::canceled()) - break; - - task.update_progress(NULL); - } - - thread_kernel_globals_free(kg); - delete kg; - } - - virtual int get_split_task_count(DeviceTask &task) override - { - if (task.type == DeviceTask::SHADER) - return task.get_subtask_count(info.cpu_threads, 256); - else - return task.get_subtask_count(info.cpu_threads); - } - - virtual void task_add(DeviceTask &task) override - { - /* Load texture info. 
*/ - load_texture_info(); - - /* split task into smaller ones */ - list<DeviceTask> tasks; - - if (task.type == DeviceTask::DENOISE_BUFFER && - task.denoising.type == DENOISER_OPENIMAGEDENOISE) { - /* Denoise entire buffer at once with OIDN, it has own threading. */ - tasks.push_back(task); - } - else if (task.type == DeviceTask::SHADER) { - task.split(tasks, info.cpu_threads, 256); - } - else { - task.split(tasks, info.cpu_threads); - } - - foreach (DeviceTask &task, tasks) { - task_pool.push([=] { - DeviceTask task_copy = task; - thread_run(task_copy); - }); - } - } - - virtual void task_wait() override - { - task_pool.wait_work(); - } - - virtual void task_cancel() override - { - task_pool.cancel(); - } - - protected: - inline KernelGlobals thread_kernel_globals_init() - { - KernelGlobals kg = kernel_globals; - kg.transparent_shadow_intersections = NULL; - const int decoupled_count = sizeof(kg.decoupled_volume_steps) / - sizeof(*kg.decoupled_volume_steps); - for (int i = 0; i < decoupled_count; ++i) { - kg.decoupled_volume_steps[i] = NULL; - } - kg.decoupled_volume_steps_index = 0; - kg.coverage_asset = kg.coverage_object = kg.coverage_material = NULL; -#ifdef WITH_OSL - OSLShader::thread_init(&kg, &kernel_globals, &osl_globals); -#endif - return kg; - } - - inline void thread_kernel_globals_free(KernelGlobals *kg) - { - if (kg == NULL) { - return; - } - - if (kg->transparent_shadow_intersections != NULL) { - free(kg->transparent_shadow_intersections); - } - const int decoupled_count = sizeof(kg->decoupled_volume_steps) / - sizeof(*kg->decoupled_volume_steps); - for (int i = 0; i < decoupled_count; ++i) { - if (kg->decoupled_volume_steps[i] != NULL) { - free(kg->decoupled_volume_steps[i]); - } - } -#ifdef WITH_OSL - OSLShader::thread_free(kg); -#endif - } - - virtual bool load_kernels(const DeviceRequestedFeatures &requested_features_) override - { - requested_features = requested_features_; - - return true; - } -}; - -/* split kernel */ - -class 
CPUSplitKernelFunction : public SplitKernelFunction { - public: - CPUDevice *device; - void (*func)(KernelGlobals *kg, KernelData *data); - - CPUSplitKernelFunction(CPUDevice *device) : device(device), func(NULL) - { - } - ~CPUSplitKernelFunction() - { - } - - virtual bool enqueue(const KernelDimensions &dim, - device_memory &kernel_globals, - device_memory &data) - { - if (!func) { - return false; - } - - KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer; - kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]); - - for (int y = 0; y < dim.global_size[1]; y++) { - for (int x = 0; x < dim.global_size[0]; x++) { - kg->global_id = make_int2(x, y); - - func(kg, (KernelData *)data.device_pointer); - } - } - - return true; - } -}; - -CPUSplitKernel::CPUSplitKernel(CPUDevice *device) : DeviceSplitKernel(device), device(device) -{ -} - -bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim, - RenderTile &rtile, - int num_global_elements, - device_memory &kernel_globals, - device_memory &data, - device_memory &split_data, - device_memory &ray_state, - device_memory &queue_index, - device_memory &use_queues_flags, - device_memory &work_pool_wgs) -{ - KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer; - kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]); - - for (int y = 0; y < dim.global_size[1]; y++) { - for (int x = 0; x < dim.global_size[0]; x++) { - kg->global_id = make_int2(x, y); - - device->data_init_kernel()((KernelGlobals *)kernel_globals.device_pointer, - (KernelData *)data.device_pointer, - (void *)split_data.device_pointer, - num_global_elements, - (char *)ray_state.device_pointer, - rtile.start_sample, - rtile.start_sample + rtile.num_samples, - rtile.x, - rtile.y, - rtile.w, - rtile.h, - rtile.offset, - rtile.stride, - (int *)queue_index.device_pointer, - dim.global_size[0] * dim.global_size[1], - (char *)use_queues_flags.device_pointer, - (uint 
*)work_pool_wgs.device_pointer, - rtile.num_samples, - (float *)rtile.buffer); - } - } - - return true; -} - -SplitKernelFunction *CPUSplitKernel::get_split_kernel_function(const string &kernel_name, - const DeviceRequestedFeatures &) -{ - CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device); - - kernel->func = device->split_kernels[kernel_name](); - if (!kernel->func) { - delete kernel; - return NULL; - } - - return kernel; -} - -int2 CPUSplitKernel::split_kernel_local_size() -{ - return make_int2(1, 1); -} - -int2 CPUSplitKernel::split_kernel_global_size(device_memory & /*kg*/, - device_memory & /*data*/, - DeviceTask & /*task*/) -{ - return make_int2(1, 1); -} - -uint64_t CPUSplitKernel::state_buffer_size(device_memory &kernel_globals, - device_memory & /*data*/, - size_t num_threads) -{ - KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer; - - return split_data_buffer_size(kg, num_threads); -} - -Device *device_cpu_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) -{ - return new CPUDevice(info, stats, profiler, background); -} - -void device_cpu_info(vector<DeviceInfo> &devices) -{ - DeviceInfo info; - - info.type = DEVICE_CPU; - info.description = system_cpu_brand_string(); - info.id = "CPU"; - info.num = 0; - info.has_volume_decoupled = true; - info.has_adaptive_stop_per_sample = true; - info.has_osl = true; - info.has_half_images = true; - info.has_nanovdb = true; - info.has_profiling = true; - info.denoisers = DENOISER_NLM; - if (openimagedenoise_supported()) { - info.denoisers |= DENOISER_OPENIMAGEDENOISE; - } - - devices.insert(devices.begin(), info); -} - -string device_cpu_capabilities() -{ - string capabilities = ""; - capabilities += system_cpu_support_sse2() ? "SSE2 " : ""; - capabilities += system_cpu_support_sse3() ? "SSE3 " : ""; - capabilities += system_cpu_support_sse41() ? "SSE41 " : ""; - capabilities += system_cpu_support_avx() ? 
"AVX " : ""; - capabilities += system_cpu_support_avx2() ? "AVX2" : ""; - if (capabilities[capabilities.size() - 1] == ' ') - capabilities.resize(capabilities.size() - 1); - return capabilities; -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_denoise.cpp b/intern/cycles/device/device_denoise.cpp new file mode 100644 index 00000000000..aea7868f65d --- /dev/null +++ b/intern/cycles/device/device_denoise.cpp @@ -0,0 +1,88 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "device/device_denoise.h" + +CCL_NAMESPACE_BEGIN + +const char *denoiserTypeToHumanReadable(DenoiserType type) +{ + switch (type) { + case DENOISER_OPTIX: + return "OptiX"; + case DENOISER_OPENIMAGEDENOISE: + return "OpenImageDenoise"; + + case DENOISER_NUM: + case DENOISER_NONE: + case DENOISER_ALL: + return "UNKNOWN"; + } + + return "UNKNOWN"; +} + +const NodeEnum *DenoiseParams::get_type_enum() +{ + static NodeEnum type_enum; + + if (type_enum.empty()) { + type_enum.insert("optix", DENOISER_OPTIX); + type_enum.insert("openimageio", DENOISER_OPENIMAGEDENOISE); + } + + return &type_enum; +} + +const NodeEnum *DenoiseParams::get_prefilter_enum() +{ + static NodeEnum prefilter_enum; + + if (prefilter_enum.empty()) { + prefilter_enum.insert("none", DENOISER_PREFILTER_NONE); + prefilter_enum.insert("fast", DENOISER_PREFILTER_FAST); + prefilter_enum.insert("accurate", DENOISER_PREFILTER_ACCURATE); + } + + return &prefilter_enum; +} + +NODE_DEFINE(DenoiseParams) +{ + NodeType *type = NodeType::add("denoise_params", create); + + const NodeEnum *type_enum = get_type_enum(); + const NodeEnum *prefilter_enum = get_prefilter_enum(); + + SOCKET_BOOLEAN(use, "Use", false); + + SOCKET_ENUM(type, "Type", *type_enum, DENOISER_OPENIMAGEDENOISE); + + SOCKET_INT(start_sample, "Start Sample", 0); + + SOCKET_BOOLEAN(use_pass_albedo, "Use Pass Albedo", true); + SOCKET_BOOLEAN(use_pass_normal, "Use Pass Normal", false); + + SOCKET_ENUM(prefilter, "Prefilter", *prefilter_enum, DENOISER_PREFILTER_FAST); + + return type; +} + +DenoiseParams::DenoiseParams() : Node(get_node_type()) +{ +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_denoise.h b/intern/cycles/device/device_denoise.h new file mode 100644 index 00000000000..02ee63fb0ad --- /dev/null +++ b/intern/cycles/device/device_denoise.h @@ -0,0 +1,110 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file 
except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "device/device_memory.h" +#include "graph/node.h" +#include "render/buffers.h" + +CCL_NAMESPACE_BEGIN + +enum DenoiserType { + DENOISER_OPTIX = 2, + DENOISER_OPENIMAGEDENOISE = 4, + DENOISER_NUM, + + DENOISER_NONE = 0, + DENOISER_ALL = ~0, +}; + +/* COnstruct human-readable string which denotes the denoiser type. */ +const char *denoiserTypeToHumanReadable(DenoiserType type); + +typedef int DenoiserTypeMask; + +enum DenoiserPrefilter { + /* Best quality of the result without extra processing time, but requires guiding passes to be + * noise-free. */ + DENOISER_PREFILTER_NONE = 1, + + /* Denoise color and guiding passes together. + * Improves quality when guiding passes are noisy using least amount of extra processing time. */ + DENOISER_PREFILTER_FAST = 2, + + /* Prefilter noisy guiding passes before denoising color. + * Improves quality when guiding passes are noisy using extra processing time. */ + DENOISER_PREFILTER_ACCURATE = 3, + + DENOISER_PREFILTER_NUM, +}; + +/* NOTE: Is not a real scene node. Using Node API for ease of (de)serialization. + * The default values here do not really matter as they are always initialized from the + * Integrator node. */ +class DenoiseParams : public Node { + public: + NODE_DECLARE + + /* Apply denoiser to image. */ + bool use = false; + + /* Denoiser type. */ + DenoiserType type = DENOISER_OPENIMAGEDENOISE; + + /* Viewport start sample. */ + int start_sample = 0; + + /* Auxiliry passes. 
*/ + bool use_pass_albedo = true; + bool use_pass_normal = true; + + DenoiserPrefilter prefilter = DENOISER_PREFILTER_FAST; + + static const NodeEnum *get_type_enum(); + static const NodeEnum *get_prefilter_enum(); + + DenoiseParams(); + + bool modified(const DenoiseParams &other) const + { + return !(use == other.use && type == other.type && start_sample == other.start_sample && + use_pass_albedo == other.use_pass_albedo && + use_pass_normal == other.use_pass_normal && prefilter == other.prefilter); + } +}; + +/* All the parameters needed to perform buffer denoising on a device. + * Is not really a task in its canonical terms (as in, is not an asynchronous running task). Is + * more like a wrapper for all the arguments and parameters needed to perform denoising. Is a + * single place where they are all listed, so that it's not required to modify all device methods + * when these parameters do change. */ +class DeviceDenoiseTask { + public: + DenoiseParams params; + + int num_samples; + + RenderBuffers *render_buffers; + BufferParams buffer_params; + + /* Allow to do in-place modification of the input passes (scaling them down i.e.). This will + * lower the memory footprint of the denoiser but will make input passes "invalid" (from path + * tracer) point of view. */ + bool allow_inplace_modification; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp deleted file mode 100644 index 38c42d15cab..00000000000 --- a/intern/cycles/device/device_denoising.cpp +++ /dev/null @@ -1,353 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "device/device_denoising.h" - -#include "kernel/filter/filter_defines.h" - -CCL_NAMESPACE_BEGIN - -DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task) - : tile_info_mem(device, "denoising tile info mem", MEM_READ_WRITE), - profiler(NULL), - storage(device), - buffer(device), - device(device) -{ - radius = task.denoising.radius; - nlm_k_2 = powf(2.0f, lerp(-5.0f, 3.0f, task.denoising.strength)); - if (task.denoising.relative_pca) { - pca_threshold = -powf(10.0f, lerp(-8.0f, 0.0f, task.denoising.feature_strength)); - } - else { - pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising.feature_strength)); - } - - render_buffer.frame_stride = task.frame_stride; - render_buffer.pass_stride = task.pass_stride; - render_buffer.offset = task.pass_denoising_data; - - target_buffer.pass_stride = task.target_pass_stride; - target_buffer.denoising_clean_offset = task.pass_denoising_clean; - target_buffer.offset = 0; - - functions.map_neighbor_tiles = function_bind(task.map_neighbor_tiles, _1, device); - functions.unmap_neighbor_tiles = function_bind(task.unmap_neighbor_tiles, _1, device); - - tile_info = (TileInfo *)tile_info_mem.alloc(sizeof(TileInfo) / sizeof(int)); - tile_info->from_render = task.denoising_from_render ? 
1 : 0; - - tile_info->frames[0] = 0; - tile_info->num_frames = min(task.denoising_frames.size() + 1, DENOISE_MAX_FRAMES); - for (int i = 1; i < tile_info->num_frames; i++) { - tile_info->frames[i] = task.denoising_frames[i - 1]; - } - - do_prefilter = task.denoising.store_passes && task.denoising.type == DENOISER_NLM; - do_filter = task.denoising.use && task.denoising.type == DENOISER_NLM; -} - -DenoisingTask::~DenoisingTask() -{ - storage.XtWX.free(); - storage.XtWY.free(); - storage.transform.free(); - storage.rank.free(); - buffer.mem.free(); - buffer.temporary_mem.free(); - tile_info_mem.free(); -} - -void DenoisingTask::set_render_buffer(RenderTileNeighbors &neighbors) -{ - for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { - RenderTile &rtile = neighbors.tiles[i]; - tile_info->offsets[i] = rtile.offset; - tile_info->strides[i] = rtile.stride; - tile_info->buffers[i] = rtile.buffer; - } - tile_info->x[0] = neighbors.tiles[3].x; - tile_info->x[1] = neighbors.tiles[4].x; - tile_info->x[2] = neighbors.tiles[5].x; - tile_info->x[3] = neighbors.tiles[5].x + neighbors.tiles[5].w; - tile_info->y[0] = neighbors.tiles[1].y; - tile_info->y[1] = neighbors.tiles[4].y; - tile_info->y[2] = neighbors.tiles[7].y; - tile_info->y[3] = neighbors.tiles[7].y + neighbors.tiles[7].h; - - target_buffer.offset = neighbors.target.offset; - target_buffer.stride = neighbors.target.stride; - target_buffer.ptr = neighbors.target.buffer; - - if (do_prefilter && neighbors.target.buffers) { - target_buffer.denoising_output_offset = - neighbors.target.buffers->params.get_denoising_prefiltered_offset(); - } - else { - target_buffer.denoising_output_offset = 0; - } - - tile_info_mem.copy_to_device(); -} - -void DenoisingTask::setup_denoising_buffer() -{ - /* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring - * tiles */ - rect = rect_from_shape(filter_area.x, filter_area.y, filter_area.z, filter_area.w); - rect = rect_expand(rect, radius); - rect = 
rect_clip(rect, - make_int4(tile_info->x[0], tile_info->y[0], tile_info->x[3], tile_info->y[3])); - - buffer.use_intensity = do_prefilter || (tile_info->num_frames > 1); - buffer.passes = buffer.use_intensity ? 15 : 14; - buffer.width = rect.z - rect.x; - buffer.stride = align_up(buffer.width, 4); - buffer.h = rect.w - rect.y; - int alignment_floats = divide_up(device->mem_sub_ptr_alignment(), sizeof(float)); - buffer.pass_stride = align_up(buffer.stride * buffer.h, alignment_floats); - buffer.frame_stride = buffer.pass_stride * buffer.passes; - /* Pad the total size by four floats since the SIMD kernels might go a bit over the end. */ - int mem_size = align_up(tile_info->num_frames * buffer.frame_stride + 4, alignment_floats); - buffer.mem.alloc_to_device(mem_size, false); - buffer.use_time = (tile_info->num_frames > 1); - - /* CPUs process shifts sequentially while GPUs process them in parallel. */ - int num_layers; - if (buffer.gpu_temporary_mem) { - /* Shadowing prefiltering uses a radius of 6, so allocate at least that much. */ - int max_radius = max(radius, 6); - int num_shifts = (2 * max_radius + 1) * (2 * max_radius + 1); - num_layers = 2 * num_shifts + 1; - } - else { - num_layers = 3; - } - /* Allocate two layers per shift as well as one for the weight accumulation. 
*/ - buffer.temporary_mem.alloc_to_device(num_layers * buffer.pass_stride); -} - -void DenoisingTask::prefilter_shadowing() -{ - device_ptr null_ptr = (device_ptr)0; - - device_sub_ptr unfiltered_a(buffer.mem, 0, buffer.pass_stride); - device_sub_ptr unfiltered_b(buffer.mem, 1 * buffer.pass_stride, buffer.pass_stride); - device_sub_ptr sample_var(buffer.mem, 2 * buffer.pass_stride, buffer.pass_stride); - device_sub_ptr sample_var_var(buffer.mem, 3 * buffer.pass_stride, buffer.pass_stride); - device_sub_ptr buffer_var(buffer.mem, 5 * buffer.pass_stride, buffer.pass_stride); - device_sub_ptr filtered_var(buffer.mem, 6 * buffer.pass_stride, buffer.pass_stride); - - /* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the - * sample variance and the buffer variance. */ - functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var); - - /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the - * sample variance. */ - nlm_state.set_parameters(6, 3, 4.0f, 1.0f, false); - functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var); - - /* Reuse memory, the previous data isn't needed anymore. */ - device_ptr filtered_a = *buffer_var, filtered_b = *sample_var; - /* Use the smoothed variance to filter the two shadow half images using each other for weight - * calculation. */ - nlm_state.set_parameters(5, 3, 1.0f, 0.25f, false); - functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a); - functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b); - - device_ptr residual_var = *sample_var_var; - /* Estimate the residual variance between the two filtered halves. */ - functions.combine_halves(filtered_a, filtered_b, null_ptr, residual_var, 2, rect); - - device_ptr final_a = *unfiltered_a, final_b = *unfiltered_b; - /* Use the residual variance for a second filter pass. 
*/ - nlm_state.set_parameters(4, 2, 1.0f, 0.5f, false); - functions.non_local_means(filtered_a, filtered_b, residual_var, final_a); - functions.non_local_means(filtered_b, filtered_a, residual_var, final_b); - - /* Combine the two double-filtered halves to a final shadow feature. */ - device_sub_ptr shadow_pass(buffer.mem, 4 * buffer.pass_stride, buffer.pass_stride); - functions.combine_halves(final_a, final_b, *shadow_pass, null_ptr, 0, rect); -} - -void DenoisingTask::prefilter_features() -{ - device_sub_ptr unfiltered(buffer.mem, 8 * buffer.pass_stride, buffer.pass_stride); - device_sub_ptr variance(buffer.mem, 9 * buffer.pass_stride, buffer.pass_stride); - - int mean_from[] = {0, 1, 2, 12, 6, 7, 8}; - int variance_from[] = {3, 4, 5, 13, 9, 10, 11}; - int pass_to[] = {1, 2, 3, 0, 5, 6, 7}; - for (int pass = 0; pass < 7; pass++) { - device_sub_ptr feature_pass( - buffer.mem, pass_to[pass] * buffer.pass_stride, buffer.pass_stride); - /* Get the unfiltered pass and its variance from the RenderBuffers. */ - functions.get_feature(mean_from[pass], - variance_from[pass], - *unfiltered, - *variance, - 1.0f / render_buffer.samples); - /* Smooth the pass and store the result in the denoising buffers. 
*/ - nlm_state.set_parameters(2, 2, 1.0f, 0.25f, false); - functions.non_local_means(*unfiltered, *unfiltered, *variance, *feature_pass); - } -} - -void DenoisingTask::prefilter_color() -{ - int mean_from[] = {20, 21, 22}; - int variance_from[] = {23, 24, 25}; - int mean_to[] = {8, 9, 10}; - int variance_to[] = {11, 12, 13}; - int num_color_passes = 3; - - device_only_memory<float> temporary_color(device, "denoising temporary color"); - temporary_color.alloc_to_device(6 * buffer.pass_stride, false); - - for (int pass = 0; pass < num_color_passes; pass++) { - device_sub_ptr color_pass(temporary_color, pass * buffer.pass_stride, buffer.pass_stride); - device_sub_ptr color_var_pass( - temporary_color, (pass + 3) * buffer.pass_stride, buffer.pass_stride); - functions.get_feature(mean_from[pass], - variance_from[pass], - *color_pass, - *color_var_pass, - 1.0f / render_buffer.samples); - } - - device_sub_ptr depth_pass(buffer.mem, 0, buffer.pass_stride); - device_sub_ptr color_var_pass( - buffer.mem, variance_to[0] * buffer.pass_stride, 3 * buffer.pass_stride); - device_sub_ptr output_pass(buffer.mem, mean_to[0] * buffer.pass_stride, 3 * buffer.pass_stride); - functions.detect_outliers( - temporary_color.device_pointer, *color_var_pass, *depth_pass, *output_pass); - - if (buffer.use_intensity) { - device_sub_ptr intensity_pass(buffer.mem, 14 * buffer.pass_stride, buffer.pass_stride); - nlm_state.set_parameters(radius, 4, 2.0f, nlm_k_2 * 4.0f, true); - functions.non_local_means(*output_pass, *output_pass, *color_var_pass, *intensity_pass); - } -} - -void DenoisingTask::load_buffer() -{ - device_ptr null_ptr = (device_ptr)0; - - int original_offset = render_buffer.offset; - - int num_passes = buffer.use_intensity ? 
15 : 14; - for (int i = 0; i < tile_info->num_frames; i++) { - for (int pass = 0; pass < num_passes; pass++) { - device_sub_ptr to_pass( - buffer.mem, i * buffer.frame_stride + pass * buffer.pass_stride, buffer.pass_stride); - bool is_variance = (pass >= 11) && (pass <= 13); - functions.get_feature( - pass, -1, *to_pass, null_ptr, is_variance ? (1.0f / render_buffer.samples) : 1.0f); - } - render_buffer.offset += render_buffer.frame_stride; - } - - render_buffer.offset = original_offset; -} - -void DenoisingTask::write_buffer() -{ - reconstruction_state.buffer_params = make_int4(target_buffer.offset, - target_buffer.stride, - target_buffer.pass_stride, - target_buffer.denoising_clean_offset); - int num_passes = buffer.use_intensity ? 15 : 14; - for (int pass = 0; pass < num_passes; pass++) { - device_sub_ptr from_pass(buffer.mem, pass * buffer.pass_stride, buffer.pass_stride); - int out_offset = pass + target_buffer.denoising_output_offset; - functions.write_feature(out_offset, *from_pass, target_buffer.ptr); - } -} - -void DenoisingTask::construct_transform() -{ - storage.w = filter_area.z; - storage.h = filter_area.w; - - storage.transform.alloc_to_device(storage.w * storage.h * TRANSFORM_SIZE, false); - storage.rank.alloc_to_device(storage.w * storage.h, false); - - functions.construct_transform(); -} - -void DenoisingTask::reconstruct() -{ - storage.XtWX.alloc_to_device(storage.w * storage.h * XTWX_SIZE, false); - storage.XtWY.alloc_to_device(storage.w * storage.h * XTWY_SIZE, false); - storage.XtWX.zero_to_device(); - storage.XtWY.zero_to_device(); - - reconstruction_state.filter_window = rect_from_shape( - filter_area.x - rect.x, filter_area.y - rect.y, storage.w, storage.h); - int tile_coordinate_offset = filter_area.y * target_buffer.stride + filter_area.x; - reconstruction_state.buffer_params = make_int4(target_buffer.offset + tile_coordinate_offset, - target_buffer.stride, - target_buffer.pass_stride, - target_buffer.denoising_clean_offset); - 
reconstruction_state.source_w = rect.z - rect.x; - reconstruction_state.source_h = rect.w - rect.y; - - device_sub_ptr color_ptr(buffer.mem, 8 * buffer.pass_stride, 3 * buffer.pass_stride); - device_sub_ptr color_var_ptr(buffer.mem, 11 * buffer.pass_stride, 3 * buffer.pass_stride); - for (int f = 0; f < tile_info->num_frames; f++) { - device_ptr scale_ptr = 0; - device_sub_ptr *scale_sub_ptr = NULL; - if (tile_info->frames[f] != 0 && (tile_info->num_frames > 1)) { - scale_sub_ptr = new device_sub_ptr(buffer.mem, 14 * buffer.pass_stride, buffer.pass_stride); - scale_ptr = **scale_sub_ptr; - } - - functions.accumulate(*color_ptr, *color_var_ptr, scale_ptr, f); - delete scale_sub_ptr; - } - functions.solve(target_buffer.ptr); -} - -void DenoisingTask::run_denoising(RenderTile &tile) -{ - RenderTileNeighbors neighbors(tile); - functions.map_neighbor_tiles(neighbors); - set_render_buffer(neighbors); - - setup_denoising_buffer(); - - if (tile_info->from_render) { - prefilter_shadowing(); - prefilter_features(); - prefilter_color(); - } - else { - load_buffer(); - } - - if (do_filter) { - construct_transform(); - reconstruct(); - } - - if (do_prefilter) { - write_buffer(); - } - - functions.unmap_neighbor_tiles(neighbors); -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h deleted file mode 100644 index bb8bdfdd225..00000000000 --- a/intern/cycles/device/device_denoising.h +++ /dev/null @@ -1,197 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __DEVICE_DENOISING_H__ -#define __DEVICE_DENOISING_H__ - -#include "device/device.h" - -#include "render/buffers.h" - -#include "kernel/filter/filter_defines.h" - -#include "util/util_profiling.h" - -CCL_NAMESPACE_BEGIN - -class DenoisingTask { - public: - /* Parameters of the denoising algorithm. */ - int radius; - float nlm_k_2; - float pca_threshold; - - /* Parameters of the RenderBuffers. */ - struct RenderBuffers { - int offset; - int pass_stride; - int frame_stride; - int samples; - } render_buffer; - - /* Pointer and parameters of the target buffer. */ - struct TargetBuffer { - int offset; - int stride; - int pass_stride; - int denoising_clean_offset; - int denoising_output_offset; - device_ptr ptr; - } target_buffer; - - TileInfo *tile_info; - device_vector<int> tile_info_mem; - - ProfilingState *profiler; - - int4 rect; - int4 filter_area; - - bool do_prefilter; - bool do_filter; - - struct DeviceFunctions { - function<bool( - device_ptr image_ptr, /* Contains the values that are smoothed. */ - device_ptr guide_ptr, /* Contains the values that are used to calculate weights. */ - device_ptr variance_ptr, /* Contains the variance of the guide image. */ - device_ptr out_ptr /* The filtered output is written into this image. 
*/ - )> - non_local_means; - function<bool( - device_ptr color_ptr, device_ptr color_variance_ptr, device_ptr scale_ptr, int frame)> - accumulate; - function<bool(device_ptr output_ptr)> solve; - function<bool()> construct_transform; - - function<bool(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr mean_ptr, - device_ptr variance_ptr, - int r, - int4 rect)> - combine_halves; - function<bool(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr sample_variance_ptr, - device_ptr sv_variance_ptr, - device_ptr buffer_variance_ptr)> - divide_shadow; - function<bool(int mean_offset, - int variance_offset, - device_ptr mean_ptr, - device_ptr variance_ptr, - float scale)> - get_feature; - function<bool(device_ptr image_ptr, - device_ptr variance_ptr, - device_ptr depth_ptr, - device_ptr output_ptr)> - detect_outliers; - function<bool(int out_offset, device_ptr frop_ptr, device_ptr buffer_ptr)> write_feature; - function<void(RenderTileNeighbors &neighbors)> map_neighbor_tiles; - function<void(RenderTileNeighbors &neighbors)> unmap_neighbor_tiles; - } functions; - - /* Stores state of the current Reconstruction operation, - * which is accessed by the device in order to perform the operation. */ - struct ReconstructionState { - int4 filter_window; - int4 buffer_params; - - int source_w; - int source_h; - } reconstruction_state; - - /* Stores state of the current NLM operation, - * which is accessed by the device in order to perform the operation. */ - struct NLMState { - int r; /* Search radius of the filter. */ - int f; /* Patch size of the filter. */ - float a; /* Variance compensation factor in the MSE estimation. */ - float k_2; /* Squared value of the k parameter of the filter. 
*/ - bool is_color; - - void set_parameters(int r_, int f_, float a_, float k_2_, bool is_color_) - { - r = r_; - f = f_; - a = a_, k_2 = k_2_; - is_color = is_color_; - } - } nlm_state; - - struct Storage { - device_only_memory<float> transform; - device_only_memory<int> rank; - device_only_memory<float> XtWX; - device_only_memory<float3> XtWY; - int w; - int h; - - Storage(Device *device) - : transform(device, "denoising transform"), - rank(device, "denoising rank"), - XtWX(device, "denoising XtWX"), - XtWY(device, "denoising XtWY") - { - } - } storage; - - DenoisingTask(Device *device, const DeviceTask &task); - ~DenoisingTask(); - - void run_denoising(RenderTile &tile); - - struct DenoiseBuffers { - int pass_stride; - int passes; - int stride; - int h; - int width; - int frame_stride; - device_only_memory<float> mem; - device_only_memory<float> temporary_mem; - bool use_time; - bool use_intensity; - - bool gpu_temporary_mem; - - DenoiseBuffers(Device *device) - : mem(device, "denoising pixel buffer"), - temporary_mem(device, "denoising temporary mem", true) - { - } - } buffer; - - protected: - Device *device; - - void set_render_buffer(RenderTileNeighbors &neighbors); - void setup_denoising_buffer(); - void prefilter_shadowing(); - void prefilter_features(); - void prefilter_color(); - void construct_transform(); - void reconstruct(); - - void load_buffer(); - void write_buffer(); -}; - -CCL_NAMESPACE_END - -#endif /* __DEVICE_DENOISING_H__ */ diff --git a/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl b/intern/cycles/device/device_graphics_interop.cpp index fa210e747c0..a80a236759f 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl +++ b/intern/cycles/device/device_graphics_interop.cpp @@ -1,5 +1,5 @@ /* - * Copyright 2011-2017 Blender Foundation + * Copyright 2011-2021 Blender Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -14,11 +14,8 @@ * limitations under the License. */ -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_path_init.h" +#include "device/device_graphics_interop.h" -#define KERNEL_NAME path_init -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME +CCL_NAMESPACE_BEGIN +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_graphics_interop.h b/intern/cycles/device/device_graphics_interop.h new file mode 100644 index 00000000000..671b1c189d7 --- /dev/null +++ b/intern/cycles/device/device_graphics_interop.h @@ -0,0 +1,55 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "util/util_types.h" + +CCL_NAMESPACE_BEGIN + +/* Information about interoperability destination. + * Is provided by the GPUDisplay. */ +class DeviceGraphicsInteropDestination { + public: + /* Dimensions of the buffer, in pixels. */ + int buffer_width = 0; + int buffer_height = 0; + + /* OpenGL pixel buffer object. */ + int opengl_pbo_id = 0; + + /* Clear the entire destination before doing partial write to it. */ + bool need_clear = false; +}; + +/* Device-side graphics interoperability support. + * + * Takes care of holding all the handlers needed by the device to implement interoperability with + * the graphics library. 
*/ +class DeviceGraphicsInterop { + public: + DeviceGraphicsInterop() = default; + virtual ~DeviceGraphicsInterop() = default; + + /* Update this device-side graphics interoperability object with the given destination resource + * information. */ + virtual void set_destination(const DeviceGraphicsInteropDestination &destination) = 0; + + virtual device_ptr map() = 0; + virtual void unmap() = 0; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_intern.h b/intern/cycles/device/device_intern.h deleted file mode 100644 index ecc79c5d7ee..00000000000 --- a/intern/cycles/device/device_intern.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __DEVICE_INTERN_H__ -#define __DEVICE_INTERN_H__ - -#include "util/util_string.h" -#include "util/util_vector.h" - -CCL_NAMESPACE_BEGIN - -class Device; -class DeviceInfo; -class Profiler; -class Stats; - -Device *device_cpu_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); -bool device_opencl_init(); -Device *device_opencl_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); -bool device_opencl_compile_kernel(const vector<string> ¶meters); -bool device_cuda_init(); -Device *device_cuda_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); -bool device_optix_init(); -Device *device_optix_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); -Device *device_dummy_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); - -Device *device_network_create(DeviceInfo &info, - Stats &stats, - Profiler &profiler, - const char *address); -Device *device_multi_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); - -void device_cpu_info(vector<DeviceInfo> &devices); -void device_opencl_info(vector<DeviceInfo> &devices); -void device_cuda_info(vector<DeviceInfo> &devices); -void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices); -void device_network_info(vector<DeviceInfo> &devices); - -string device_cpu_capabilities(); -string device_opencl_capabilities(); -string device_cuda_capabilities(); - -CCL_NAMESPACE_END - -#endif /* __DEVICE_INTERN_H__ */ diff --git a/intern/cycles/device/device_kernel.cpp b/intern/cycles/device/device_kernel.cpp new file mode 100644 index 00000000000..ceaddee4756 --- /dev/null +++ b/intern/cycles/device/device_kernel.cpp @@ -0,0 +1,157 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device/device_kernel.h" + +#include "util/util_logging.h" + +CCL_NAMESPACE_BEGIN + +const char *device_kernel_as_string(DeviceKernel kernel) +{ + switch (kernel) { + /* Integrator. */ + case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA: + return "integrator_init_from_camera"; + case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE: + return "integrator_init_from_bake"; + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST: + return "integrator_intersect_closest"; + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW: + return "integrator_intersect_shadow"; + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE: + return "integrator_intersect_subsurface"; + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: + return "integrator_intersect_volume_stack"; + case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND: + return "integrator_shade_background"; + case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT: + return "integrator_shade_light"; + case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW: + return "integrator_shade_shadow"; + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE: + return "integrator_shade_surface"; + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE: + return "integrator_shade_surface_raytrace"; + case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME: + return "integrator_shade_volume"; + case DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL: + return "integrator_megakernel"; + case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY: + return "integrator_queued_paths_array"; + case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY: + return "integrator_queued_shadow_paths_array"; + case 
DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY: + return "integrator_active_paths_array"; + case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY: + return "integrator_terminated_paths_array"; + case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY: + return "integrator_sorted_paths_array"; + case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY: + return "integrator_compact_paths_array"; + case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES: + return "integrator_compact_states"; + case DEVICE_KERNEL_INTEGRATOR_RESET: + return "integrator_reset"; + case DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS: + return "integrator_shadow_catcher_count_possible_splits"; + + /* Shader evaluation. */ + case DEVICE_KERNEL_SHADER_EVAL_DISPLACE: + return "shader_eval_displace"; + case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND: + return "shader_eval_background"; + + /* Film. */ + +#define FILM_CONVERT_KERNEL_AS_STRING(variant, variant_lowercase) \ + case DEVICE_KERNEL_FILM_CONVERT_##variant: \ + return "film_convert_" #variant_lowercase; \ + case DEVICE_KERNEL_FILM_CONVERT_##variant##_HALF_RGBA: \ + return "film_convert_" #variant_lowercase "_half_rgba"; + + FILM_CONVERT_KERNEL_AS_STRING(DEPTH, depth) + FILM_CONVERT_KERNEL_AS_STRING(MIST, mist) + FILM_CONVERT_KERNEL_AS_STRING(SAMPLE_COUNT, sample_count) + FILM_CONVERT_KERNEL_AS_STRING(FLOAT, float) + FILM_CONVERT_KERNEL_AS_STRING(LIGHT_PATH, light_path) + FILM_CONVERT_KERNEL_AS_STRING(FLOAT3, float3) + FILM_CONVERT_KERNEL_AS_STRING(MOTION, motion) + FILM_CONVERT_KERNEL_AS_STRING(CRYPTOMATTE, cryptomatte) + FILM_CONVERT_KERNEL_AS_STRING(SHADOW_CATCHER, shadow_catcher) + FILM_CONVERT_KERNEL_AS_STRING(SHADOW_CATCHER_MATTE_WITH_SHADOW, + shadow_catcher_matte_with_shadow) + FILM_CONVERT_KERNEL_AS_STRING(COMBINED, combined) + FILM_CONVERT_KERNEL_AS_STRING(FLOAT4, float4) + +#undef FILM_CONVERT_KERNEL_AS_STRING + + /* Adaptive sampling. 
*/ + case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK: + return "adaptive_sampling_convergence_check"; + case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X: + return "adaptive_sampling_filter_x"; + case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y: + return "adaptive_sampling_filter_y"; + + /* Denoising. */ + case DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS: + return "filter_guiding_preprocess"; + case DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO: + return "filter_guiding_set_fake_albedo"; + case DEVICE_KERNEL_FILTER_COLOR_PREPROCESS: + return "filter_color_preprocess"; + case DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS: + return "filter_color_postprocess"; + + /* Cryptomatte. */ + case DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS: + return "cryptomatte_postprocess"; + + /* Generic */ + case DEVICE_KERNEL_PREFIX_SUM: + return "prefix_sum"; + + case DEVICE_KERNEL_NUM: + break; + }; + LOG(FATAL) << "Unhandled kernel " << static_cast<int>(kernel) << ", should never happen."; + return "UNKNOWN"; +} + +std::ostream &operator<<(std::ostream &os, DeviceKernel kernel) +{ + os << device_kernel_as_string(kernel); + return os; +} + +string device_kernel_mask_as_string(DeviceKernelMask mask) +{ + string str; + + for (uint64_t i = 0; i < sizeof(DeviceKernelMask) * 8; i++) { + if (mask & (uint64_t(1) << i)) { + if (!str.empty()) { + str += " "; + } + str += device_kernel_as_string((DeviceKernel)i); + } + } + + return str; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl b/intern/cycles/device/device_kernel.h index 9e1e57beba6..83d959ca87b 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl +++ b/intern/cycles/device/device_kernel.h @@ -1,5 +1,5 @@ /* - * Copyright 2011-2015 Blender Foundation + * Copyright 2011-2021 Blender Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in 
compliance with the License. @@ -14,13 +14,20 @@ * limitations under the License. */ -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h" +#pragma once -#define KERNEL_NAME holdout_emission_blurring_pathtermination_ao -#define LOCALS_TYPE BackgroundAOLocals -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME -#undef LOCALS_TYPE +#include "kernel/kernel_types.h" +#include "util/util_string.h" + +#include <ostream> // NOLINT + +CCL_NAMESPACE_BEGIN + +const char *device_kernel_as_string(DeviceKernel kernel); +std::ostream &operator<<(std::ostream &os, DeviceKernel kernel); + +typedef uint64_t DeviceKernelMask; +string device_kernel_mask_as_string(DeviceKernelMask mask); + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_memory.cpp b/intern/cycles/device/device_memory.cpp index 80a05fc32fe..c4d45829b83 100644 --- a/intern/cycles/device/device_memory.cpp +++ b/intern/cycles/device/device_memory.cpp @@ -23,7 +23,7 @@ CCL_NAMESPACE_BEGIN device_memory::device_memory(Device *device, const char *name, MemoryType type) : data_type(device_type_traits<uchar>::data_type), - data_elements(device_type_traits<uchar>::num_elements), + data_elements(device_type_traits<uchar>::num_elements_cpu), data_size(0), device_size(0), data_width(0), @@ -149,6 +149,11 @@ void device_memory::device_zero() } } +bool device_memory::device_is_cpu() +{ + return (device->info.type == DEVICE_CPU); +} + void device_memory::swap_device(Device *new_device, size_t new_device_size, device_ptr new_device_ptr) diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h index 80f4d7b0468..c51594b8580 100644 --- a/intern/cycles/device/device_memory.h +++ b/intern/cycles/device/device_memory.h @@ -38,7 +38,6 @@ enum MemoryType { MEM_DEVICE_ONLY, MEM_GLOBAL, MEM_TEXTURE, - MEM_PIXELS }; /* Supported Data Types */ @@ -54,7 +53,7 @@ 
enum DataType { TYPE_UINT64, }; -static inline size_t datatype_size(DataType datatype) +static constexpr size_t datatype_size(DataType datatype) { switch (datatype) { case TYPE_UNKNOWN: @@ -82,112 +81,155 @@ static inline size_t datatype_size(DataType datatype) template<typename T> struct device_type_traits { static const DataType data_type = TYPE_UNKNOWN; - static const int num_elements = sizeof(T); + static const int num_elements_cpu = sizeof(T); + static const int num_elements_gpu = sizeof(T); }; template<> struct device_type_traits<uchar> { static const DataType data_type = TYPE_UCHAR; - static const int num_elements = 1; + static const int num_elements_cpu = 1; + static const int num_elements_gpu = 1; + static_assert(sizeof(uchar) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uchar2> { static const DataType data_type = TYPE_UCHAR; - static const int num_elements = 2; + static const int num_elements_cpu = 2; + static const int num_elements_gpu = 2; + static_assert(sizeof(uchar2) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uchar3> { static const DataType data_type = TYPE_UCHAR; - static const int num_elements = 3; + static const int num_elements_cpu = 3; + static const int num_elements_gpu = 3; + static_assert(sizeof(uchar3) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uchar4> { static const DataType data_type = TYPE_UCHAR; - static const int num_elements = 4; + static const int num_elements_cpu = 4; + static const int num_elements_gpu = 4; + static_assert(sizeof(uchar4) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uint> { static const DataType data_type = TYPE_UINT; - static const int num_elements = 1; + static const int num_elements_cpu = 1; + static const int num_elements_gpu = 1; + static_assert(sizeof(uint) == num_elements_cpu * datatype_size(data_type)); }; template<> struct 
device_type_traits<uint2> { static const DataType data_type = TYPE_UINT; - static const int num_elements = 2; + static const int num_elements_cpu = 2; + static const int num_elements_gpu = 2; + static_assert(sizeof(uint2) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uint3> { static const DataType data_type = TYPE_UINT; - static const int num_elements = 3; + static const int num_elements_cpu = 3; + static const int num_elements_gpu = 3; + static_assert(sizeof(uint3) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uint4> { static const DataType data_type = TYPE_UINT; - static const int num_elements = 4; + static const int num_elements_cpu = 4; + static const int num_elements_gpu = 4; + static_assert(sizeof(uint4) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<int> { static const DataType data_type = TYPE_INT; - static const int num_elements = 1; + static const int num_elements_cpu = 1; + static const int num_elements_gpu = 1; + static_assert(sizeof(int) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<int2> { static const DataType data_type = TYPE_INT; - static const int num_elements = 2; + static const int num_elements_cpu = 2; + static const int num_elements_gpu = 2; + static_assert(sizeof(int2) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<int3> { static const DataType data_type = TYPE_INT; - static const int num_elements = 3; + static const int num_elements_cpu = 4; + static const int num_elements_gpu = 3; + static_assert(sizeof(int3) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<int4> { static const DataType data_type = TYPE_INT; - static const int num_elements = 4; + static const int num_elements_cpu = 4; + static const int num_elements_gpu = 4; + static_assert(sizeof(int4) == num_elements_cpu * 
datatype_size(data_type)); }; template<> struct device_type_traits<float> { static const DataType data_type = TYPE_FLOAT; - static const int num_elements = 1; + static const int num_elements_cpu = 1; + static const int num_elements_gpu = 1; + static_assert(sizeof(float) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<float2> { static const DataType data_type = TYPE_FLOAT; - static const int num_elements = 2; + static const int num_elements_cpu = 2; + static const int num_elements_gpu = 2; + static_assert(sizeof(float2) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<float3> { static const DataType data_type = TYPE_FLOAT; - static const int num_elements = 4; + static const int num_elements_cpu = 4; + static const int num_elements_gpu = 3; + static_assert(sizeof(float3) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<float4> { static const DataType data_type = TYPE_FLOAT; - static const int num_elements = 4; + static const int num_elements_cpu = 4; + static const int num_elements_gpu = 4; + static_assert(sizeof(float4) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<half> { static const DataType data_type = TYPE_HALF; - static const int num_elements = 1; + static const int num_elements_cpu = 1; + static const int num_elements_gpu = 1; + static_assert(sizeof(half) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<ushort4> { static const DataType data_type = TYPE_UINT16; - static const int num_elements = 4; + static const int num_elements_cpu = 4; + static const int num_elements_gpu = 4; + static_assert(sizeof(ushort4) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uint16_t> { static const DataType data_type = TYPE_UINT16; - static const int num_elements = 1; + static const int num_elements_cpu = 1; + static const int 
num_elements_gpu = 1; + static_assert(sizeof(uint16_t) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<half4> { static const DataType data_type = TYPE_HALF; - static const int num_elements = 4; + static const int num_elements_cpu = 4; + static const int num_elements_gpu = 4; + static_assert(sizeof(half4) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uint64_t> { static const DataType data_type = TYPE_UINT64; - static const int num_elements = 1; + static const int num_elements_cpu = 1; + static const int num_elements_gpu = 1; + static_assert(sizeof(uint64_t) == num_elements_cpu * datatype_size(data_type)); }; /* Device Memory @@ -257,6 +299,8 @@ class device_memory { void device_copy_from(int y, int w, int h, int elem); void device_zero(); + bool device_is_cpu(); + device_ptr original_device_ptr; size_t original_device_size; Device *original_device; @@ -275,7 +319,9 @@ template<typename T> class device_only_memory : public device_memory { : device_memory(device, name, allow_host_memory_fallback ? MEM_READ_WRITE : MEM_DEVICE_ONLY) { data_type = device_type_traits<T>::data_type; - data_elements = max(device_type_traits<T>::num_elements, 1); + data_elements = max(device_is_cpu() ? device_type_traits<T>::num_elements_cpu : + device_type_traits<T>::num_elements_gpu, + 1); } device_only_memory(device_only_memory &&other) noexcept : device_memory(std::move(other)) @@ -331,11 +377,15 @@ template<typename T> class device_only_memory : public device_memory { template<typename T> class device_vector : public device_memory { public: + /* Can only use this for types that have the same size on CPU and GPU. 
*/ + static_assert(device_type_traits<T>::num_elements_cpu == + device_type_traits<T>::num_elements_gpu); + device_vector(Device *device, const char *name, MemoryType type) : device_memory(device, name, type) { data_type = device_type_traits<T>::data_type; - data_elements = device_type_traits<T>::num_elements; + data_elements = device_type_traits<T>::num_elements_cpu; modified = true; need_realloc_ = true; @@ -477,6 +527,11 @@ template<typename T> class device_vector : public device_memory { return (T *)host_pointer; } + const T *data() const + { + return (T *)host_pointer; + } + T &operator[](size_t i) { assert(i < data_size); @@ -507,7 +562,7 @@ template<typename T> class device_vector : public device_memory { void copy_from_device() { - device_copy_from(0, data_width, data_height, sizeof(T)); + device_copy_from(0, data_width, (data_height == 0) ? 1 : data_height, sizeof(T)); } void copy_from_device(int y, int w, int h) @@ -535,33 +590,6 @@ template<typename T> class device_vector : public device_memory { } }; -/* Pixel Memory - * - * Device memory to efficiently draw as pixels to the screen in interactive - * rendering. Only copying pixels from the device is supported, not copying to. */ - -template<typename T> class device_pixels : public device_vector<T> { - public: - device_pixels(Device *device, const char *name) : device_vector<T>(device, name, MEM_PIXELS) - { - } - - void alloc_to_device(size_t width, size_t height, size_t depth = 0) - { - device_vector<T>::alloc(width, height, depth); - - if (!device_memory::device_pointer) { - device_memory::device_alloc(); - } - } - - T *copy_from_device(int y, int w, int h) - { - device_memory::device_copy_from(y, w, h, sizeof(T)); - return device_vector<T>::data(); - } -}; - /* Device Sub Memory * * Pointer into existing memory. 
It is not allocated separately, but created diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp deleted file mode 100644 index 85ffa5fcd52..00000000000 --- a/intern/cycles/device/device_multi.cpp +++ /dev/null @@ -1,826 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <sstream> -#include <stdlib.h> - -#include "bvh/bvh_multi.h" - -#include "device/device.h" -#include "device/device_intern.h" -#include "device/device_network.h" - -#include "render/buffers.h" -#include "render/geometry.h" - -#include "util/util_foreach.h" -#include "util/util_list.h" -#include "util/util_logging.h" -#include "util/util_map.h" -#include "util/util_time.h" - -CCL_NAMESPACE_BEGIN - -class MultiDevice : public Device { - public: - struct SubDevice { - Stats stats; - Device *device; - map<device_ptr, device_ptr> ptr_map; - int peer_island_index = -1; - }; - - list<SubDevice> devices, denoising_devices; - device_ptr unique_key; - vector<vector<SubDevice *>> peer_islands; - bool use_denoising; - bool matching_rendering_and_denoising_devices; - - MultiDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_) - : Device(info, stats, profiler, background_), - unique_key(1), - use_denoising(!info.denoising_devices.empty()) - { - foreach (DeviceInfo &subinfo, info.multi_devices) { - /* Always add CPU devices at the back since GPU devices can change - * host 
memory pointers, which CPU uses as device pointer. */ - SubDevice *sub; - if (subinfo.type == DEVICE_CPU) { - devices.emplace_back(); - sub = &devices.back(); - } - else { - devices.emplace_front(); - sub = &devices.front(); - } - - /* The pointer to 'sub->stats' will stay valid even after new devices - * are added, since 'devices' is a linked list. */ - sub->device = Device::create(subinfo, sub->stats, profiler, background); - } - - foreach (DeviceInfo &subinfo, info.denoising_devices) { - denoising_devices.emplace_front(); - SubDevice *sub = &denoising_devices.front(); - - sub->device = Device::create(subinfo, sub->stats, profiler, background); - } - - /* Build a list of peer islands for the available render devices */ - foreach (SubDevice &sub, devices) { - /* First ensure that every device is in at least once peer island */ - if (sub.peer_island_index < 0) { - peer_islands.emplace_back(); - sub.peer_island_index = (int)peer_islands.size() - 1; - peer_islands[sub.peer_island_index].push_back(&sub); - } - - if (!info.has_peer_memory) { - continue; - } - - /* Second check peer access between devices and fill up the islands accordingly */ - foreach (SubDevice &peer_sub, devices) { - if (peer_sub.peer_island_index < 0 && - peer_sub.device->info.type == sub.device->info.type && - peer_sub.device->check_peer_access(sub.device)) { - peer_sub.peer_island_index = sub.peer_island_index; - peer_islands[sub.peer_island_index].push_back(&peer_sub); - } - } - } - - /* Try to re-use memory when denoising and render devices use the same physical devices - * (e.g. OptiX denoising and CUDA rendering device pointing to the same GPU). - * Ordering has to match as well, so that 'DeviceTask::split' behaves consistent. 
*/ - matching_rendering_and_denoising_devices = denoising_devices.empty() || - (devices.size() == denoising_devices.size()); - if (matching_rendering_and_denoising_devices) { - for (list<SubDevice>::iterator device_it = devices.begin(), - denoising_device_it = denoising_devices.begin(); - device_it != devices.end() && denoising_device_it != denoising_devices.end(); - ++device_it, ++denoising_device_it) { - const DeviceInfo &info = device_it->device->info; - const DeviceInfo &denoising_info = denoising_device_it->device->info; - if ((info.type != DEVICE_CUDA && info.type != DEVICE_OPTIX) || - (denoising_info.type != DEVICE_CUDA && denoising_info.type != DEVICE_OPTIX) || - info.num != denoising_info.num) { - matching_rendering_and_denoising_devices = false; - break; - } - } - } - -#ifdef WITH_NETWORK - /* try to add network devices */ - ServerDiscovery discovery(true); - time_sleep(1.0); - - vector<string> servers = discovery.get_server_list(); - - foreach (string &server, servers) { - Device *device = device_network_create(info, stats, profiler, server.c_str()); - if (device) - devices.push_back(SubDevice(device)); - } -#endif - } - - ~MultiDevice() - { - foreach (SubDevice &sub, devices) - delete sub.device; - foreach (SubDevice &sub, denoising_devices) - delete sub.device; - } - - const string &error_message() override - { - error_msg.clear(); - - foreach (SubDevice &sub, devices) - error_msg += sub.device->error_message(); - foreach (SubDevice &sub, denoising_devices) - error_msg += sub.device->error_message(); - - return error_msg; - } - - virtual bool show_samples() const override - { - if (devices.size() > 1) { - return false; - } - return devices.front().device->show_samples(); - } - - virtual BVHLayoutMask get_bvh_layout_mask() const override - { - BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_ALL; - BVHLayoutMask bvh_layout_mask_all = BVH_LAYOUT_NONE; - foreach (const SubDevice &sub_device, devices) { - BVHLayoutMask device_bvh_layout_mask = 
sub_device.device->get_bvh_layout_mask(); - bvh_layout_mask &= device_bvh_layout_mask; - bvh_layout_mask_all |= device_bvh_layout_mask; - } - - /* With multiple OptiX devices, every device needs its own acceleration structure */ - if (bvh_layout_mask == BVH_LAYOUT_OPTIX) { - return BVH_LAYOUT_MULTI_OPTIX; - } - - /* When devices do not share a common BVH layout, fall back to creating one for each */ - const BVHLayoutMask BVH_LAYOUT_OPTIX_EMBREE = (BVH_LAYOUT_OPTIX | BVH_LAYOUT_EMBREE); - if ((bvh_layout_mask_all & BVH_LAYOUT_OPTIX_EMBREE) == BVH_LAYOUT_OPTIX_EMBREE) { - return BVH_LAYOUT_MULTI_OPTIX_EMBREE; - } - - return bvh_layout_mask; - } - - bool load_kernels(const DeviceRequestedFeatures &requested_features) override - { - foreach (SubDevice &sub, devices) - if (!sub.device->load_kernels(requested_features)) - return false; - - use_denoising = requested_features.use_denoising; - if (requested_features.use_denoising) { - /* Only need denoising feature, everything else is unused. */ - DeviceRequestedFeatures denoising_features; - denoising_features.use_denoising = true; - foreach (SubDevice &sub, denoising_devices) - if (!sub.device->load_kernels(denoising_features)) - return false; - } - - return true; - } - - bool wait_for_availability(const DeviceRequestedFeatures &requested_features) override - { - foreach (SubDevice &sub, devices) - if (!sub.device->wait_for_availability(requested_features)) - return false; - - if (requested_features.use_denoising) { - foreach (SubDevice &sub, denoising_devices) - if (!sub.device->wait_for_availability(requested_features)) - return false; - } - - return true; - } - - DeviceKernelStatus get_active_kernel_switch_state() override - { - DeviceKernelStatus result = DEVICE_KERNEL_USING_FEATURE_KERNEL; - - foreach (SubDevice &sub, devices) { - DeviceKernelStatus subresult = sub.device->get_active_kernel_switch_state(); - switch (subresult) { - case DEVICE_KERNEL_FEATURE_KERNEL_INVALID: - case 
DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE: - return subresult; - - case DEVICE_KERNEL_USING_FEATURE_KERNEL: - case DEVICE_KERNEL_UNKNOWN: - break; - } - } - - return result; - } - - void build_bvh(BVH *bvh, Progress &progress, bool refit) override - { - /* Try to build and share a single acceleration structure, if possible */ - if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2 || bvh->params.bvh_layout == BVH_LAYOUT_EMBREE) { - devices.back().device->build_bvh(bvh, progress, refit); - return; - } - - assert(bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX || - bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE); - - BVHMulti *const bvh_multi = static_cast<BVHMulti *>(bvh); - bvh_multi->sub_bvhs.resize(devices.size()); - - vector<BVHMulti *> geom_bvhs; - geom_bvhs.reserve(bvh->geometry.size()); - foreach (Geometry *geom, bvh->geometry) { - geom_bvhs.push_back(static_cast<BVHMulti *>(geom->bvh)); - } - - /* Broadcast acceleration structure build to all render devices */ - size_t i = 0; - foreach (SubDevice &sub, devices) { - /* Change geometry BVH pointers to the sub BVH */ - for (size_t k = 0; k < bvh->geometry.size(); ++k) { - bvh->geometry[k]->bvh = geom_bvhs[k]->sub_bvhs[i]; - } - - if (!bvh_multi->sub_bvhs[i]) { - BVHParams params = bvh->params; - if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX) - params.bvh_layout = BVH_LAYOUT_OPTIX; - else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE) - params.bvh_layout = sub.device->info.type == DEVICE_OPTIX ? 
BVH_LAYOUT_OPTIX : - BVH_LAYOUT_EMBREE; - - /* Skip building a bottom level acceleration structure for non-instanced geometry on Embree - * (since they are put into the top level directly, see bvh_embree.cpp) */ - if (!params.top_level && params.bvh_layout == BVH_LAYOUT_EMBREE && - !bvh->geometry[0]->is_instanced()) { - i++; - continue; - } - - bvh_multi->sub_bvhs[i] = BVH::create(params, bvh->geometry, bvh->objects, sub.device); - } - - sub.device->build_bvh(bvh_multi->sub_bvhs[i], progress, refit); - i++; - } - - /* Change geometry BVH pointers back to the multi BVH. */ - for (size_t k = 0; k < bvh->geometry.size(); ++k) { - bvh->geometry[k]->bvh = geom_bvhs[k]; - } - } - - virtual void *osl_memory() override - { - if (devices.size() > 1) { - return NULL; - } - return devices.front().device->osl_memory(); - } - - bool is_resident(device_ptr key, Device *sub_device) override - { - foreach (SubDevice &sub, devices) { - if (sub.device == sub_device) { - return find_matching_mem_device(key, sub)->device == sub_device; - } - } - return false; - } - - SubDevice *find_matching_mem_device(device_ptr key, SubDevice &sub) - { - assert(key != 0 && (sub.peer_island_index >= 0 || sub.ptr_map.find(key) != sub.ptr_map.end())); - - /* Get the memory owner of this key (first try current device, then peer devices) */ - SubDevice *owner_sub = ⊂ - if (owner_sub->ptr_map.find(key) == owner_sub->ptr_map.end()) { - foreach (SubDevice *island_sub, peer_islands[sub.peer_island_index]) { - if (island_sub != owner_sub && - island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) { - owner_sub = island_sub; - } - } - } - return owner_sub; - } - - SubDevice *find_suitable_mem_device(device_ptr key, const vector<SubDevice *> &island) - { - assert(!island.empty()); - - /* Get the memory owner of this key or the device with the lowest memory usage when new */ - SubDevice *owner_sub = island.front(); - foreach (SubDevice *island_sub, island) { - if (key ? 
(island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) : - (island_sub->device->stats.mem_used < owner_sub->device->stats.mem_used)) { - owner_sub = island_sub; - } - } - return owner_sub; - } - - inline device_ptr find_matching_mem(device_ptr key, SubDevice &sub) - { - return find_matching_mem_device(key, sub)->ptr_map[key]; - } - - void mem_alloc(device_memory &mem) override - { - device_ptr key = unique_key++; - - if (mem.type == MEM_PIXELS) { - /* Always allocate pixels memory on all devices - * This is necessary to ensure PBOs are registered everywhere, which FILM_CONVERT uses */ - foreach (SubDevice &sub, devices) { - mem.device = sub.device; - mem.device_pointer = 0; - mem.device_size = 0; - - sub.device->mem_alloc(mem); - sub.ptr_map[key] = mem.device_pointer; - } - } - else { - assert(mem.type == MEM_READ_ONLY || mem.type == MEM_READ_WRITE || - mem.type == MEM_DEVICE_ONLY); - /* The remaining memory types can be distributed across devices */ - foreach (const vector<SubDevice *> &island, peer_islands) { - SubDevice *owner_sub = find_suitable_mem_device(key, island); - mem.device = owner_sub->device; - mem.device_pointer = 0; - mem.device_size = 0; - - owner_sub->device->mem_alloc(mem); - owner_sub->ptr_map[key] = mem.device_pointer; - } - } - - mem.device = this; - mem.device_pointer = key; - stats.mem_alloc(mem.device_size); - } - - void mem_copy_to(device_memory &mem) override - { - device_ptr existing_key = mem.device_pointer; - device_ptr key = (existing_key) ? existing_key : unique_key++; - size_t existing_size = mem.device_size; - - /* The tile buffers are allocated on each device (see below), so copy to all of them */ - if (strcmp(mem.name, "RenderBuffers") == 0 && use_denoising) { - foreach (SubDevice &sub, devices) { - mem.device = sub.device; - mem.device_pointer = (existing_key) ? 
sub.ptr_map[existing_key] : 0; - mem.device_size = existing_size; - - sub.device->mem_copy_to(mem); - sub.ptr_map[key] = mem.device_pointer; - } - } - else { - foreach (const vector<SubDevice *> &island, peer_islands) { - SubDevice *owner_sub = find_suitable_mem_device(existing_key, island); - mem.device = owner_sub->device; - mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0; - mem.device_size = existing_size; - - owner_sub->device->mem_copy_to(mem); - owner_sub->ptr_map[key] = mem.device_pointer; - - if (mem.type == MEM_GLOBAL || mem.type == MEM_TEXTURE) { - /* Need to create texture objects and update pointer in kernel globals on all devices */ - foreach (SubDevice *island_sub, island) { - if (island_sub != owner_sub) { - island_sub->device->mem_copy_to(mem); - } - } - } - } - } - - mem.device = this; - mem.device_pointer = key; - stats.mem_alloc(mem.device_size - existing_size); - } - - void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override - { - device_ptr key = mem.device_pointer; - int i = 0, sub_h = h / devices.size(); - - foreach (SubDevice &sub, devices) { - int sy = y + i * sub_h; - int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h; - - SubDevice *owner_sub = find_matching_mem_device(key, sub); - mem.device = owner_sub->device; - mem.device_pointer = owner_sub->ptr_map[key]; - - owner_sub->device->mem_copy_from(mem, sy, w, sh, elem); - i++; - } - - mem.device = this; - mem.device_pointer = key; - } - - void mem_zero(device_memory &mem) override - { - device_ptr existing_key = mem.device_pointer; - device_ptr key = (existing_key) ? 
existing_key : unique_key++; - size_t existing_size = mem.device_size; - - /* This is a hack to only allocate the tile buffers on denoising devices - * Similarly the tile buffers also need to be allocated separately on all devices so any - * overlap rendered for denoising does not interfere with each other */ - if (strcmp(mem.name, "RenderBuffers") == 0 && use_denoising) { - vector<device_ptr> device_pointers; - device_pointers.reserve(devices.size()); - - foreach (SubDevice &sub, devices) { - mem.device = sub.device; - mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0; - mem.device_size = existing_size; - - sub.device->mem_zero(mem); - sub.ptr_map[key] = mem.device_pointer; - - device_pointers.push_back(mem.device_pointer); - } - foreach (SubDevice &sub, denoising_devices) { - if (matching_rendering_and_denoising_devices) { - sub.ptr_map[key] = device_pointers.front(); - device_pointers.erase(device_pointers.begin()); - } - else { - mem.device = sub.device; - mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0; - mem.device_size = existing_size; - - sub.device->mem_zero(mem); - sub.ptr_map[key] = mem.device_pointer; - } - } - } - else { - foreach (const vector<SubDevice *> &island, peer_islands) { - SubDevice *owner_sub = find_suitable_mem_device(existing_key, island); - mem.device = owner_sub->device; - mem.device_pointer = (existing_key) ? 
owner_sub->ptr_map[existing_key] : 0; - mem.device_size = existing_size; - - owner_sub->device->mem_zero(mem); - owner_sub->ptr_map[key] = mem.device_pointer; - } - } - - mem.device = this; - mem.device_pointer = key; - stats.mem_alloc(mem.device_size - existing_size); - } - - void mem_free(device_memory &mem) override - { - device_ptr key = mem.device_pointer; - size_t existing_size = mem.device_size; - - /* Free memory that was allocated for all devices (see above) on each device */ - if (mem.type == MEM_PIXELS || (strcmp(mem.name, "RenderBuffers") == 0 && use_denoising)) { - foreach (SubDevice &sub, devices) { - mem.device = sub.device; - mem.device_pointer = sub.ptr_map[key]; - mem.device_size = existing_size; - - sub.device->mem_free(mem); - sub.ptr_map.erase(sub.ptr_map.find(key)); - } - foreach (SubDevice &sub, denoising_devices) { - if (matching_rendering_and_denoising_devices) { - sub.ptr_map.erase(key); - } - else { - mem.device = sub.device; - mem.device_pointer = sub.ptr_map[key]; - mem.device_size = existing_size; - - sub.device->mem_free(mem); - sub.ptr_map.erase(sub.ptr_map.find(key)); - } - } - } - else { - foreach (const vector<SubDevice *> &island, peer_islands) { - SubDevice *owner_sub = find_matching_mem_device(key, *island.front()); - mem.device = owner_sub->device; - mem.device_pointer = owner_sub->ptr_map[key]; - mem.device_size = existing_size; - - owner_sub->device->mem_free(mem); - owner_sub->ptr_map.erase(owner_sub->ptr_map.find(key)); - - if (mem.type == MEM_TEXTURE) { - /* Free texture objects on all devices */ - foreach (SubDevice *island_sub, island) { - if (island_sub != owner_sub) { - island_sub->device->mem_free(mem); - } - } - } - } - } - - mem.device = this; - mem.device_pointer = 0; - mem.device_size = 0; - stats.mem_free(existing_size); - } - - void const_copy_to(const char *name, void *host, size_t size) override - { - foreach (SubDevice &sub, devices) - sub.device->const_copy_to(name, host, size); - } - - void 
draw_pixels(device_memory &rgba, - int y, - int w, - int h, - int width, - int height, - int dx, - int dy, - int dw, - int dh, - bool transparent, - const DeviceDrawParams &draw_params) override - { - assert(rgba.type == MEM_PIXELS); - - device_ptr key = rgba.device_pointer; - int i = 0, sub_h = h / devices.size(); - int sub_height = height / devices.size(); - - foreach (SubDevice &sub, devices) { - int sy = y + i * sub_h; - int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h; - int sheight = (i == (int)devices.size() - 1) ? height - sub_height * i : sub_height; - int sdy = dy + i * sub_height; - /* adjust math for w/width */ - - rgba.device_pointer = sub.ptr_map[key]; - sub.device->draw_pixels( - rgba, sy, w, sh, width, sheight, dx, sdy, dw, dh, transparent, draw_params); - i++; - } - - rgba.device_pointer = key; - } - - void map_tile(Device *sub_device, RenderTile &tile) override - { - if (!tile.buffer) { - return; - } - - foreach (SubDevice &sub, devices) { - if (sub.device == sub_device) { - tile.buffer = find_matching_mem(tile.buffer, sub); - return; - } - } - - foreach (SubDevice &sub, denoising_devices) { - if (sub.device == sub_device) { - tile.buffer = sub.ptr_map[tile.buffer]; - return; - } - } - } - - int device_number(Device *sub_device) override - { - int i = 0; - - foreach (SubDevice &sub, devices) { - if (sub.device == sub_device) - return i; - i++; - } - - foreach (SubDevice &sub, denoising_devices) { - if (sub.device == sub_device) - return i; - i++; - } - - return -1; - } - - void map_neighbor_tiles(Device *sub_device, RenderTileNeighbors &neighbors) override - { - for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { - RenderTile &tile = neighbors.tiles[i]; - - if (!tile.buffers) { - continue; - } - - device_vector<float> &mem = tile.buffers->buffer; - tile.buffer = mem.device_pointer; - - if (mem.device == this && matching_rendering_and_denoising_devices) { - /* Skip unnecessary copies in viewport mode (buffer covers the - * whole 
image), but still need to fix up the tile device pointer. */ - map_tile(sub_device, tile); - continue; - } - - /* If the tile was rendered on another device, copy its memory to - * to the current device now, for the duration of the denoising task. - * Note that this temporarily modifies the RenderBuffers and calls - * the device, so this function is not thread safe. */ - if (mem.device != sub_device) { - /* Only copy from device to host once. This is faster, but - * also required for the case where a CPU thread is denoising - * a tile rendered on the GPU. In that case we have to avoid - * overwriting the buffer being de-noised by the CPU thread. */ - if (!tile.buffers->map_neighbor_copied) { - tile.buffers->map_neighbor_copied = true; - mem.copy_from_device(); - } - - if (mem.device == this) { - /* Can re-use memory if tile is already allocated on the sub device. */ - map_tile(sub_device, tile); - mem.swap_device(sub_device, mem.device_size, tile.buffer); - } - else { - mem.swap_device(sub_device, 0, 0); - } - - mem.copy_to_device(); - - tile.buffer = mem.device_pointer; - tile.device_size = mem.device_size; - - mem.restore_device(); - } - } - } - - void unmap_neighbor_tiles(Device *sub_device, RenderTileNeighbors &neighbors) override - { - RenderTile &target_tile = neighbors.target; - device_vector<float> &mem = target_tile.buffers->buffer; - - if (mem.device == this && matching_rendering_and_denoising_devices) { - return; - } - - /* Copy denoised result back to the host. */ - mem.swap_device(sub_device, target_tile.device_size, target_tile.buffer); - mem.copy_from_device(); - mem.restore_device(); - - /* Copy denoised result to the original device. 
*/ - mem.copy_to_device(); - - for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { - RenderTile &tile = neighbors.tiles[i]; - if (!tile.buffers) { - continue; - } - - device_vector<float> &mem = tile.buffers->buffer; - - if (mem.device != sub_device && mem.device != this) { - /* Free up memory again if it was allocated for the copy above. */ - mem.swap_device(sub_device, tile.device_size, tile.buffer); - sub_device->mem_free(mem); - mem.restore_device(); - } - } - } - - int get_split_task_count(DeviceTask &task) override - { - int total_tasks = 0; - list<DeviceTask> tasks; - task.split(tasks, devices.size()); - foreach (SubDevice &sub, devices) { - if (!tasks.empty()) { - DeviceTask subtask = tasks.front(); - tasks.pop_front(); - - total_tasks += sub.device->get_split_task_count(subtask); - } - } - return total_tasks; - } - - void task_add(DeviceTask &task) override - { - list<SubDevice> task_devices = devices; - if (!denoising_devices.empty()) { - if (task.type == DeviceTask::DENOISE_BUFFER) { - /* Denoising tasks should be redirected to the denoising devices entirely. */ - task_devices = denoising_devices; - } - else if (task.type == DeviceTask::RENDER && (task.tile_types & RenderTile::DENOISE)) { - const uint tile_types = task.tile_types; - /* For normal rendering tasks only redirect the denoising part to the denoising devices. - * Do not need to split the task here, since they all run through 'acquire_tile'. */ - task.tile_types = RenderTile::DENOISE; - foreach (SubDevice &sub, denoising_devices) { - sub.device->task_add(task); - } - /* Rendering itself should still be executed on the rendering devices. 
*/ - task.tile_types = tile_types ^ RenderTile::DENOISE; - } - } - - list<DeviceTask> tasks; - task.split(tasks, task_devices.size()); - - foreach (SubDevice &sub, task_devices) { - if (!tasks.empty()) { - DeviceTask subtask = tasks.front(); - tasks.pop_front(); - - if (task.buffer) - subtask.buffer = find_matching_mem(task.buffer, sub); - if (task.rgba_byte) - subtask.rgba_byte = sub.ptr_map[task.rgba_byte]; - if (task.rgba_half) - subtask.rgba_half = sub.ptr_map[task.rgba_half]; - if (task.shader_input) - subtask.shader_input = find_matching_mem(task.shader_input, sub); - if (task.shader_output) - subtask.shader_output = find_matching_mem(task.shader_output, sub); - - sub.device->task_add(subtask); - - if (task.buffers && task.buffers->buffer.device == this) { - /* Synchronize access to RenderBuffers, since 'map_neighbor_tiles' is not thread-safe. */ - sub.device->task_wait(); - } - } - } - } - - void task_wait() override - { - foreach (SubDevice &sub, devices) - sub.device->task_wait(); - foreach (SubDevice &sub, denoising_devices) - sub.device->task_wait(); - } - - void task_cancel() override - { - foreach (SubDevice &sub, devices) - sub.device->task_cancel(); - foreach (SubDevice &sub, denoising_devices) - sub.device->task_cancel(); - } -}; - -Device *device_multi_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) -{ - return new MultiDevice(info, stats, profiler, background); -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp deleted file mode 100644 index 8904b517e92..00000000000 --- a/intern/cycles/device/device_network.cpp +++ /dev/null @@ -1,812 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "device/device_network.h" -#include "device/device.h" -#include "device/device_intern.h" - -#include "util/util_foreach.h" -#include "util/util_logging.h" - -#if defined(WITH_NETWORK) - -CCL_NAMESPACE_BEGIN - -typedef map<device_ptr, device_ptr> PtrMap; -typedef vector<uint8_t> DataVector; -typedef map<device_ptr, DataVector> DataMap; - -/* tile list */ -typedef vector<RenderTile> TileList; - -/* search a list of tiles and find the one that matches the passed render tile */ -static TileList::iterator tile_list_find(TileList &tile_list, RenderTile &tile) -{ - for (TileList::iterator it = tile_list.begin(); it != tile_list.end(); ++it) - if (tile.x == it->x && tile.y == it->y && tile.start_sample == it->start_sample) - return it; - return tile_list.end(); -} - -class NetworkDevice : public Device { - public: - boost::asio::io_service io_service; - tcp::socket socket; - device_ptr mem_counter; - DeviceTask the_task; /* todo: handle multiple tasks */ - - thread_mutex rpc_lock; - - virtual bool show_samples() const - { - return false; - } - - NetworkDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, const char *address) - : Device(info, stats, profiler, true), socket(io_service) - { - error_func = NetworkError(); - stringstream portstr; - portstr << SERVER_PORT; - - tcp::resolver resolver(io_service); - tcp::resolver::query query(address, portstr.str()); - tcp::resolver::iterator endpoint_iterator = resolver.resolve(query); - tcp::resolver::iterator end; - - boost::system::error_code error = boost::asio::error::host_not_found; - 
while (error && endpoint_iterator != end) { - socket.close(); - socket.connect(*endpoint_iterator++, error); - } - - if (error) - error_func.network_error(error.message()); - - mem_counter = 0; - } - - ~NetworkDevice() - { - RPCSend snd(socket, &error_func, "stop"); - snd.write(); - } - - virtual BVHLayoutMask get_bvh_layout_mask() const - { - return BVH_LAYOUT_BVH2; - } - - void mem_alloc(device_memory &mem) - { - if (mem.name) { - VLOG(1) << "Buffer allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - } - - thread_scoped_lock lock(rpc_lock); - - mem.device_pointer = ++mem_counter; - - RPCSend snd(socket, &error_func, "mem_alloc"); - snd.add(mem); - snd.write(); - } - - void mem_copy_to(device_memory &mem) - { - thread_scoped_lock lock(rpc_lock); - - RPCSend snd(socket, &error_func, "mem_copy_to"); - - snd.add(mem); - snd.write(); - snd.write_buffer(mem.host_pointer, mem.memory_size()); - } - - void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) - { - thread_scoped_lock lock(rpc_lock); - - size_t data_size = mem.memory_size(); - - RPCSend snd(socket, &error_func, "mem_copy_from"); - - snd.add(mem); - snd.add(y); - snd.add(w); - snd.add(h); - snd.add(elem); - snd.write(); - - RPCReceive rcv(socket, &error_func); - rcv.read_buffer(mem.host_pointer, data_size); - } - - void mem_zero(device_memory &mem) - { - thread_scoped_lock lock(rpc_lock); - - RPCSend snd(socket, &error_func, "mem_zero"); - - snd.add(mem); - snd.write(); - } - - void mem_free(device_memory &mem) - { - if (mem.device_pointer) { - thread_scoped_lock lock(rpc_lock); - - RPCSend snd(socket, &error_func, "mem_free"); - - snd.add(mem); - snd.write(); - - mem.device_pointer = 0; - } - } - - void const_copy_to(const char *name, void *host, size_t size) - { - thread_scoped_lock lock(rpc_lock); - - RPCSend snd(socket, &error_func, "const_copy_to"); - - string name_string(name); - - 
snd.add(name_string); - snd.add(size); - snd.write(); - snd.write_buffer(host, size); - } - - bool load_kernels(const DeviceRequestedFeatures &requested_features) - { - if (error_func.have_error()) - return false; - - thread_scoped_lock lock(rpc_lock); - - RPCSend snd(socket, &error_func, "load_kernels"); - snd.add(requested_features.experimental); - snd.add(requested_features.max_closure); - snd.add(requested_features.max_nodes_group); - snd.add(requested_features.nodes_features); - snd.write(); - - bool result; - RPCReceive rcv(socket, &error_func); - rcv.read(result); - - return result; - } - - void task_add(DeviceTask &task) - { - thread_scoped_lock lock(rpc_lock); - - the_task = task; - - RPCSend snd(socket, &error_func, "task_add"); - snd.add(task); - snd.write(); - } - - void task_wait() - { - thread_scoped_lock lock(rpc_lock); - - RPCSend snd(socket, &error_func, "task_wait"); - snd.write(); - - lock.unlock(); - - TileList the_tiles; - - /* todo: run this threaded for connecting to multiple clients */ - for (;;) { - if (error_func.have_error()) - break; - - RenderTile tile; - - lock.lock(); - RPCReceive rcv(socket, &error_func); - - if (rcv.name == "acquire_tile") { - lock.unlock(); - - /* todo: watch out for recursive calls! 
*/ - if (the_task.acquire_tile(this, tile)) { /* write return as bool */ - the_tiles.push_back(tile); - - lock.lock(); - RPCSend snd(socket, &error_func, "acquire_tile"); - snd.add(tile); - snd.write(); - lock.unlock(); - } - else { - lock.lock(); - RPCSend snd(socket, &error_func, "acquire_tile_none"); - snd.write(); - lock.unlock(); - } - } - else if (rcv.name == "release_tile") { - rcv.read(tile); - lock.unlock(); - - TileList::iterator it = tile_list_find(the_tiles, tile); - if (it != the_tiles.end()) { - tile.buffers = it->buffers; - the_tiles.erase(it); - } - - assert(tile.buffers != NULL); - - the_task.release_tile(tile); - - lock.lock(); - RPCSend snd(socket, &error_func, "release_tile"); - snd.write(); - lock.unlock(); - } - else if (rcv.name == "task_wait_done") { - lock.unlock(); - break; - } - else - lock.unlock(); - } - } - - void task_cancel() - { - thread_scoped_lock lock(rpc_lock); - RPCSend snd(socket, &error_func, "task_cancel"); - snd.write(); - } - - int get_split_task_count(DeviceTask &) - { - return 1; - } - - private: - NetworkError error_func; -}; - -Device *device_network_create(DeviceInfo &info, - Stats &stats, - Profiler &profiler, - const char *address) -{ - return new NetworkDevice(info, stats, profiler, address); -} - -void device_network_info(vector<DeviceInfo> &devices) -{ - DeviceInfo info; - - info.type = DEVICE_NETWORK; - info.description = "Network Device"; - info.id = "NETWORK"; - info.num = 0; - - /* todo: get this info from device */ - info.has_volume_decoupled = false; - info.has_adaptive_stop_per_sample = false; - info.has_osl = false; - info.denoisers = DENOISER_NONE; - - devices.push_back(info); -} - -class DeviceServer { - public: - thread_mutex rpc_lock; - - void network_error(const string &message) - { - error_func.network_error(message); - } - - bool have_error() - { - return error_func.have_error(); - } - - DeviceServer(Device *device_, tcp::socket &socket_) - : device(device_), socket(socket_), stop(false), 
blocked_waiting(false) - { - error_func = NetworkError(); - } - - void listen() - { - /* receive remote function calls */ - for (;;) { - listen_step(); - - if (stop) - break; - } - } - - protected: - void listen_step() - { - thread_scoped_lock lock(rpc_lock); - RPCReceive rcv(socket, &error_func); - - if (rcv.name == "stop") - stop = true; - else - process(rcv, lock); - } - - /* create a memory buffer for a device buffer and insert it into mem_data */ - DataVector &data_vector_insert(device_ptr client_pointer, size_t data_size) - { - /* create a new DataVector and insert it into mem_data */ - pair<DataMap::iterator, bool> data_ins = mem_data.insert( - DataMap::value_type(client_pointer, DataVector())); - - /* make sure it was a unique insertion */ - assert(data_ins.second); - - /* get a reference to the inserted vector */ - DataVector &data_v = data_ins.first->second; - - /* size the vector */ - data_v.resize(data_size); - - return data_v; - } - - DataVector &data_vector_find(device_ptr client_pointer) - { - DataMap::iterator i = mem_data.find(client_pointer); - assert(i != mem_data.end()); - return i->second; - } - - /* setup mapping and reverse mapping of client_pointer<->real_pointer */ - void pointer_mapping_insert(device_ptr client_pointer, device_ptr real_pointer) - { - pair<PtrMap::iterator, bool> mapins; - - /* insert mapping from client pointer to our real device pointer */ - mapins = ptr_map.insert(PtrMap::value_type(client_pointer, real_pointer)); - assert(mapins.second); - - /* insert reverse mapping from real our device pointer to client pointer */ - mapins = ptr_imap.insert(PtrMap::value_type(real_pointer, client_pointer)); - assert(mapins.second); - } - - device_ptr device_ptr_from_client_pointer(device_ptr client_pointer) - { - PtrMap::iterator i = ptr_map.find(client_pointer); - assert(i != ptr_map.end()); - return i->second; - } - - device_ptr device_ptr_from_client_pointer_erase(device_ptr client_pointer) - { - PtrMap::iterator i = 
ptr_map.find(client_pointer); - assert(i != ptr_map.end()); - - device_ptr result = i->second; - - /* erase the mapping */ - ptr_map.erase(i); - - /* erase the reverse mapping */ - PtrMap::iterator irev = ptr_imap.find(result); - assert(irev != ptr_imap.end()); - ptr_imap.erase(irev); - - /* erase the data vector */ - DataMap::iterator idata = mem_data.find(client_pointer); - assert(idata != mem_data.end()); - mem_data.erase(idata); - - return result; - } - - /* note that the lock must be already acquired upon entry. - * This is necessary because the caller often peeks at - * the header and delegates control to here when it doesn't - * specifically handle the current RPC. - * The lock must be unlocked before returning */ - void process(RPCReceive &rcv, thread_scoped_lock &lock) - { - if (rcv.name == "mem_alloc") { - string name; - network_device_memory mem(device); - rcv.read(mem, name); - lock.unlock(); - - /* Allocate host side data buffer. */ - size_t data_size = mem.memory_size(); - device_ptr client_pointer = mem.device_pointer; - - DataVector &data_v = data_vector_insert(client_pointer, data_size); - mem.host_pointer = (data_size) ? (void *)&(data_v[0]) : 0; - - /* Perform the allocation on the actual device. */ - device->mem_alloc(mem); - - /* Store a mapping to/from client_pointer and real device pointer. */ - pointer_mapping_insert(client_pointer, mem.device_pointer); - } - else if (rcv.name == "mem_copy_to") { - string name; - network_device_memory mem(device); - rcv.read(mem, name); - lock.unlock(); - - size_t data_size = mem.memory_size(); - device_ptr client_pointer = mem.device_pointer; - - if (client_pointer) { - /* Lookup existing host side data buffer. */ - DataVector &data_v = data_vector_find(client_pointer); - mem.host_pointer = (void *)&data_v[0]; - - /* Translate the client pointer to a real device pointer. */ - mem.device_pointer = device_ptr_from_client_pointer(client_pointer); - } - else { - /* Allocate host side data buffer. 
*/ - DataVector &data_v = data_vector_insert(client_pointer, data_size); - mem.host_pointer = (data_size) ? (void *)&(data_v[0]) : 0; - } - - /* Copy data from network into memory buffer. */ - rcv.read_buffer((uint8_t *)mem.host_pointer, data_size); - - /* Copy the data from the memory buffer to the device buffer. */ - device->mem_copy_to(mem); - - if (!client_pointer) { - /* Store a mapping to/from client_pointer and real device pointer. */ - pointer_mapping_insert(client_pointer, mem.device_pointer); - } - } - else if (rcv.name == "mem_copy_from") { - string name; - network_device_memory mem(device); - int y, w, h, elem; - - rcv.read(mem, name); - rcv.read(y); - rcv.read(w); - rcv.read(h); - rcv.read(elem); - - device_ptr client_pointer = mem.device_pointer; - mem.device_pointer = device_ptr_from_client_pointer(client_pointer); - - DataVector &data_v = data_vector_find(client_pointer); - - mem.host_pointer = (device_ptr) & (data_v[0]); - - device->mem_copy_from(mem, y, w, h, elem); - - size_t data_size = mem.memory_size(); - - RPCSend snd(socket, &error_func, "mem_copy_from"); - snd.write(); - snd.write_buffer((uint8_t *)mem.host_pointer, data_size); - lock.unlock(); - } - else if (rcv.name == "mem_zero") { - string name; - network_device_memory mem(device); - rcv.read(mem, name); - lock.unlock(); - - size_t data_size = mem.memory_size(); - device_ptr client_pointer = mem.device_pointer; - - if (client_pointer) { - /* Lookup existing host side data buffer. */ - DataVector &data_v = data_vector_find(client_pointer); - mem.host_pointer = (void *)&data_v[0]; - - /* Translate the client pointer to a real device pointer. */ - mem.device_pointer = device_ptr_from_client_pointer(client_pointer); - } - else { - /* Allocate host side data buffer. */ - DataVector &data_v = data_vector_insert(client_pointer, data_size); - mem.host_pointer = (void *) ? (device_ptr) & (data_v[0]) : 0; - } - - /* Zero memory. 
*/ - device->mem_zero(mem); - - if (!client_pointer) { - /* Store a mapping to/from client_pointer and real device pointer. */ - pointer_mapping_insert(client_pointer, mem.device_pointer); - } - } - else if (rcv.name == "mem_free") { - string name; - network_device_memory mem(device); - - rcv.read(mem, name); - lock.unlock(); - - device_ptr client_pointer = mem.device_pointer; - - mem.device_pointer = device_ptr_from_client_pointer_erase(client_pointer); - - device->mem_free(mem); - } - else if (rcv.name == "const_copy_to") { - string name_string; - size_t size; - - rcv.read(name_string); - rcv.read(size); - - vector<char> host_vector(size); - rcv.read_buffer(&host_vector[0], size); - lock.unlock(); - - device->const_copy_to(name_string.c_str(), &host_vector[0], size); - } - else if (rcv.name == "load_kernels") { - DeviceRequestedFeatures requested_features; - rcv.read(requested_features.experimental); - rcv.read(requested_features.max_closure); - rcv.read(requested_features.max_nodes_group); - rcv.read(requested_features.nodes_features); - - bool result; - result = device->load_kernels(requested_features); - RPCSend snd(socket, &error_func, "load_kernels"); - snd.add(result); - snd.write(); - lock.unlock(); - } - else if (rcv.name == "task_add") { - DeviceTask task; - - rcv.read(task); - lock.unlock(); - - if (task.buffer) - task.buffer = device_ptr_from_client_pointer(task.buffer); - - if (task.rgba_half) - task.rgba_half = device_ptr_from_client_pointer(task.rgba_half); - - if (task.rgba_byte) - task.rgba_byte = device_ptr_from_client_pointer(task.rgba_byte); - - if (task.shader_input) - task.shader_input = device_ptr_from_client_pointer(task.shader_input); - - if (task.shader_output) - task.shader_output = device_ptr_from_client_pointer(task.shader_output); - - task.acquire_tile = function_bind(&DeviceServer::task_acquire_tile, this, _1, _2); - task.release_tile = function_bind(&DeviceServer::task_release_tile, this, _1); - task.update_progress_sample = 
function_bind(&DeviceServer::task_update_progress_sample, - this); - task.update_tile_sample = function_bind(&DeviceServer::task_update_tile_sample, this, _1); - task.get_cancel = function_bind(&DeviceServer::task_get_cancel, this); - - device->task_add(task); - } - else if (rcv.name == "task_wait") { - lock.unlock(); - - blocked_waiting = true; - device->task_wait(); - blocked_waiting = false; - - lock.lock(); - RPCSend snd(socket, &error_func, "task_wait_done"); - snd.write(); - lock.unlock(); - } - else if (rcv.name == "task_cancel") { - lock.unlock(); - device->task_cancel(); - } - else if (rcv.name == "acquire_tile") { - AcquireEntry entry; - entry.name = rcv.name; - rcv.read(entry.tile); - acquire_queue.push_back(entry); - lock.unlock(); - } - else if (rcv.name == "acquire_tile_none") { - AcquireEntry entry; - entry.name = rcv.name; - acquire_queue.push_back(entry); - lock.unlock(); - } - else if (rcv.name == "release_tile") { - AcquireEntry entry; - entry.name = rcv.name; - acquire_queue.push_back(entry); - lock.unlock(); - } - else { - cout << "Error: unexpected RPC receive call \"" + rcv.name + "\"\n"; - lock.unlock(); - } - } - - bool task_acquire_tile(Device *, RenderTile &tile) - { - thread_scoped_lock acquire_lock(acquire_mutex); - - bool result = false; - - RPCSend snd(socket, &error_func, "acquire_tile"); - snd.write(); - - do { - if (blocked_waiting) - listen_step(); - - /* todo: avoid busy wait loop */ - thread_scoped_lock lock(rpc_lock); - - if (!acquire_queue.empty()) { - AcquireEntry entry = acquire_queue.front(); - acquire_queue.pop_front(); - - if (entry.name == "acquire_tile") { - tile = entry.tile; - - if (tile.buffer) - tile.buffer = ptr_map[tile.buffer]; - - result = true; - break; - } - else if (entry.name == "acquire_tile_none") { - break; - } - else { - cout << "Error: unexpected acquire RPC receive call \"" + entry.name + "\"\n"; - } - } - } while (acquire_queue.empty() && !stop && !have_error()); - - return result; - } - - void 
task_update_progress_sample() - { - ; /* skip */ - } - - void task_update_tile_sample(RenderTile &) - { - ; /* skip */ - } - - void task_release_tile(RenderTile &tile) - { - thread_scoped_lock acquire_lock(acquire_mutex); - - if (tile.buffer) - tile.buffer = ptr_imap[tile.buffer]; - - { - thread_scoped_lock lock(rpc_lock); - RPCSend snd(socket, &error_func, "release_tile"); - snd.add(tile); - snd.write(); - lock.unlock(); - } - - do { - if (blocked_waiting) - listen_step(); - - /* todo: avoid busy wait loop */ - thread_scoped_lock lock(rpc_lock); - - if (!acquire_queue.empty()) { - AcquireEntry entry = acquire_queue.front(); - acquire_queue.pop_front(); - - if (entry.name == "release_tile") { - lock.unlock(); - break; - } - else { - cout << "Error: unexpected release RPC receive call \"" + entry.name + "\"\n"; - } - } - } while (acquire_queue.empty() && !stop); - } - - bool task_get_cancel() - { - return false; - } - - /* properties */ - Device *device; - tcp::socket &socket; - - /* mapping of remote to local pointer */ - PtrMap ptr_map; - PtrMap ptr_imap; - DataMap mem_data; - - struct AcquireEntry { - string name; - RenderTile tile; - }; - - thread_mutex acquire_mutex; - list<AcquireEntry> acquire_queue; - - bool stop; - bool blocked_waiting; - - private: - NetworkError error_func; - - /* todo: free memory and device (osl) on network error */ -}; - -void Device::server_run() -{ - try { - /* starts thread that responds to discovery requests */ - ServerDiscovery discovery; - - for (;;) { - /* accept connection */ - boost::asio::io_service io_service; - tcp::acceptor acceptor(io_service, tcp::endpoint(tcp::v4(), SERVER_PORT)); - - tcp::socket socket(io_service); - acceptor.accept(socket); - - string remote_address = socket.remote_endpoint().address().to_string(); - printf("Connected to remote client at: %s\n", remote_address.c_str()); - - DeviceServer server(this, socket); - server.listen(); - - printf("Disconnected.\n"); - } - } - catch (exception &e) { - 
fprintf(stderr, "Network server exception: %s\n", e.what()); - } -} - -CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/device/device_network.h b/intern/cycles/device/device_network.h deleted file mode 100644 index b3a0f6daa57..00000000000 --- a/intern/cycles/device/device_network.h +++ /dev/null @@ -1,490 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __DEVICE_NETWORK_H__ -#define __DEVICE_NETWORK_H__ - -#ifdef WITH_NETWORK - -# include <boost/archive/binary_iarchive.hpp> -# include <boost/archive/binary_oarchive.hpp> -# include <boost/archive/text_iarchive.hpp> -# include <boost/archive/text_oarchive.hpp> -# include <boost/array.hpp> -# include <boost/asio.hpp> -# include <boost/bind.hpp> -# include <boost/serialization/vector.hpp> -# include <boost/thread.hpp> - -# include <deque> -# include <iostream> -# include <sstream> - -# include "render/buffers.h" - -# include "util/util_foreach.h" -# include "util/util_list.h" -# include "util/util_map.h" -# include "util/util_param.h" -# include "util/util_string.h" - -CCL_NAMESPACE_BEGIN - -using std::cerr; -using std::cout; -using std::exception; -using std::hex; -using std::setw; - -using boost::asio::ip::tcp; - -static const int SERVER_PORT = 5120; -static const int DISCOVER_PORT = 5121; -static const string DISCOVER_REQUEST_MSG = "REQUEST_RENDER_SERVER_IP"; -static const string DISCOVER_REPLY_MSG = "REPLY_RENDER_SERVER_IP"; - -# 
if 0 -typedef boost::archive::text_oarchive o_archive; -typedef boost::archive::text_iarchive i_archive; -# else -typedef boost::archive::binary_oarchive o_archive; -typedef boost::archive::binary_iarchive i_archive; -# endif - -/* Serialization of device memory */ - -class network_device_memory : public device_memory { - public: - network_device_memory(Device *device) : device_memory(device, "", MEM_READ_ONLY) - { - } - - ~network_device_memory() - { - device_pointer = 0; - }; - - vector<char> local_data; -}; - -/* Common network error function / object for both DeviceNetwork and DeviceServer. */ -class NetworkError { - public: - NetworkError() - { - error = ""; - error_count = 0; - } - - ~NetworkError() - { - } - - void network_error(const string &message) - { - error = message; - error_count += 1; - } - - bool have_error() - { - return true ? error_count > 0 : false; - } - - private: - string error; - int error_count; -}; - -/* Remote procedure call Send */ - -class RPCSend { - public: - RPCSend(tcp::socket &socket_, NetworkError *e, const string &name_ = "") - : name(name_), socket(socket_), archive(archive_stream), sent(false) - { - archive &name_; - error_func = e; - fprintf(stderr, "rpc send %s\n", name.c_str()); - } - - ~RPCSend() - { - } - - void add(const device_memory &mem) - { - archive &mem.data_type &mem.data_elements &mem.data_size; - archive &mem.data_width &mem.data_height &mem.data_depth &mem.device_pointer; - archive &mem.type &string(mem.name); - archive &mem.interpolation &mem.extension; - archive &mem.device_pointer; - } - - template<typename T> void add(const T &data) - { - archive &data; - } - - void add(const DeviceTask &task) - { - int type = (int)task.type; - archive &type &task.x &task.y &task.w &task.h; - archive &task.rgba_byte &task.rgba_half &task.buffer &task.sample &task.num_samples; - archive &task.offset &task.stride; - archive &task.shader_input &task.shader_output &task.shader_eval_type; - archive &task.shader_x &task.shader_w; 
- archive &task.need_finish_queue; - } - - void add(const RenderTile &tile) - { - archive &tile.x &tile.y &tile.w &tile.h; - archive &tile.start_sample &tile.num_samples &tile.sample; - archive &tile.resolution &tile.offset &tile.stride; - archive &tile.buffer; - } - - void write() - { - boost::system::error_code error; - - /* get string from stream */ - string archive_str = archive_stream.str(); - - /* first send fixed size header with size of following data */ - ostringstream header_stream; - header_stream << setw(8) << hex << archive_str.size(); - string header_str = header_stream.str(); - - boost::asio::write( - socket, boost::asio::buffer(header_str), boost::asio::transfer_all(), error); - - if (error.value()) - error_func->network_error(error.message()); - - /* then send actual data */ - boost::asio::write( - socket, boost::asio::buffer(archive_str), boost::asio::transfer_all(), error); - - if (error.value()) - error_func->network_error(error.message()); - - sent = true; - } - - void write_buffer(void *buffer, size_t size) - { - boost::system::error_code error; - - boost::asio::write( - socket, boost::asio::buffer(buffer, size), boost::asio::transfer_all(), error); - - if (error.value()) - error_func->network_error(error.message()); - } - - protected: - string name; - tcp::socket &socket; - ostringstream archive_stream; - o_archive archive; - bool sent; - NetworkError *error_func; -}; - -/* Remote procedure call Receive */ - -class RPCReceive { - public: - RPCReceive(tcp::socket &socket_, NetworkError *e) - : socket(socket_), archive_stream(NULL), archive(NULL) - { - error_func = e; - /* read head with fixed size */ - vector<char> header(8); - boost::system::error_code error; - size_t len = boost::asio::read(socket, boost::asio::buffer(header), error); - - if (error.value()) { - error_func->network_error(error.message()); - } - - /* verify if we got something */ - if (len == header.size()) { - /* decode header */ - string header_str(&header[0], 
header.size()); - istringstream header_stream(header_str); - - size_t data_size; - - if ((header_stream >> hex >> data_size)) { - - vector<char> data(data_size); - size_t len = boost::asio::read(socket, boost::asio::buffer(data), error); - - if (error.value()) - error_func->network_error(error.message()); - - if (len == data_size) { - archive_str = (data.size()) ? string(&data[0], data.size()) : string(""); - - archive_stream = new istringstream(archive_str); - archive = new i_archive(*archive_stream); - - *archive &name; - fprintf(stderr, "rpc receive %s\n", name.c_str()); - } - else { - error_func->network_error("Network receive error: data size doesn't match header"); - } - } - else { - error_func->network_error("Network receive error: can't decode data size from header"); - } - } - else { - error_func->network_error("Network receive error: invalid header size"); - } - } - - ~RPCReceive() - { - delete archive; - delete archive_stream; - } - - void read(network_device_memory &mem, string &name) - { - *archive &mem.data_type &mem.data_elements &mem.data_size; - *archive &mem.data_width &mem.data_height &mem.data_depth &mem.device_pointer; - *archive &mem.type &name; - *archive &mem.interpolation &mem.extension; - *archive &mem.device_pointer; - - mem.name = name.c_str(); - mem.host_pointer = 0; - - /* Can't transfer OpenGL texture over network. 
*/ - if (mem.type == MEM_PIXELS) { - mem.type = MEM_READ_WRITE; - } - } - - template<typename T> void read(T &data) - { - *archive &data; - } - - void read_buffer(void *buffer, size_t size) - { - boost::system::error_code error; - size_t len = boost::asio::read(socket, boost::asio::buffer(buffer, size), error); - - if (error.value()) { - error_func->network_error(error.message()); - } - - if (len != size) - cout << "Network receive error: buffer size doesn't match expected size\n"; - } - - void read(DeviceTask &task) - { - int type; - - *archive &type &task.x &task.y &task.w &task.h; - *archive &task.rgba_byte &task.rgba_half &task.buffer &task.sample &task.num_samples; - *archive &task.offset &task.stride; - *archive &task.shader_input &task.shader_output &task.shader_eval_type; - *archive &task.shader_x &task.shader_w; - *archive &task.need_finish_queue; - - task.type = (DeviceTask::Type)type; - } - - void read(RenderTile &tile) - { - *archive &tile.x &tile.y &tile.w &tile.h; - *archive &tile.start_sample &tile.num_samples &tile.sample; - *archive &tile.resolution &tile.offset &tile.stride; - *archive &tile.buffer; - - tile.buffers = NULL; - } - - string name; - - protected: - tcp::socket &socket; - string archive_str; - istringstream *archive_stream; - i_archive *archive; - NetworkError *error_func; -}; - -/* Server auto discovery */ - -class ServerDiscovery { - public: - explicit ServerDiscovery(bool discover = false) - : listen_socket(io_service), collect_servers(false) - { - /* setup listen socket */ - listen_endpoint.address(boost::asio::ip::address_v4::any()); - listen_endpoint.port(DISCOVER_PORT); - - listen_socket.open(listen_endpoint.protocol()); - - boost::asio::socket_base::reuse_address option(true); - listen_socket.set_option(option); - - listen_socket.bind(listen_endpoint); - - /* setup receive callback */ - async_receive(); - - /* start server discovery */ - if (discover) { - collect_servers = true; - servers.clear(); - - 
broadcast_message(DISCOVER_REQUEST_MSG); - } - - /* start thread */ - work = new boost::asio::io_service::work(io_service); - thread = new boost::thread(boost::bind(&boost::asio::io_service::run, &io_service)); - } - - ~ServerDiscovery() - { - io_service.stop(); - thread->join(); - delete thread; - delete work; - } - - vector<string> get_server_list() - { - vector<string> result; - - mutex.lock(); - result = vector<string>(servers.begin(), servers.end()); - mutex.unlock(); - - return result; - } - - private: - void handle_receive_from(const boost::system::error_code &error, size_t size) - { - if (error) { - cout << "Server discovery receive error: " << error.message() << "\n"; - return; - } - - if (size > 0) { - string msg = string(receive_buffer, size); - - /* handle incoming message */ - if (collect_servers) { - if (msg == DISCOVER_REPLY_MSG) { - string address = receive_endpoint.address().to_string(); - - mutex.lock(); - - /* add address if it's not already in the list */ - bool found = std::find(servers.begin(), servers.end(), address) != servers.end(); - - if (!found) - servers.push_back(address); - - mutex.unlock(); - } - } - else { - /* reply to request */ - if (msg == DISCOVER_REQUEST_MSG) - broadcast_message(DISCOVER_REPLY_MSG); - } - } - - async_receive(); - } - - void async_receive() - { - listen_socket.async_receive_from(boost::asio::buffer(receive_buffer), - receive_endpoint, - boost::bind(&ServerDiscovery::handle_receive_from, - this, - boost::asio::placeholders::error, - boost::asio::placeholders::bytes_transferred)); - } - - void broadcast_message(const string &msg) - { - /* setup broadcast socket */ - boost::asio::ip::udp::socket socket(io_service); - - socket.open(boost::asio::ip::udp::v4()); - - boost::asio::socket_base::broadcast option(true); - socket.set_option(option); - - boost::asio::ip::udp::endpoint broadcast_endpoint( - boost::asio::ip::address::from_string("255.255.255.255"), DISCOVER_PORT); - - /* broadcast message */ - 
socket.send_to(boost::asio::buffer(msg), broadcast_endpoint); - } - - /* network service and socket */ - boost::asio::io_service io_service; - boost::asio::ip::udp::endpoint listen_endpoint; - boost::asio::ip::udp::socket listen_socket; - - /* threading */ - boost::thread *thread; - boost::asio::io_service::work *work; - boost::mutex mutex; - - /* buffer and endpoint for receiving messages */ - char receive_buffer[256]; - boost::asio::ip::udp::endpoint receive_endpoint; - - // os, version, devices, status, host name, group name, ip as far as fields go - struct ServerInfo { - string cycles_version; - string os; - int device_count; - string status; - string host_name; - string group_name; - string host_addr; - }; - - /* collection of server addresses in list */ - bool collect_servers; - vector<string> servers; -}; - -CCL_NAMESPACE_END - -#endif - -#endif /* __DEVICE_NETWORK_H__ */ diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp deleted file mode 100644 index 9abb7cfb7fe..00000000000 --- a/intern/cycles/device/device_opencl.cpp +++ /dev/null @@ -1,245 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifdef WITH_OPENCL - -# include "device/opencl/device_opencl.h" -# include "device/device.h" -# include "device/device_intern.h" - -# include "util/util_foreach.h" -# include "util/util_logging.h" -# include "util/util_set.h" -# include "util/util_string.h" - -CCL_NAMESPACE_BEGIN - -Device *device_opencl_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) -{ - return opencl_create_split_device(info, stats, profiler, background); -} - -bool device_opencl_init() -{ - static bool initialized = false; - static bool result = false; - - if (initialized) - return result; - - initialized = true; - - if (OpenCLInfo::device_type() != 0) { - int clew_result = clewInit(); - if (clew_result == CLEW_SUCCESS) { - VLOG(1) << "CLEW initialization succeeded."; - result = true; - } - else { - VLOG(1) << "CLEW initialization failed: " - << ((clew_result == CLEW_ERROR_ATEXIT_FAILED) ? "Error setting up atexit() handler" : - "Error opening the library"); - } - } - else { - VLOG(1) << "Skip initializing CLEW, platform is force disabled."; - result = false; - } - - return result; -} - -static cl_int device_opencl_get_num_platforms_safe(cl_uint *num_platforms) -{ -# ifdef _WIN32 - __try { - return clGetPlatformIDs(0, NULL, num_platforms); - } - __except (EXCEPTION_EXECUTE_HANDLER) { - /* Ignore crashes inside the OpenCL driver and hope we can - * survive even with corrupted OpenCL installs. */ - fprintf(stderr, "Cycles OpenCL: driver crashed, continuing without OpenCL.\n"); - } - - *num_platforms = 0; - return CL_DEVICE_NOT_FOUND; -# else - return clGetPlatformIDs(0, NULL, num_platforms); -# endif -} - -void device_opencl_info(vector<DeviceInfo> &devices) -{ - cl_uint num_platforms = 0; - device_opencl_get_num_platforms_safe(&num_platforms); - if (num_platforms == 0) { - return; - } - - vector<OpenCLPlatformDevice> usable_devices; - OpenCLInfo::get_usable_devices(&usable_devices); - /* Devices are numbered consecutively across platforms. 
*/ - int num_devices = 0; - set<string> unique_ids; - foreach (OpenCLPlatformDevice &platform_device, usable_devices) { - /* Compute unique ID for persistent user preferences. */ - const string &platform_name = platform_device.platform_name; - const string &device_name = platform_device.device_name; - string hardware_id = platform_device.hardware_id; - if (hardware_id == "") { - hardware_id = string_printf("ID_%d", num_devices); - } - string id = string("OPENCL_") + platform_name + "_" + device_name + "_" + hardware_id; - - /* Hardware ID might not be unique, add device number in that case. */ - if (unique_ids.find(id) != unique_ids.end()) { - id += string_printf("_ID_%d", num_devices); - } - unique_ids.insert(id); - - /* Create DeviceInfo. */ - DeviceInfo info; - info.type = DEVICE_OPENCL; - info.description = string_remove_trademark(string(device_name)); - info.num = num_devices; - /* We don't know if it's used for display, but assume it is. */ - info.display_device = true; - info.use_split_kernel = true; - info.has_volume_decoupled = false; - info.has_adaptive_stop_per_sample = false; - info.denoisers = DENOISER_NLM; - info.id = id; - - /* Check OpenCL extensions */ - info.has_half_images = platform_device.device_extensions.find("cl_khr_fp16") != string::npos; - - /* Disabled for now due to apparent AMD driver bug. */ - info.has_nanovdb = platform_name != "AMD Accelerated Parallel Processing"; - - devices.push_back(info); - num_devices++; - } -} - -string device_opencl_capabilities() -{ - if (OpenCLInfo::device_type() == 0) { - return "All OpenCL devices are forced to be OFF"; - } - string result = ""; - string error_msg = ""; /* Only used by opencl_assert(), but in the future - * it could also be nicely reported to the console. 
- */ - cl_uint num_platforms = 0; - opencl_assert(device_opencl_get_num_platforms_safe(&num_platforms)); - if (num_platforms == 0) { - return "No OpenCL platforms found\n"; - } - result += string_printf("Number of platforms: %u\n", num_platforms); - - vector<cl_platform_id> platform_ids; - platform_ids.resize(num_platforms); - opencl_assert(clGetPlatformIDs(num_platforms, &platform_ids[0], NULL)); - -# define APPEND_INFO(func, id, name, what, type) \ - do { \ - type data; \ - memset(&data, 0, sizeof(data)); \ - opencl_assert(func(id, what, sizeof(data), &data, NULL)); \ - result += string_printf("%s: %s\n", name, to_string(data).c_str()); \ - } while (false) -# define APPEND_STRING_INFO_IMPL(func, id, name, what, is_optional) \ - do { \ - string value; \ - size_t length = 0; \ - if (func(id, what, 0, NULL, &length) == CL_SUCCESS) { \ - vector<char> buffer(length + 1); \ - if (func(id, what, buffer.size(), buffer.data(), NULL) == CL_SUCCESS) { \ - value = string(buffer.data()); \ - } \ - } \ - if (is_optional && !(length != 0 && value[0] != '\0')) { \ - break; \ - } \ - result += string_printf("%s: %s\n", name, value.c_str()); \ - } while (false) -# define APPEND_PLATFORM_STRING_INFO(id, name, what) \ - APPEND_STRING_INFO_IMPL(clGetPlatformInfo, id, "\tPlatform " name, what, false) -# define APPEND_STRING_EXTENSION_INFO(func, id, name, what) \ - APPEND_STRING_INFO_IMPL(clGetPlatformInfo, id, "\tPlatform " name, what, true) -# define APPEND_PLATFORM_INFO(id, name, what, type) \ - APPEND_INFO(clGetPlatformInfo, id, "\tPlatform " name, what, type) -# define APPEND_DEVICE_INFO(id, name, what, type) \ - APPEND_INFO(clGetDeviceInfo, id, "\t\t\tDevice " name, what, type) -# define APPEND_DEVICE_STRING_INFO(id, name, what) \ - APPEND_STRING_INFO_IMPL(clGetDeviceInfo, id, "\t\t\tDevice " name, what, false) -# define APPEND_DEVICE_STRING_EXTENSION_INFO(id, name, what) \ - APPEND_STRING_INFO_IMPL(clGetDeviceInfo, id, "\t\t\tDevice " name, what, true) - - vector<cl_device_id> 
device_ids; - for (cl_uint platform = 0; platform < num_platforms; ++platform) { - cl_platform_id platform_id = platform_ids[platform]; - - result += string_printf("Platform #%u\n", platform); - - APPEND_PLATFORM_STRING_INFO(platform_id, "Name", CL_PLATFORM_NAME); - APPEND_PLATFORM_STRING_INFO(platform_id, "Vendor", CL_PLATFORM_VENDOR); - APPEND_PLATFORM_STRING_INFO(platform_id, "Version", CL_PLATFORM_VERSION); - APPEND_PLATFORM_STRING_INFO(platform_id, "Profile", CL_PLATFORM_PROFILE); - APPEND_PLATFORM_STRING_INFO(platform_id, "Extensions", CL_PLATFORM_EXTENSIONS); - - cl_uint num_devices = 0; - opencl_assert( - clGetDeviceIDs(platform_ids[platform], CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices)); - result += string_printf("\tNumber of devices: %u\n", num_devices); - - device_ids.resize(num_devices); - opencl_assert(clGetDeviceIDs( - platform_ids[platform], CL_DEVICE_TYPE_ALL, num_devices, &device_ids[0], NULL)); - for (cl_uint device = 0; device < num_devices; ++device) { - cl_device_id device_id = device_ids[device]; - - result += string_printf("\t\tDevice: #%u\n", device); - - APPEND_DEVICE_STRING_INFO(device_id, "Name", CL_DEVICE_NAME); - APPEND_DEVICE_STRING_EXTENSION_INFO(device_id, "Board Name", CL_DEVICE_BOARD_NAME_AMD); - APPEND_DEVICE_STRING_INFO(device_id, "Vendor", CL_DEVICE_VENDOR); - APPEND_DEVICE_STRING_INFO(device_id, "OpenCL C Version", CL_DEVICE_OPENCL_C_VERSION); - APPEND_DEVICE_STRING_INFO(device_id, "Profile", CL_DEVICE_PROFILE); - APPEND_DEVICE_STRING_INFO(device_id, "Version", CL_DEVICE_VERSION); - APPEND_DEVICE_STRING_INFO(device_id, "Extensions", CL_DEVICE_EXTENSIONS); - APPEND_DEVICE_INFO( - device_id, "Max clock frequency (MHz)", CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint); - APPEND_DEVICE_INFO(device_id, "Max compute units", CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint); - APPEND_DEVICE_INFO(device_id, "Max work group size", CL_DEVICE_MAX_WORK_GROUP_SIZE, size_t); - } - } - -# undef APPEND_INFO -# undef APPEND_STRING_INFO_IMPL -# undef 
APPEND_PLATFORM_STRING_INFO -# undef APPEND_STRING_EXTENSION_INFO -# undef APPEND_PLATFORM_INFO -# undef APPEND_DEVICE_INFO -# undef APPEND_DEVICE_STRING_INFO -# undef APPEND_DEVICE_STRING_EXTENSION_INFO - - return result; -} - -CCL_NAMESPACE_END - -#endif /* WITH_OPENCL */ diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp deleted file mode 100644 index 6f9a7943722..00000000000 --- a/intern/cycles/device/device_optix.cpp +++ /dev/null @@ -1,1936 +0,0 @@ -/* - * Copyright 2019, NVIDIA Corporation. - * Copyright 2019, Blender Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifdef WITH_OPTIX - -# include "bvh/bvh.h" -# include "bvh/bvh_optix.h" -# include "device/cuda/device_cuda.h" -# include "device/device_denoising.h" -# include "device/device_intern.h" -# include "render/buffers.h" -# include "render/hair.h" -# include "render/mesh.h" -# include "render/object.h" -# include "render/scene.h" -# include "util/util_debug.h" -# include "util/util_logging.h" -# include "util/util_md5.h" -# include "util/util_path.h" -# include "util/util_progress.h" -# include "util/util_time.h" - -# ifdef WITH_CUDA_DYNLOAD -# include <cuew.h> -// Do not use CUDA SDK headers when using CUEW -# define OPTIX_DONT_INCLUDE_CUDA -# endif -# include <optix_function_table_definition.h> -# include <optix_stubs.h> - -// TODO(pmours): Disable this once drivers have native support -# define OPTIX_DENOISER_NO_PIXEL_STRIDE 1 - -CCL_NAMESPACE_BEGIN - -/* Make sure this stays in sync with kernel_globals.h */ -struct ShaderParams { - uint4 *input; - float4 *output; - int type; - int filter; - int sx; - int offset; - int sample; -}; -struct KernelParams { - WorkTile tile; - KernelData data; - ShaderParams shader; -# define KERNEL_TEX(type, name) const type *name; -# include "kernel/kernel_textures.h" -# undef KERNEL_TEX -}; - -# define check_result_cuda(stmt) \ - { \ - CUresult res = stmt; \ - if (res != CUDA_SUCCESS) { \ - const char *name; \ - cuGetErrorName(res, &name); \ - set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \ - return; \ - } \ - } \ - (void)0 -# define check_result_cuda_ret(stmt) \ - { \ - CUresult res = stmt; \ - if (res != CUDA_SUCCESS) { \ - const char *name; \ - cuGetErrorName(res, &name); \ - set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \ - return false; \ - } \ - } \ - (void)0 - -# define check_result_optix(stmt) \ - { \ - enum OptixResult res = stmt; \ - if (res != OPTIX_SUCCESS) { \ - const char *name = optixGetErrorName(res); \ - set_error(string_printf("%s in %s 
(device_optix.cpp:%d)", name, #stmt, __LINE__)); \ - return; \ - } \ - } \ - (void)0 -# define check_result_optix_ret(stmt) \ - { \ - enum OptixResult res = stmt; \ - if (res != OPTIX_SUCCESS) { \ - const char *name = optixGetErrorName(res); \ - set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \ - return false; \ - } \ - } \ - (void)0 - -# define launch_filter_kernel(func_name, w, h, args) \ - { \ - CUfunction func; \ - check_result_cuda_ret(cuModuleGetFunction(&func, cuFilterModule, func_name)); \ - check_result_cuda_ret(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1)); \ - int threads; \ - check_result_cuda_ret( \ - cuFuncGetAttribute(&threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ - threads = (int)sqrt((float)threads); \ - int xblocks = ((w) + threads - 1) / threads; \ - int yblocks = ((h) + threads - 1) / threads; \ - check_result_cuda_ret( \ - cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0)); \ - } \ - (void)0 - -class OptiXDevice : public CUDADevice { - - // List of OptiX program groups - enum { - PG_RGEN, - PG_MISS, - PG_HITD, // Default hit group - PG_HITS, // __SHADOW_RECORD_ALL__ hit group - PG_HITL, // __BVH_LOCAL__ hit group (only used for triangles) -# if OPTIX_ABI_VERSION >= 36 - PG_HITD_MOTION, - PG_HITS_MOTION, -# endif - PG_BAKE, // kernel_bake_evaluate - PG_DISP, // kernel_displace_evaluate - PG_BACK, // kernel_background_evaluate - PG_CALL, - NUM_PROGRAM_GROUPS = PG_CALL + 3 - }; - - // List of OptiX pipelines - enum { PIP_PATH_TRACE, PIP_SHADER_EVAL, NUM_PIPELINES }; - - // A single shader binding table entry - struct SbtRecord { - char header[OPTIX_SBT_RECORD_HEADER_SIZE]; - }; - - // Information stored about CUDA memory allocations - struct CUDAMem { - bool free_map_host = false; - CUarray array = NULL; - CUtexObject texobject = 0; - bool use_mapped_host = false; - }; - - // Helper class to manage current CUDA context - struct CUDAContextScope { - 
CUDAContextScope(CUcontext ctx) - { - cuCtxPushCurrent(ctx); - } - ~CUDAContextScope() - { - cuCtxPopCurrent(NULL); - } - }; - - // Use a pool with multiple threads to support launches with multiple CUDA streams - TaskPool task_pool; - - vector<CUstream> cuda_stream; - OptixDeviceContext context = NULL; - - OptixModule optix_module = NULL; // All necessary OptiX kernels are in one module - OptixModule builtin_modules[2] = {}; - OptixPipeline pipelines[NUM_PIPELINES] = {}; - - bool motion_blur = false; - device_vector<SbtRecord> sbt_data; - device_only_memory<KernelParams> launch_params; - OptixTraversableHandle tlas_handle = 0; - - OptixDenoiser denoiser = NULL; - device_only_memory<unsigned char> denoiser_state; - int denoiser_input_passes = 0; - - vector<device_only_memory<char>> delayed_free_bvh_memory; - thread_mutex delayed_free_bvh_mutex; - - public: - OptiXDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_) - : CUDADevice(info_, stats_, profiler_, background_), - sbt_data(this, "__sbt", MEM_READ_ONLY), - launch_params(this, "__params", false), - denoiser_state(this, "__denoiser_state", true) - { - // Store number of CUDA streams in device info - info.cpu_threads = DebugFlags().optix.cuda_streams; - - // Make the CUDA context current - if (!cuContext) { - return; // Do not initialize if CUDA context creation failed already - } - const CUDAContextScope scope(cuContext); - - // Create OptiX context for this device - OptixDeviceContextOptions options = {}; -# ifdef WITH_CYCLES_LOGGING - options.logCallbackLevel = 4; // Fatal = 1, Error = 2, Warning = 3, Print = 4 - options.logCallbackFunction = - [](unsigned int level, const char *, const char *message, void *) { - switch (level) { - case 1: - LOG_IF(FATAL, VLOG_IS_ON(1)) << message; - break; - case 2: - LOG_IF(ERROR, VLOG_IS_ON(1)) << message; - break; - case 3: - LOG_IF(WARNING, VLOG_IS_ON(1)) << message; - break; - case 4: - LOG_IF(INFO, VLOG_IS_ON(1)) << message; - break; - } - }; 
-# endif - check_result_optix(optixDeviceContextCreate(cuContext, &options, &context)); -# ifdef WITH_CYCLES_LOGGING - check_result_optix(optixDeviceContextSetLogCallback( - context, options.logCallbackFunction, options.logCallbackData, options.logCallbackLevel)); -# endif - - // Create launch streams - cuda_stream.resize(info.cpu_threads); - for (int i = 0; i < info.cpu_threads; ++i) - check_result_cuda(cuStreamCreate(&cuda_stream[i], CU_STREAM_NON_BLOCKING)); - - // Fix weird compiler bug that assigns wrong size - launch_params.data_elements = sizeof(KernelParams); - // Allocate launch parameter buffer memory on device - launch_params.alloc_to_device(info.cpu_threads); - } - ~OptiXDevice() - { - // Stop processing any more tasks - task_pool.cancel(); - - // Make CUDA context current - const CUDAContextScope scope(cuContext); - - free_bvh_memory_delayed(); - - sbt_data.free(); - texture_info.free(); - launch_params.free(); - denoiser_state.free(); - - // Unload modules - if (optix_module != NULL) - optixModuleDestroy(optix_module); - for (unsigned int i = 0; i < 2; ++i) - if (builtin_modules[i] != NULL) - optixModuleDestroy(builtin_modules[i]); - for (unsigned int i = 0; i < NUM_PIPELINES; ++i) - if (pipelines[i] != NULL) - optixPipelineDestroy(pipelines[i]); - - // Destroy launch streams - for (CUstream stream : cuda_stream) - cuStreamDestroy(stream); - - if (denoiser != NULL) - optixDenoiserDestroy(denoiser); - - optixDeviceContextDestroy(context); - } - - private: - bool show_samples() const override - { - // Only show samples if not rendering multiple tiles in parallel - return info.cpu_threads == 1; - } - - BVHLayoutMask get_bvh_layout_mask() const override - { - // CUDA kernels are used when doing baking, so need to build a BVH those can understand too! 
- if (optix_module == NULL) - return CUDADevice::get_bvh_layout_mask(); - - // OptiX has its own internal acceleration structure format - return BVH_LAYOUT_OPTIX; - } - - string compile_kernel_get_common_cflags(const DeviceRequestedFeatures &requested_features, - bool filter, - bool /*split*/) override - { - // Split kernel is not supported in OptiX - string common_cflags = CUDADevice::compile_kernel_get_common_cflags( - requested_features, filter, false); - - // Add OptiX SDK include directory to include paths - const char *optix_sdk_path = getenv("OPTIX_ROOT_DIR"); - if (optix_sdk_path) { - common_cflags += string_printf(" -I\"%s/include\"", optix_sdk_path); - } - - // Specialization for shader raytracing - if (requested_features.use_shader_raytrace) { - common_cflags += " --keep-device-functions"; - } - else { - common_cflags += " -D __NO_SHADER_RAYTRACE__"; - } - - return common_cflags; - } - - bool load_kernels(const DeviceRequestedFeatures &requested_features) override - { - if (have_error()) { - // Abort early if context creation failed already - return false; - } - - // Load CUDA modules because we need some of the utility kernels - if (!CUDADevice::load_kernels(requested_features)) { - return false; - } - - // Baking is currently performed using CUDA, so no need to load OptiX kernels - if (requested_features.use_baking) { - return true; - } - - const CUDAContextScope scope(cuContext); - - // Unload existing OptiX module and pipelines first - if (optix_module != NULL) { - optixModuleDestroy(optix_module); - optix_module = NULL; - } - for (unsigned int i = 0; i < 2; ++i) { - if (builtin_modules[i] != NULL) { - optixModuleDestroy(builtin_modules[i]); - builtin_modules[i] = NULL; - } - } - for (unsigned int i = 0; i < NUM_PIPELINES; ++i) { - if (pipelines[i] != NULL) { - optixPipelineDestroy(pipelines[i]); - pipelines[i] = NULL; - } - } - - OptixModuleCompileOptions module_options = {}; - module_options.maxRegisterCount = 0; // Do not set an explicit register 
limit - module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3; - module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO; - -# if OPTIX_ABI_VERSION >= 41 - module_options.boundValues = nullptr; - module_options.numBoundValues = 0; -# endif - - OptixPipelineCompileOptions pipeline_options = {}; - // Default to no motion blur and two-level graph, since it is the fastest option - pipeline_options.usesMotionBlur = false; - pipeline_options.traversableGraphFlags = - OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING; - pipeline_options.numPayloadValues = 6; - pipeline_options.numAttributeValues = 2; // u, v - pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE; - pipeline_options.pipelineLaunchParamsVariableName = "__params"; // See kernel_globals.h - -# if OPTIX_ABI_VERSION >= 36 - pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE; - if (requested_features.use_hair) { - if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) { - pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE; - } - else { - pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM; - } - } -# endif - - // Keep track of whether motion blur is enabled, so to enable/disable motion in BVH builds - // This is necessary since objects may be reported to have motion if the Vector pass is - // active, but may still need to be rendered without motion blur if that isn't active as well - motion_blur = requested_features.use_object_motion; - - if (motion_blur) { - pipeline_options.usesMotionBlur = true; - // Motion blur can insert motion transforms into the traversal graph - // It is no longer a two-level graph then, so need to set flags to allow any configuration - pipeline_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY; - } - - { // Load and compile PTX module with OptiX kernels - string ptx_data, ptx_filename = 
path_get(requested_features.use_shader_raytrace ? - "lib/kernel_optix_shader_raytrace.ptx" : - "lib/kernel_optix.ptx"); - if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) { - if (!getenv("OPTIX_ROOT_DIR")) { - set_error( - "Missing OPTIX_ROOT_DIR environment variable (which must be set with the path to " - "the Optix SDK to be able to compile Optix kernels on demand)."); - return false; - } - ptx_filename = compile_kernel(requested_features, "kernel_optix", "optix", true); - } - if (ptx_filename.empty() || !path_read_text(ptx_filename, ptx_data)) { - set_error("Failed to load OptiX kernel from '" + ptx_filename + "'"); - return false; - } - - check_result_optix_ret(optixModuleCreateFromPTX(context, - &module_options, - &pipeline_options, - ptx_data.data(), - ptx_data.size(), - nullptr, - 0, - &optix_module)); - } - - // Create program groups - OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {}; - OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {}; - OptixProgramGroupOptions group_options = {}; // There are no options currently - group_descs[PG_RGEN].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; - group_descs[PG_RGEN].raygen.module = optix_module; - // Ignore branched integrator for now (see "requested_features.use_integrator_branched") - group_descs[PG_RGEN].raygen.entryFunctionName = "__raygen__kernel_optix_path_trace"; - group_descs[PG_MISS].kind = OPTIX_PROGRAM_GROUP_KIND_MISS; - group_descs[PG_MISS].miss.module = optix_module; - group_descs[PG_MISS].miss.entryFunctionName = "__miss__kernel_optix_miss"; - group_descs[PG_HITD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; - group_descs[PG_HITD].hitgroup.moduleCH = optix_module; - group_descs[PG_HITD].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit"; - group_descs[PG_HITD].hitgroup.moduleAH = optix_module; - group_descs[PG_HITD].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_visibility_test"; - group_descs[PG_HITS].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; - 
group_descs[PG_HITS].hitgroup.moduleAH = optix_module; - group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit"; - - if (requested_features.use_hair) { - group_descs[PG_HITD].hitgroup.moduleIS = optix_module; - group_descs[PG_HITS].hitgroup.moduleIS = optix_module; - - // Add curve intersection programs - if (requested_features.use_hair_thick) { - // Slower programs for thick hair since that also slows down ribbons. - // Ideally this should not be needed. - group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_all"; - group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_all"; - } - else { - group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon"; - group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon"; - } - -# if OPTIX_ABI_VERSION >= 36 - if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) { - OptixBuiltinISOptions builtin_options = {}; - builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE; - builtin_options.usesMotionBlur = false; - - check_result_optix_ret(optixBuiltinISModuleGet( - context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[0])); - - group_descs[PG_HITD].hitgroup.moduleIS = builtin_modules[0]; - group_descs[PG_HITD].hitgroup.entryFunctionNameIS = nullptr; - group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0]; - group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr; - - if (motion_blur) { - builtin_options.usesMotionBlur = true; - - check_result_optix_ret(optixBuiltinISModuleGet( - context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[1])); - - group_descs[PG_HITD_MOTION] = group_descs[PG_HITD]; - group_descs[PG_HITD_MOTION].hitgroup.moduleIS = builtin_modules[1]; - group_descs[PG_HITS_MOTION] = group_descs[PG_HITS]; - group_descs[PG_HITS_MOTION].hitgroup.moduleIS = builtin_modules[1]; - } - } -# endif - } 
- - if (requested_features.use_subsurface || requested_features.use_shader_raytrace) { - // Add hit group for local intersections - group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; - group_descs[PG_HITL].hitgroup.moduleAH = optix_module; - group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit"; - } - - if (requested_features.use_baking) { - group_descs[PG_BAKE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; - group_descs[PG_BAKE].raygen.module = optix_module; - group_descs[PG_BAKE].raygen.entryFunctionName = "__raygen__kernel_optix_bake"; - } - - if (requested_features.use_true_displacement) { - group_descs[PG_DISP].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; - group_descs[PG_DISP].raygen.module = optix_module; - group_descs[PG_DISP].raygen.entryFunctionName = "__raygen__kernel_optix_displace"; - } - - if (requested_features.use_background_light) { - group_descs[PG_BACK].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; - group_descs[PG_BACK].raygen.module = optix_module; - group_descs[PG_BACK].raygen.entryFunctionName = "__raygen__kernel_optix_background"; - } - - // Shader raytracing replaces some functions with direct callables - if (requested_features.use_shader_raytrace) { - group_descs[PG_CALL + 0].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; - group_descs[PG_CALL + 0].callables.moduleDC = optix_module; - group_descs[PG_CALL + 0].callables.entryFunctionNameDC = "__direct_callable__svm_eval_nodes"; - group_descs[PG_CALL + 1].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; - group_descs[PG_CALL + 1].callables.moduleDC = optix_module; - group_descs[PG_CALL + 1].callables.entryFunctionNameDC = - "__direct_callable__kernel_volume_shadow"; - group_descs[PG_CALL + 2].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; - group_descs[PG_CALL + 2].callables.moduleDC = optix_module; - group_descs[PG_CALL + 2].callables.entryFunctionNameDC = - "__direct_callable__subsurface_scatter_multi_setup"; - } - - check_result_optix_ret(optixProgramGroupCreate( - context, 
group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups)); - - // Get program stack sizes - OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {}; - // Set up SBT, which in this case is used only to select between different programs - sbt_data.alloc(NUM_PROGRAM_GROUPS); - memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS); - for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) { - check_result_optix_ret(optixSbtRecordPackHeader(groups[i], &sbt_data[i])); - check_result_optix_ret(optixProgramGroupGetStackSize(groups[i], &stack_size[i])); - } - sbt_data.copy_to_device(); // Upload SBT to device - - // Calculate maximum trace continuation stack size - unsigned int trace_css = stack_size[PG_HITD].cssCH; - // This is based on the maximum of closest-hit and any-hit/intersection programs - trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH); - trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH); - trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH); -# if OPTIX_ABI_VERSION >= 36 - trace_css = std::max(trace_css, - stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH); - trace_css = std::max(trace_css, - stack_size[PG_HITS_MOTION].cssIS + stack_size[PG_HITS_MOTION].cssAH); -# endif - - OptixPipelineLinkOptions link_options = {}; - link_options.maxTraceDepth = 1; - link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO; -# if OPTIX_ABI_VERSION < 24 - link_options.overrideUsesMotionBlur = motion_blur; -# endif - - { // Create path tracing pipeline - vector<OptixProgramGroup> pipeline_groups; - pipeline_groups.reserve(NUM_PROGRAM_GROUPS); - pipeline_groups.push_back(groups[PG_RGEN]); - pipeline_groups.push_back(groups[PG_MISS]); - pipeline_groups.push_back(groups[PG_HITD]); - pipeline_groups.push_back(groups[PG_HITS]); - pipeline_groups.push_back(groups[PG_HITL]); -# if OPTIX_ABI_VERSION >= 36 - if (motion_blur) { - 
pipeline_groups.push_back(groups[PG_HITD_MOTION]); - pipeline_groups.push_back(groups[PG_HITS_MOTION]); - } -# endif - if (requested_features.use_shader_raytrace) { - pipeline_groups.push_back(groups[PG_CALL + 0]); - pipeline_groups.push_back(groups[PG_CALL + 1]); - pipeline_groups.push_back(groups[PG_CALL + 2]); - } - - check_result_optix_ret(optixPipelineCreate(context, - &pipeline_options, - &link_options, - pipeline_groups.data(), - pipeline_groups.size(), - nullptr, - 0, - &pipelines[PIP_PATH_TRACE])); - - // Combine ray generation and trace continuation stack size - const unsigned int css = stack_size[PG_RGEN].cssRG + link_options.maxTraceDepth * trace_css; - // Max direct callable depth is one of the following, so combine accordingly - // - __raygen__ -> svm_eval_nodes - // - __raygen__ -> kernel_volume_shadow -> svm_eval_nodes - // - __raygen__ -> subsurface_scatter_multi_setup -> svm_eval_nodes - const unsigned int dss = stack_size[PG_CALL + 0].dssDC + - std::max(stack_size[PG_CALL + 1].dssDC, - stack_size[PG_CALL + 2].dssDC); - - // Set stack size depending on pipeline options - check_result_optix_ret( - optixPipelineSetStackSize(pipelines[PIP_PATH_TRACE], - 0, - requested_features.use_shader_raytrace ? dss : 0, - css, - motion_blur ? 
3 : 2)); - } - - // Only need to create shader evaluation pipeline if one of these features is used: - const bool use_shader_eval_pipeline = requested_features.use_baking || - requested_features.use_background_light || - requested_features.use_true_displacement; - - if (use_shader_eval_pipeline) { // Create shader evaluation pipeline - vector<OptixProgramGroup> pipeline_groups; - pipeline_groups.reserve(NUM_PROGRAM_GROUPS); - pipeline_groups.push_back(groups[PG_BAKE]); - pipeline_groups.push_back(groups[PG_DISP]); - pipeline_groups.push_back(groups[PG_BACK]); - pipeline_groups.push_back(groups[PG_MISS]); - pipeline_groups.push_back(groups[PG_HITD]); - pipeline_groups.push_back(groups[PG_HITS]); - pipeline_groups.push_back(groups[PG_HITL]); -# if OPTIX_ABI_VERSION >= 36 - if (motion_blur) { - pipeline_groups.push_back(groups[PG_HITD_MOTION]); - pipeline_groups.push_back(groups[PG_HITS_MOTION]); - } -# endif - if (requested_features.use_shader_raytrace) { - pipeline_groups.push_back(groups[PG_CALL + 0]); - pipeline_groups.push_back(groups[PG_CALL + 1]); - pipeline_groups.push_back(groups[PG_CALL + 2]); - } - - check_result_optix_ret(optixPipelineCreate(context, - &pipeline_options, - &link_options, - pipeline_groups.data(), - pipeline_groups.size(), - nullptr, - 0, - &pipelines[PIP_SHADER_EVAL])); - - // Calculate continuation stack size based on the maximum of all ray generation stack sizes - const unsigned int css = std::max(stack_size[PG_BAKE].cssRG, - std::max(stack_size[PG_DISP].cssRG, - stack_size[PG_BACK].cssRG)) + - link_options.maxTraceDepth * trace_css; - const unsigned int dss = stack_size[PG_CALL + 0].dssDC + - std::max(stack_size[PG_CALL + 1].dssDC, - stack_size[PG_CALL + 2].dssDC); - - check_result_optix_ret( - optixPipelineSetStackSize(pipelines[PIP_SHADER_EVAL], - 0, - requested_features.use_shader_raytrace ? dss : 0, - css, - motion_blur ? 
3 : 2)); - } - - // Clean up program group objects - for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) { - optixProgramGroupDestroy(groups[i]); - } - - return true; - } - - void thread_run(DeviceTask &task, int thread_index) // Main task entry point - { - if (have_error()) - return; // Abort early if there was an error previously - - if (task.type == DeviceTask::RENDER) { - if (thread_index != 0) { - // Only execute denoising in a single thread (see also 'task_add') - task.tile_types &= ~RenderTile::DENOISE; - } - - RenderTile tile; - while (task.acquire_tile(this, tile, task.tile_types)) { - if (tile.task == RenderTile::PATH_TRACE) - launch_render(task, tile, thread_index); - else if (tile.task == RenderTile::BAKE) { - // Perform baking using CUDA, since it is not currently implemented in OptiX - device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY); - CUDADevice::render(task, tile, work_tiles); - } - else if (tile.task == RenderTile::DENOISE) - launch_denoise(task, tile); - task.release_tile(tile); - if (task.get_cancel() && !task.need_finish_queue) - break; // User requested cancellation - else if (have_error()) - break; // Abort rendering when encountering an error - } - } - else if (task.type == DeviceTask::SHADER) { - // CUDA kernels are used when doing baking - if (optix_module == NULL) - CUDADevice::shader(task); - else - launch_shader_eval(task, thread_index); - } - else if (task.type == DeviceTask::DENOISE_BUFFER) { - // Set up a single tile that covers the whole task and denoise it - RenderTile tile; - tile.x = task.x; - tile.y = task.y; - tile.w = task.w; - tile.h = task.h; - tile.buffer = task.buffer; - tile.num_samples = task.num_samples; - tile.start_sample = task.sample; - tile.offset = task.offset; - tile.stride = task.stride; - tile.buffers = task.buffers; - - launch_denoise(task, tile); - } - } - - void launch_render(DeviceTask &task, RenderTile &rtile, int thread_index) - { - assert(thread_index < launch_params.data_size); - 
- // Keep track of total render time of this tile - const scoped_timer timer(&rtile.buffers->render_time); - - WorkTile wtile; - wtile.x = rtile.x; - wtile.y = rtile.y; - wtile.w = rtile.w; - wtile.h = rtile.h; - wtile.offset = rtile.offset; - wtile.stride = rtile.stride; - wtile.buffer = (float *)rtile.buffer; - - const int end_sample = rtile.start_sample + rtile.num_samples; - // Keep this number reasonable to avoid running into TDRs - int step_samples = (info.display_device ? 8 : 32); - - // Offset into launch params buffer so that streams use separate data - device_ptr launch_params_ptr = launch_params.device_pointer + - thread_index * launch_params.data_elements; - - const CUDAContextScope scope(cuContext); - - for (int sample = rtile.start_sample; sample < end_sample;) { - // Copy work tile information to device - wtile.start_sample = sample; - wtile.num_samples = step_samples; - if (task.adaptive_sampling.use) { - wtile.num_samples = task.adaptive_sampling.align_samples(sample, step_samples); - } - wtile.num_samples = min(wtile.num_samples, end_sample - sample); - device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile); - check_result_cuda( - cuMemcpyHtoDAsync(d_wtile_ptr, &wtile, sizeof(wtile), cuda_stream[thread_index])); - - OptixShaderBindingTable sbt_params = {}; - sbt_params.raygenRecord = sbt_data.device_pointer + PG_RGEN * sizeof(SbtRecord); - sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord); - sbt_params.missRecordStrideInBytes = sizeof(SbtRecord); - sbt_params.missRecordCount = 1; - sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord); - sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord); -# if OPTIX_ABI_VERSION >= 36 - sbt_params.hitgroupRecordCount = 5; // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL -# else - sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITS, PG_HITL -# endif - sbt_params.callablesRecordBase = sbt_data.device_pointer + PG_CALL * 
sizeof(SbtRecord); - sbt_params.callablesRecordCount = 3; - sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord); - - // Launch the ray generation program - check_result_optix(optixLaunch(pipelines[PIP_PATH_TRACE], - cuda_stream[thread_index], - launch_params_ptr, - launch_params.data_elements, - &sbt_params, - // Launch with samples close to each other for better locality - wtile.w * wtile.num_samples, - wtile.h, - 1)); - - // Run the adaptive sampling kernels at selected samples aligned to step samples. - uint filter_sample = wtile.start_sample + wtile.num_samples - 1; - if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) { - adaptive_sampling_filter(filter_sample, &wtile, d_wtile_ptr, cuda_stream[thread_index]); - } - - // Wait for launch to finish - check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index])); - - // Update current sample, so it is displayed correctly - sample += wtile.num_samples; - rtile.sample = sample; - // Update task progress after the kernel completed rendering - task.update_progress(&rtile, wtile.w * wtile.h * wtile.num_samples); - - if (task.get_cancel() && !task.need_finish_queue) - return; // Cancel rendering - } - - // Finalize adaptive sampling - if (task.adaptive_sampling.use) { - device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile); - adaptive_sampling_post(rtile, &wtile, d_wtile_ptr, cuda_stream[thread_index]); - check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index])); - task.update_progress(&rtile, rtile.w * rtile.h * wtile.num_samples); - } - } - - bool launch_denoise(DeviceTask &task, RenderTile &rtile) - { - // Update current sample (for display and NLM denoising task) - rtile.sample = rtile.start_sample + rtile.num_samples; - - // Make CUDA context current now, since it is used for both denoising tasks - const CUDAContextScope scope(cuContext); - - // Choose between OptiX and NLM denoising - if (task.denoising.type == DENOISER_OPTIX) { - // Map 
neighboring tiles onto this device, indices are as following: - // Where index 4 is the center tile and index 9 is the target for the result. - // 0 1 2 - // 3 4 5 - // 6 7 8 9 - RenderTileNeighbors neighbors(rtile); - task.map_neighbor_tiles(neighbors, this); - RenderTile ¢er_tile = neighbors.tiles[RenderTileNeighbors::CENTER]; - RenderTile &target_tile = neighbors.target; - rtile = center_tile; // Tile may have been modified by mapping code - - // Calculate size of the tile to denoise (including overlap) - int4 rect = center_tile.bounds(); - // Overlap between tiles has to be at least 64 pixels - // TODO(pmours): Query this value from OptiX - rect = rect_expand(rect, 64); - int4 clip_rect = neighbors.bounds(); - rect = rect_clip(rect, clip_rect); - int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y); - int2 overlap_offset = make_int2(rtile.x - rect.x, rtile.y - rect.y); - - // Calculate byte offsets and strides - int pixel_stride = task.pass_stride * (int)sizeof(float); - int pixel_offset = (rtile.offset + rtile.x + rtile.y * rtile.stride) * pixel_stride; - const int pass_offset[3] = { - (task.pass_denoising_data + DENOISING_PASS_COLOR) * (int)sizeof(float), - (task.pass_denoising_data + DENOISING_PASS_ALBEDO) * (int)sizeof(float), - (task.pass_denoising_data + DENOISING_PASS_NORMAL) * (int)sizeof(float)}; - - // Start with the current tile pointer offset - int input_stride = pixel_stride; - device_ptr input_ptr = rtile.buffer + pixel_offset; - - // Copy tile data into a common buffer if necessary - device_only_memory<float> input(this, "denoiser input", true); - device_vector<TileInfo> tile_info_mem(this, "denoiser tile info", MEM_READ_ONLY); - - bool contiguous_memory = true; - for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { - if (neighbors.tiles[i].buffer && neighbors.tiles[i].buffer != rtile.buffer) { - contiguous_memory = false; - } - } - - if (contiguous_memory) { - // Tiles are in continous memory, so can just subtract overlap offset - 
input_ptr -= (overlap_offset.x + overlap_offset.y * rtile.stride) * pixel_stride; - // Stride covers the whole width of the image and not just a single tile - input_stride *= rtile.stride; - } - else { - // Adjacent tiles are in separate memory regions, so need to copy them into a single one - input.alloc_to_device(rect_size.x * rect_size.y * task.pass_stride); - // Start with the new input buffer - input_ptr = input.device_pointer; - // Stride covers the width of the new input buffer, which includes tile width and overlap - input_stride *= rect_size.x; - - TileInfo *tile_info = tile_info_mem.alloc(1); - for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { - tile_info->offsets[i] = neighbors.tiles[i].offset; - tile_info->strides[i] = neighbors.tiles[i].stride; - tile_info->buffers[i] = neighbors.tiles[i].buffer; - } - tile_info->x[0] = neighbors.tiles[3].x; - tile_info->x[1] = neighbors.tiles[4].x; - tile_info->x[2] = neighbors.tiles[5].x; - tile_info->x[3] = neighbors.tiles[5].x + neighbors.tiles[5].w; - tile_info->y[0] = neighbors.tiles[1].y; - tile_info->y[1] = neighbors.tiles[4].y; - tile_info->y[2] = neighbors.tiles[7].y; - tile_info->y[3] = neighbors.tiles[7].y + neighbors.tiles[7].h; - tile_info_mem.copy_to_device(); - - void *args[] = { - &input.device_pointer, &tile_info_mem.device_pointer, &rect.x, &task.pass_stride}; - launch_filter_kernel("kernel_cuda_filter_copy_input", rect_size.x, rect_size.y, args); - } - -# if OPTIX_DENOISER_NO_PIXEL_STRIDE - device_only_memory<float> input_rgb(this, "denoiser input rgb", true); - input_rgb.alloc_to_device(rect_size.x * rect_size.y * 3 * task.denoising.input_passes); - - void *input_args[] = {&input_rgb.device_pointer, - &input_ptr, - &rect_size.x, - &rect_size.y, - &input_stride, - &task.pass_stride, - const_cast<int *>(pass_offset), - &task.denoising.input_passes, - &rtile.sample}; - launch_filter_kernel( - "kernel_cuda_filter_convert_to_rgb", rect_size.x, rect_size.y, input_args); - - input_ptr = 
input_rgb.device_pointer; - pixel_stride = 3 * sizeof(float); - input_stride = rect_size.x * pixel_stride; -# endif - - const bool recreate_denoiser = (denoiser == NULL) || - (task.denoising.input_passes != denoiser_input_passes); - if (recreate_denoiser) { - // Destroy existing handle before creating new one - if (denoiser != NULL) { - optixDenoiserDestroy(denoiser); - } - - // Create OptiX denoiser handle on demand when it is first used - OptixDenoiserOptions denoiser_options = {}; - assert(task.denoising.input_passes >= 1 && task.denoising.input_passes <= 3); -# if OPTIX_ABI_VERSION >= 47 - denoiser_options.guideAlbedo = task.denoising.input_passes >= 2; - denoiser_options.guideNormal = task.denoising.input_passes >= 3; - check_result_optix_ret(optixDenoiserCreate( - context, OPTIX_DENOISER_MODEL_KIND_HDR, &denoiser_options, &denoiser)); -# else - denoiser_options.inputKind = static_cast<OptixDenoiserInputKind>( - OPTIX_DENOISER_INPUT_RGB + (task.denoising.input_passes - 1)); -# if OPTIX_ABI_VERSION < 28 - denoiser_options.pixelFormat = OPTIX_PIXEL_FORMAT_FLOAT3; -# endif - check_result_optix_ret(optixDenoiserCreate(context, &denoiser_options, &denoiser)); - check_result_optix_ret( - optixDenoiserSetModel(denoiser, OPTIX_DENOISER_MODEL_KIND_HDR, NULL, 0)); -# endif - - // OptiX denoiser handle was created with the requested number of input passes - denoiser_input_passes = task.denoising.input_passes; - } - - OptixDenoiserSizes sizes = {}; - check_result_optix_ret( - optixDenoiserComputeMemoryResources(denoiser, rect_size.x, rect_size.y, &sizes)); - -# if OPTIX_ABI_VERSION < 28 - const size_t scratch_size = sizes.recommendedScratchSizeInBytes; -# else - const size_t scratch_size = sizes.withOverlapScratchSizeInBytes; -# endif - const size_t scratch_offset = sizes.stateSizeInBytes; - - // Allocate denoiser state if tile size has changed since last setup - if (recreate_denoiser || (denoiser_state.data_width != rect_size.x || - denoiser_state.data_height != 
rect_size.y)) { - denoiser_state.alloc_to_device(scratch_offset + scratch_size); - - // Initialize denoiser state for the current tile size - check_result_optix_ret(optixDenoiserSetup(denoiser, - 0, - rect_size.x, - rect_size.y, - denoiser_state.device_pointer, - scratch_offset, - denoiser_state.device_pointer + scratch_offset, - scratch_size)); - - denoiser_state.data_width = rect_size.x; - denoiser_state.data_height = rect_size.y; - } - - // Set up input and output layer information - OptixImage2D input_layers[3] = {}; - OptixImage2D output_layers[1] = {}; - - for (int i = 0; i < 3; ++i) { -# if OPTIX_DENOISER_NO_PIXEL_STRIDE - input_layers[i].data = input_ptr + (rect_size.x * rect_size.y * pixel_stride * i); -# else - input_layers[i].data = input_ptr + pass_offset[i]; -# endif - input_layers[i].width = rect_size.x; - input_layers[i].height = rect_size.y; - input_layers[i].rowStrideInBytes = input_stride; - input_layers[i].pixelStrideInBytes = pixel_stride; - input_layers[i].format = OPTIX_PIXEL_FORMAT_FLOAT3; - } - -# if OPTIX_DENOISER_NO_PIXEL_STRIDE - output_layers[0].data = input_ptr; - output_layers[0].width = rect_size.x; - output_layers[0].height = rect_size.y; - output_layers[0].rowStrideInBytes = input_stride; - output_layers[0].pixelStrideInBytes = pixel_stride; - int2 output_offset = overlap_offset; - overlap_offset = make_int2(0, 0); // Not supported by denoiser API, so apply manually -# else - output_layers[0].data = target_tile.buffer + pixel_offset; - output_layers[0].width = target_tile.w; - output_layers[0].height = target_tile.h; - output_layers[0].rowStrideInBytes = target_tile.stride * pixel_stride; - output_layers[0].pixelStrideInBytes = pixel_stride; -# endif - output_layers[0].format = OPTIX_PIXEL_FORMAT_FLOAT3; - -# if OPTIX_ABI_VERSION >= 47 - OptixDenoiserLayer image_layers = {}; - image_layers.input = input_layers[0]; - image_layers.output = output_layers[0]; - - OptixDenoiserGuideLayer guide_layers = {}; - guide_layers.albedo = 
input_layers[1]; - guide_layers.normal = input_layers[2]; -# endif - - // Finally run denonising - OptixDenoiserParams params = {}; // All parameters are disabled/zero -# if OPTIX_ABI_VERSION >= 47 - check_result_optix_ret(optixDenoiserInvoke(denoiser, - NULL, - &params, - denoiser_state.device_pointer, - scratch_offset, - &guide_layers, - &image_layers, - 1, - overlap_offset.x, - overlap_offset.y, - denoiser_state.device_pointer + scratch_offset, - scratch_size)); -# else - check_result_optix_ret(optixDenoiserInvoke(denoiser, - NULL, - &params, - denoiser_state.device_pointer, - scratch_offset, - input_layers, - task.denoising.input_passes, - overlap_offset.x, - overlap_offset.y, - output_layers, - denoiser_state.device_pointer + scratch_offset, - scratch_size)); -# endif - -# if OPTIX_DENOISER_NO_PIXEL_STRIDE - void *output_args[] = {&input_ptr, - &target_tile.buffer, - &output_offset.x, - &output_offset.y, - &rect_size.x, - &rect_size.y, - &target_tile.x, - &target_tile.y, - &target_tile.w, - &target_tile.h, - &target_tile.offset, - &target_tile.stride, - &task.pass_stride, - &rtile.sample}; - launch_filter_kernel( - "kernel_cuda_filter_convert_from_rgb", target_tile.w, target_tile.h, output_args); -# endif - - check_result_cuda_ret(cuStreamSynchronize(0)); - - task.unmap_neighbor_tiles(neighbors, this); - } - else { - // Run CUDA denoising kernels - DenoisingTask denoising(this, task); - CUDADevice::denoise(rtile, denoising); - } - - // Update task progress after the denoiser completed processing - task.update_progress(&rtile, rtile.w * rtile.h); - - return true; - } - - void launch_shader_eval(DeviceTask &task, int thread_index) - { - unsigned int rgen_index = PG_BACK; - if (task.shader_eval_type >= SHADER_EVAL_BAKE) - rgen_index = PG_BAKE; - if (task.shader_eval_type == SHADER_EVAL_DISPLACE) - rgen_index = PG_DISP; - - const CUDAContextScope scope(cuContext); - - device_ptr launch_params_ptr = launch_params.device_pointer + - thread_index * 
launch_params.data_elements; - - for (int sample = 0; sample < task.num_samples; ++sample) { - ShaderParams params; - params.input = (uint4 *)task.shader_input; - params.output = (float4 *)task.shader_output; - params.type = task.shader_eval_type; - params.filter = task.shader_filter; - params.sx = task.shader_x; - params.offset = task.offset; - params.sample = sample; - - check_result_cuda(cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParams, shader), - &params, - sizeof(params), - cuda_stream[thread_index])); - - OptixShaderBindingTable sbt_params = {}; - sbt_params.raygenRecord = sbt_data.device_pointer + rgen_index * sizeof(SbtRecord); - sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord); - sbt_params.missRecordStrideInBytes = sizeof(SbtRecord); - sbt_params.missRecordCount = 1; - sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord); - sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord); -# if OPTIX_ABI_VERSION >= 36 - sbt_params.hitgroupRecordCount = 5; // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL -# else - sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITS, PG_HITL -# endif - sbt_params.callablesRecordBase = sbt_data.device_pointer + PG_CALL * sizeof(SbtRecord); - sbt_params.callablesRecordCount = 3; - sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord); - - check_result_optix(optixLaunch(pipelines[PIP_SHADER_EVAL], - cuda_stream[thread_index], - launch_params_ptr, - launch_params.data_elements, - &sbt_params, - task.shader_w, - 1, - 1)); - - check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index])); - - task.update_progress(NULL); - } - } - - bool build_optix_bvh(BVHOptiX *bvh, - OptixBuildOperation operation, - const OptixBuildInput &build_input, - uint16_t num_motion_steps) - { - /* Allocate and build acceleration structures only one at a time, to prevent parallel builds - * from running out of memory (since both original and compacted acceleration structure 
memory - * may be allocated at the same time for the duration of this function). The builds would - * otherwise happen on the same CUDA stream anyway. */ - static thread_mutex mutex; - thread_scoped_lock lock(mutex); - - const CUDAContextScope scope(cuContext); - - const bool use_fast_trace_bvh = (bvh->params.bvh_type == SceneParams::BVH_STATIC); - - // Compute memory usage - OptixAccelBufferSizes sizes = {}; - OptixAccelBuildOptions options = {}; - options.operation = operation; - if (use_fast_trace_bvh) { - VLOG(2) << "Using fast to trace OptiX BVH"; - options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION; - } - else { - VLOG(2) << "Using fast to update OptiX BVH"; - options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_BUILD | OPTIX_BUILD_FLAG_ALLOW_UPDATE; - } - - options.motionOptions.numKeys = num_motion_steps; - options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH; - options.motionOptions.timeBegin = 0.0f; - options.motionOptions.timeEnd = 1.0f; - - check_result_optix_ret( - optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes)); - - // Allocate required output buffers - device_only_memory<char> temp_mem(this, "optix temp as build mem", true); - temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8); - if (!temp_mem.device_pointer) - return false; // Make sure temporary memory allocation succeeded - - // Acceleration structure memory has to be allocated on the device (not allowed to be on host) - device_only_memory<char> &out_data = bvh->as_data; - if (operation == OPTIX_BUILD_OPERATION_BUILD) { - assert(out_data.device == this); - out_data.alloc_to_device(sizes.outputSizeInBytes); - if (!out_data.device_pointer) - return false; - } - else { - assert(out_data.device_pointer && out_data.device_size >= sizes.outputSizeInBytes); - } - - // Finally build the acceleration structure - OptixAccelEmitDesc compacted_size_prop = {}; - compacted_size_prop.type = 
OPTIX_PROPERTY_TYPE_COMPACTED_SIZE; - // A tiny space was allocated for this property at the end of the temporary buffer above - // Make sure this pointer is 8-byte aligned - compacted_size_prop.result = align_up(temp_mem.device_pointer + sizes.tempSizeInBytes, 8); - - OptixTraversableHandle out_handle = 0; - check_result_optix_ret(optixAccelBuild(context, - NULL, - &options, - &build_input, - 1, - temp_mem.device_pointer, - sizes.tempSizeInBytes, - out_data.device_pointer, - sizes.outputSizeInBytes, - &out_handle, - use_fast_trace_bvh ? &compacted_size_prop : NULL, - use_fast_trace_bvh ? 1 : 0)); - bvh->traversable_handle = static_cast<uint64_t>(out_handle); - - // Wait for all operations to finish - check_result_cuda_ret(cuStreamSynchronize(NULL)); - - // Compact acceleration structure to save memory (only if using fast trace as the - // OPTIX_BUILD_FLAG_ALLOW_COMPACTION flag is only set in this case). - if (use_fast_trace_bvh) { - uint64_t compacted_size = sizes.outputSizeInBytes; - check_result_cuda_ret( - cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size))); - - // Temporary memory is no longer needed, so free it now to make space - temp_mem.free(); - - // There is no point compacting if the size does not change - if (compacted_size < sizes.outputSizeInBytes) { - device_only_memory<char> compacted_data(this, "optix compacted as", false); - compacted_data.alloc_to_device(compacted_size); - if (!compacted_data.device_pointer) - // Do not compact if memory allocation for compacted acceleration structure fails - // Can just use the uncompacted one then, so succeed here regardless - return true; - - check_result_optix_ret(optixAccelCompact(context, - NULL, - out_handle, - compacted_data.device_pointer, - compacted_size, - &out_handle)); - bvh->traversable_handle = static_cast<uint64_t>(out_handle); - - // Wait for compaction to finish - check_result_cuda_ret(cuStreamSynchronize(NULL)); - - std::swap(out_data.device_size, 
compacted_data.device_size); - std::swap(out_data.device_pointer, compacted_data.device_pointer); - // Original acceleration structure memory is freed when 'compacted_data' goes out of scope - } - } - - return true; - } - - void build_bvh(BVH *bvh, Progress &progress, bool refit) override - { - if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2) { - /* For baking CUDA is used, build appropriate BVH for that. */ - Device::build_bvh(bvh, progress, refit); - return; - } - - const bool use_fast_trace_bvh = (bvh->params.bvh_type == SceneParams::BVH_STATIC); - - free_bvh_memory_delayed(); - - BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh); - - progress.set_substatus("Building OptiX acceleration structure"); - - if (!bvh->params.top_level) { - assert(bvh->objects.size() == 1 && bvh->geometry.size() == 1); - - OptixBuildOperation operation = OPTIX_BUILD_OPERATION_BUILD; - /* Refit is only possible when using fast to trace BVH (because AS is built with - * OPTIX_BUILD_FLAG_ALLOW_UPDATE only there, see above). 
*/ - if (refit && !use_fast_trace_bvh) { - assert(bvh_optix->traversable_handle != 0); - operation = OPTIX_BUILD_OPERATION_UPDATE; - } - else { - bvh_optix->as_data.free(); - bvh_optix->traversable_handle = 0; - } - - // Build bottom level acceleration structures (BLAS) - Geometry *const geom = bvh->geometry[0]; - if (geom->geometry_type == Geometry::HAIR) { - // Build BLAS for curve primitives - Hair *const hair = static_cast<Hair *const>(geom); - if (hair->num_curves() == 0) { - return; - } - - const size_t num_segments = hair->num_segments(); - - size_t num_motion_steps = 1; - Attribute *motion_keys = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); - if (motion_blur && hair->get_use_motion_blur() && motion_keys) { - num_motion_steps = hair->get_motion_steps(); - } - - device_vector<OptixAabb> aabb_data(this, "optix temp aabb data", MEM_READ_ONLY); -# if OPTIX_ABI_VERSION >= 36 - device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY); - device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY); - // Four control points for each curve segment - const size_t num_vertices = num_segments * 4; - if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) { - index_data.alloc(num_segments); - vertex_data.alloc(num_vertices * num_motion_steps); - } - else -# endif - aabb_data.alloc(num_segments * num_motion_steps); - - // Get AABBs for each motion step - for (size_t step = 0; step < num_motion_steps; ++step) { - // The center step for motion vertices is not stored in the attribute - const float3 *keys = hair->get_curve_keys().data(); - size_t center_step = (num_motion_steps - 1) / 2; - if (step != center_step) { - size_t attr_offset = (step > center_step) ? 
step - 1 : step; - // Technically this is a float4 array, but sizeof(float3) == sizeof(float4) - keys = motion_keys->data_float3() + attr_offset * hair->get_curve_keys().size(); - } - - for (size_t j = 0, i = 0; j < hair->num_curves(); ++j) { - const Hair::Curve curve = hair->get_curve(j); -# if OPTIX_ABI_VERSION >= 36 - const array<float> &curve_radius = hair->get_curve_radius(); -# endif - - for (int segment = 0; segment < curve.num_segments(); ++segment, ++i) { -# if OPTIX_ABI_VERSION >= 36 - if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) { - int k0 = curve.first_key + segment; - int k1 = k0 + 1; - int ka = max(k0 - 1, curve.first_key); - int kb = min(k1 + 1, curve.first_key + curve.num_keys - 1); - - const float4 px = make_float4(keys[ka].x, keys[k0].x, keys[k1].x, keys[kb].x); - const float4 py = make_float4(keys[ka].y, keys[k0].y, keys[k1].y, keys[kb].y); - const float4 pz = make_float4(keys[ka].z, keys[k0].z, keys[k1].z, keys[kb].z); - const float4 pw = make_float4( - curve_radius[ka], curve_radius[k0], curve_radius[k1], curve_radius[kb]); - - // Convert Catmull-Rom data to Bezier spline - static const float4 cr2bsp0 = make_float4(+7, -4, +5, -2) / 6.f; - static const float4 cr2bsp1 = make_float4(-2, 11, -4, +1) / 6.f; - static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f; - static const float4 cr2bsp3 = make_float4(-2, +5, -4, +7) / 6.f; - - index_data[i] = i * 4; - float4 *const v = vertex_data.data() + step * num_vertices + index_data[i]; - v[0] = make_float4( - dot(cr2bsp0, px), dot(cr2bsp0, py), dot(cr2bsp0, pz), dot(cr2bsp0, pw)); - v[1] = make_float4( - dot(cr2bsp1, px), dot(cr2bsp1, py), dot(cr2bsp1, pz), dot(cr2bsp1, pw)); - v[2] = make_float4( - dot(cr2bsp2, px), dot(cr2bsp2, py), dot(cr2bsp2, pz), dot(cr2bsp2, pw)); - v[3] = make_float4( - dot(cr2bsp3, px), dot(cr2bsp3, py), dot(cr2bsp3, pz), dot(cr2bsp3, pw)); - } - else -# endif - { - BoundBox bounds = BoundBox::empty; - curve.bounds_grow(segment, keys, 
hair->get_curve_radius().data(), bounds); - - const size_t index = step * num_segments + i; - aabb_data[index].minX = bounds.min.x; - aabb_data[index].minY = bounds.min.y; - aabb_data[index].minZ = bounds.min.z; - aabb_data[index].maxX = bounds.max.x; - aabb_data[index].maxY = bounds.max.y; - aabb_data[index].maxZ = bounds.max.z; - } - } - } - } - - // Upload AABB data to GPU - aabb_data.copy_to_device(); -# if OPTIX_ABI_VERSION >= 36 - index_data.copy_to_device(); - vertex_data.copy_to_device(); -# endif - - vector<device_ptr> aabb_ptrs; - aabb_ptrs.reserve(num_motion_steps); -# if OPTIX_ABI_VERSION >= 36 - vector<device_ptr> width_ptrs; - vector<device_ptr> vertex_ptrs; - width_ptrs.reserve(num_motion_steps); - vertex_ptrs.reserve(num_motion_steps); -# endif - for (size_t step = 0; step < num_motion_steps; ++step) { - aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb)); -# if OPTIX_ABI_VERSION >= 36 - const device_ptr base_ptr = vertex_data.device_pointer + - step * num_vertices * sizeof(float4); - width_ptrs.push_back(base_ptr + 3 * sizeof(float)); // Offset by vertex size - vertex_ptrs.push_back(base_ptr); -# endif - } - - // Force a single any-hit call, so shadow record-all behavior works correctly - unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL; - OptixBuildInput build_input = {}; -# if OPTIX_ABI_VERSION >= 36 - if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) { - build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES; - build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE; - build_input.curveArray.numPrimitives = num_segments; - build_input.curveArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data(); - build_input.curveArray.numVertices = num_vertices; - build_input.curveArray.vertexStrideInBytes = sizeof(float4); - build_input.curveArray.widthBuffers = (CUdeviceptr *)width_ptrs.data(); - build_input.curveArray.widthStrideInBytes = sizeof(float4); - 
build_input.curveArray.indexBuffer = (CUdeviceptr)index_data.device_pointer; - build_input.curveArray.indexStrideInBytes = sizeof(int); - build_input.curveArray.flag = build_flags; - build_input.curveArray.primitiveIndexOffset = hair->optix_prim_offset; - } - else -# endif - { - // Disable visibility test any-hit program, since it is already checked during - // intersection. Those trace calls that require anyhit can force it with a ray flag. - build_flags |= OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT; - - build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES; -# if OPTIX_ABI_VERSION < 23 - build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data(); - build_input.aabbArray.numPrimitives = num_segments; - build_input.aabbArray.strideInBytes = sizeof(OptixAabb); - build_input.aabbArray.flags = &build_flags; - build_input.aabbArray.numSbtRecords = 1; - build_input.aabbArray.primitiveIndexOffset = hair->optix_prim_offset; -# else - build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data(); - build_input.customPrimitiveArray.numPrimitives = num_segments; - build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb); - build_input.customPrimitiveArray.flags = &build_flags; - build_input.customPrimitiveArray.numSbtRecords = 1; - build_input.customPrimitiveArray.primitiveIndexOffset = hair->optix_prim_offset; -# endif - } - - if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) { - progress.set_error("Failed to build OptiX acceleration structure"); - } - } - else if (geom->geometry_type == Geometry::MESH || geom->geometry_type == Geometry::VOLUME) { - // Build BLAS for triangle primitives - Mesh *const mesh = static_cast<Mesh *const>(geom); - if (mesh->num_triangles() == 0) { - return; - } - - const size_t num_verts = mesh->get_verts().size(); - - size_t num_motion_steps = 1; - Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); - if (motion_blur && mesh->get_use_motion_blur() && 
motion_keys) { - num_motion_steps = mesh->get_motion_steps(); - } - - device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY); - index_data.alloc(mesh->get_triangles().size()); - memcpy(index_data.data(), - mesh->get_triangles().data(), - mesh->get_triangles().size() * sizeof(int)); - device_vector<float3> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY); - vertex_data.alloc(num_verts * num_motion_steps); - - for (size_t step = 0; step < num_motion_steps; ++step) { - const float3 *verts = mesh->get_verts().data(); - - size_t center_step = (num_motion_steps - 1) / 2; - // The center step for motion vertices is not stored in the attribute - if (step != center_step) { - verts = motion_keys->data_float3() + - (step > center_step ? step - 1 : step) * num_verts; - } - - memcpy(vertex_data.data() + num_verts * step, verts, num_verts * sizeof(float3)); - } - - // Upload triangle data to GPU - index_data.copy_to_device(); - vertex_data.copy_to_device(); - - vector<device_ptr> vertex_ptrs; - vertex_ptrs.reserve(num_motion_steps); - for (size_t step = 0; step < num_motion_steps; ++step) { - vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3)); - } - - // Force a single any-hit call, so shadow record-all behavior works correctly - unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL; - OptixBuildInput build_input = {}; - build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES; - build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data(); - build_input.triangleArray.numVertices = num_verts; - build_input.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3; - build_input.triangleArray.vertexStrideInBytes = sizeof(float3); - build_input.triangleArray.indexBuffer = index_data.device_pointer; - build_input.triangleArray.numIndexTriplets = mesh->num_triangles(); - build_input.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3; - 
build_input.triangleArray.indexStrideInBytes = 3 * sizeof(int); - build_input.triangleArray.flags = &build_flags; - // The SBT does not store per primitive data since Cycles already allocates separate - // buffers for that purpose. OptiX does not allow this to be zero though, so just pass in - // one and rely on that having the same meaning in this case. - build_input.triangleArray.numSbtRecords = 1; - build_input.triangleArray.primitiveIndexOffset = mesh->optix_prim_offset; - - if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) { - progress.set_error("Failed to build OptiX acceleration structure"); - } - } - } - else { - unsigned int num_instances = 0; - unsigned int max_num_instances = 0xFFFFFFFF; - - bvh_optix->as_data.free(); - bvh_optix->traversable_handle = 0; - bvh_optix->motion_transform_data.free(); - - optixDeviceContextGetProperty(context, - OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID, - &max_num_instances, - sizeof(max_num_instances)); - // Do not count first bit, which is used to distinguish instanced and non-instanced objects - max_num_instances >>= 1; - if (bvh->objects.size() > max_num_instances) { - progress.set_error( - "Failed to build OptiX acceleration structure because there are too many instances"); - return; - } - - // Fill instance descriptions -# if OPTIX_ABI_VERSION < 41 - device_vector<OptixAabb> aabbs(this, "optix tlas aabbs", MEM_READ_ONLY); - aabbs.alloc(bvh->objects.size()); -# endif - device_vector<OptixInstance> instances(this, "optix tlas instances", MEM_READ_ONLY); - instances.alloc(bvh->objects.size()); - - // Calculate total motion transform size and allocate memory for them - size_t motion_transform_offset = 0; - if (motion_blur) { - size_t total_motion_transform_size = 0; - for (Object *const ob : bvh->objects) { - if (ob->is_traceable() && ob->use_motion()) { - total_motion_transform_size = align_up(total_motion_transform_size, - OPTIX_TRANSFORM_BYTE_ALIGNMENT); - const size_t motion_keys = 
max(ob->get_motion().size(), 2) - 2; - total_motion_transform_size = total_motion_transform_size + - sizeof(OptixSRTMotionTransform) + - motion_keys * sizeof(OptixSRTData); - } - } - - assert(bvh_optix->motion_transform_data.device == this); - bvh_optix->motion_transform_data.alloc_to_device(total_motion_transform_size); - } - - for (Object *ob : bvh->objects) { - // Skip non-traceable objects - if (!ob->is_traceable()) - continue; - - BVHOptiX *const blas = static_cast<BVHOptiX *>(ob->get_geometry()->bvh); - OptixTraversableHandle handle = blas->traversable_handle; - -# if OPTIX_ABI_VERSION < 41 - OptixAabb &aabb = aabbs[num_instances]; - aabb.minX = ob->bounds.min.x; - aabb.minY = ob->bounds.min.y; - aabb.minZ = ob->bounds.min.z; - aabb.maxX = ob->bounds.max.x; - aabb.maxY = ob->bounds.max.y; - aabb.maxZ = ob->bounds.max.z; -# endif - - OptixInstance &instance = instances[num_instances++]; - memset(&instance, 0, sizeof(instance)); - - // Clear transform to identity matrix - instance.transform[0] = 1.0f; - instance.transform[5] = 1.0f; - instance.transform[10] = 1.0f; - - // Set user instance ID to object index (but leave low bit blank) - instance.instanceId = ob->get_device_index() << 1; - - // Have to have at least one bit in the mask, or else instance would always be culled - instance.visibilityMask = 1; - - if (ob->get_geometry()->has_volume) { - // Volumes have a special bit set in the visibility mask so a trace can mask only volumes - instance.visibilityMask |= 2; - } - - if (ob->get_geometry()->geometry_type == Geometry::HAIR) { - // Same applies to curves (so they can be skipped in local trace calls) - instance.visibilityMask |= 4; - -# if OPTIX_ABI_VERSION >= 36 - if (motion_blur && ob->get_geometry()->has_motion_blur() && - DebugFlags().optix.curves_api && - static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) { - // Select between motion blur and non-motion blur built-in intersection module - instance.sbtOffset = PG_HITD_MOTION - 
PG_HITD; - } -# endif - } - - // Insert motion traversable if object has motion - if (motion_blur && ob->use_motion()) { - size_t motion_keys = max(ob->get_motion().size(), 2) - 2; - size_t motion_transform_size = sizeof(OptixSRTMotionTransform) + - motion_keys * sizeof(OptixSRTData); - - const CUDAContextScope scope(cuContext); - - motion_transform_offset = align_up(motion_transform_offset, - OPTIX_TRANSFORM_BYTE_ALIGNMENT); - CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data.device_pointer + - motion_transform_offset; - motion_transform_offset += motion_transform_size; - - // Allocate host side memory for motion transform and fill it with transform data - OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>( - new uint8_t[motion_transform_size]); - motion_transform.child = handle; - motion_transform.motionOptions.numKeys = ob->get_motion().size(); - motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE; - motion_transform.motionOptions.timeBegin = 0.0f; - motion_transform.motionOptions.timeEnd = 1.0f; - - OptixSRTData *const srt_data = motion_transform.srtData; - array<DecomposedTransform> decomp(ob->get_motion().size()); - transform_motion_decompose( - decomp.data(), ob->get_motion().data(), ob->get_motion().size()); - - for (size_t i = 0; i < ob->get_motion().size(); ++i) { - // Scale - srt_data[i].sx = decomp[i].y.w; // scale.x.x - srt_data[i].sy = decomp[i].z.w; // scale.y.y - srt_data[i].sz = decomp[i].w.w; // scale.z.z - - // Shear - srt_data[i].a = decomp[i].z.x; // scale.x.y - srt_data[i].b = decomp[i].z.y; // scale.x.z - srt_data[i].c = decomp[i].w.x; // scale.y.z - assert(decomp[i].z.z == 0.0f); // scale.y.x - assert(decomp[i].w.y == 0.0f); // scale.z.x - assert(decomp[i].w.z == 0.0f); // scale.z.y - - // Pivot point - srt_data[i].pvx = 0.0f; - srt_data[i].pvy = 0.0f; - srt_data[i].pvz = 0.0f; - - // Rotation - srt_data[i].qx = decomp[i].x.x; - srt_data[i].qy = decomp[i].x.y; - srt_data[i].qz 
= decomp[i].x.z; - srt_data[i].qw = decomp[i].x.w; - - // Translation - srt_data[i].tx = decomp[i].y.x; - srt_data[i].ty = decomp[i].y.y; - srt_data[i].tz = decomp[i].y.z; - } - - // Upload motion transform to GPU - cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size); - delete[] reinterpret_cast<uint8_t *>(&motion_transform); - - // Disable instance transform if object uses motion transform already - instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM; - - // Get traversable handle to motion transform - optixConvertPointerToTraversableHandle(context, - motion_transform_gpu, - OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM, - &instance.traversableHandle); - } - else { - instance.traversableHandle = handle; - - if (ob->get_geometry()->is_instanced()) { - // Set transform matrix - memcpy(instance.transform, &ob->get_tfm(), sizeof(instance.transform)); - } - else { - // Disable instance transform if geometry already has it applied to vertex data - instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM; - // Non-instanced objects read ID from 'prim_object', so distinguish - // them from instanced objects with the low bit set - instance.instanceId |= 1; - } - } - } - - // Upload instance descriptions -# if OPTIX_ABI_VERSION < 41 - aabbs.resize(num_instances); - aabbs.copy_to_device(); -# endif - instances.resize(num_instances); - instances.copy_to_device(); - - // Build top-level acceleration structure (TLAS) - OptixBuildInput build_input = {}; - build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES; -# if OPTIX_ABI_VERSION < 41 // Instance AABBs no longer need to be set since OptiX 7.2 - build_input.instanceArray.aabbs = aabbs.device_pointer; - build_input.instanceArray.numAabbs = num_instances; -# endif - build_input.instanceArray.instances = instances.device_pointer; - build_input.instanceArray.numInstances = num_instances; - - if (!build_optix_bvh(bvh_optix, OPTIX_BUILD_OPERATION_BUILD, build_input, 0)) { - progress.set_error("Failed to build 
OptiX acceleration structure"); - } - tlas_handle = bvh_optix->traversable_handle; - } - } - - void release_optix_bvh(BVH *bvh) override - { - thread_scoped_lock lock(delayed_free_bvh_mutex); - /* Do delayed free of BVH memory, since geometry holding BVH might be deleted - * while GPU is still rendering. */ - BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh); - - delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->as_data)); - delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->motion_transform_data)); - bvh_optix->traversable_handle = 0; - } - - void free_bvh_memory_delayed() - { - thread_scoped_lock lock(delayed_free_bvh_mutex); - delayed_free_bvh_memory.free_memory(); - } - - void const_copy_to(const char *name, void *host, size_t size) override - { - // Set constant memory for CUDA module - // TODO(pmours): This is only used for tonemapping (see 'film_convert'). - // Could be removed by moving those functions to filter CUDA module. - CUDADevice::const_copy_to(name, host, size); - - if (strcmp(name, "__data") == 0) { - assert(size <= sizeof(KernelData)); - - // Update traversable handle (since it is different for each device on multi devices) - KernelData *const data = (KernelData *)host; - *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle; - - update_launch_params(offsetof(KernelParams, data), host, size); - return; - } - - // Update data storage pointers in launch parameters -# define KERNEL_TEX(data_type, tex_name) \ - if (strcmp(name, #tex_name) == 0) { \ - update_launch_params(offsetof(KernelParams, tex_name), host, size); \ - return; \ - } -# include "kernel/kernel_textures.h" -# undef KERNEL_TEX - } - - void update_launch_params(size_t offset, void *data, size_t data_size) - { - const CUDAContextScope scope(cuContext); - - for (int i = 0; i < info.cpu_threads; ++i) - check_result_cuda( - cuMemcpyHtoD(launch_params.device_pointer + i * launch_params.data_elements + offset, - data, - data_size)); - } - - void task_add(DeviceTask 
&task) override - { - // Upload texture information to device if it has changed since last launch - load_texture_info(); - - if (task.type == DeviceTask::FILM_CONVERT) { - // Execute in main thread because of OpenGL access - film_convert(task, task.buffer, task.rgba_byte, task.rgba_half); - return; - } - - if (task.type == DeviceTask::DENOISE_BUFFER) { - // Execute denoising in a single thread (e.g. to avoid race conditions during creation) - task_pool.push([=] { - DeviceTask task_copy = task; - thread_run(task_copy, 0); - }); - return; - } - - // Split task into smaller ones - list<DeviceTask> tasks; - task.split(tasks, info.cpu_threads); - - // Queue tasks in internal task pool - int task_index = 0; - for (DeviceTask &task : tasks) { - task_pool.push([=] { - // Using task index parameter instead of thread index, since number of CUDA streams may - // differ from number of threads - DeviceTask task_copy = task; - thread_run(task_copy, task_index); - }); - task_index++; - } - } - - void task_wait() override - { - // Wait for all queued tasks to finish - task_pool.wait_work(); - } - - void task_cancel() override - { - // Cancel any remaining tasks in the internal pool - task_pool.cancel(); - } -}; - -bool device_optix_init() -{ - if (g_optixFunctionTable.optixDeviceContextCreate != NULL) - return true; // Already initialized function table - - // Need to initialize CUDA as well - if (!device_cuda_init()) - return false; - - const OptixResult result = optixInit(); - - if (result == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) { - VLOG(1) << "OptiX initialization failed because the installed NVIDIA driver is too old. " - "Please update to the latest driver first!"; - return false; - } - else if (result != OPTIX_SUCCESS) { - VLOG(1) << "OptiX initialization failed with error code " << (unsigned int)result; - return false; - } - - // Loaded OptiX successfully! 
- return true; -} - -void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices) -{ - devices.reserve(cuda_devices.size()); - - // Simply add all supported CUDA devices as OptiX devices again - for (DeviceInfo info : cuda_devices) { - assert(info.type == DEVICE_CUDA); - - int major; - cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, info.num); - if (major < 5) { - continue; // Only Maxwell and up are supported by OptiX - } - - info.type = DEVICE_OPTIX; - info.id += "_OptiX"; - info.denoisers |= DENOISER_OPTIX; - info.has_branched_path = false; - - devices.push_back(info); - } -} - -Device *device_optix_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) -{ - return new OptiXDevice(info, stats, profiler, background); -} - -CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/device/device_queue.cpp b/intern/cycles/device/device_queue.cpp new file mode 100644 index 00000000000..a89ba68d62c --- /dev/null +++ b/intern/cycles/device/device_queue.cpp @@ -0,0 +1,87 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "device/device_queue.h" + +#include "util/util_algorithm.h" +#include "util/util_logging.h" +#include "util/util_time.h" + +#include <iomanip> + +CCL_NAMESPACE_BEGIN + +DeviceQueue::DeviceQueue(Device *device) + : device(device), last_kernels_enqueued_(0), last_sync_time_(0.0) +{ + DCHECK_NE(device, nullptr); +} + +DeviceQueue::~DeviceQueue() +{ + if (VLOG_IS_ON(3)) { + /* Print kernel execution times sorted by time. */ + vector<pair<DeviceKernelMask, double>> stats_sorted; + for (const auto &stat : stats_kernel_time_) { + stats_sorted.push_back(stat); + } + + sort(stats_sorted.begin(), + stats_sorted.end(), + [](const pair<DeviceKernelMask, double> &a, const pair<DeviceKernelMask, double> &b) { + return a.second > b.second; + }); + + VLOG(3) << "GPU queue stats:"; + for (const auto &[mask, time] : stats_sorted) { + VLOG(3) << " " << std::setfill(' ') << std::setw(10) << std::fixed << std::setprecision(5) + << std::right << time << "s: " << device_kernel_mask_as_string(mask); + } + } +} + +void DeviceQueue::debug_init_execution() +{ + if (VLOG_IS_ON(3)) { + last_sync_time_ = time_dt(); + last_kernels_enqueued_ = 0; + } +} + +void DeviceQueue::debug_enqueue(DeviceKernel kernel, const int work_size) +{ + if (VLOG_IS_ON(3)) { + VLOG(4) << "GPU queue launch " << device_kernel_as_string(kernel) << ", work_size " + << work_size; + last_kernels_enqueued_ |= (uint64_t(1) << (uint64_t)kernel); + } +} + +void DeviceQueue::debug_synchronize() +{ + if (VLOG_IS_ON(3)) { + const double new_time = time_dt(); + const double elapsed_time = new_time - last_sync_time_; + VLOG(4) << "GPU queue synchronize, elapsed " << std::setw(10) << elapsed_time << "s"; + + stats_kernel_time_[last_kernels_enqueued_] += elapsed_time; + + last_sync_time_ = new_time; + last_kernels_enqueued_ = 0; + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_queue.h b/intern/cycles/device/device_queue.h new file mode 100644 index 00000000000..edda3e61d51 --- /dev/null +++ 
b/intern/cycles/device/device_queue.h @@ -0,0 +1,113 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "device/device_kernel.h" + +#include "device/device_graphics_interop.h" +#include "util/util_logging.h" +#include "util/util_map.h" +#include "util/util_unique_ptr.h" + +CCL_NAMESPACE_BEGIN + +class Device; +class device_memory; + +struct KernelWorkTile; + +/* Abstraction of a command queue for a device. + * Provides API to schedule kernel execution in a specific queue with minimal possible overhead + * from driver side. + * + * This class encapsulates all properties needed for commands execution. */ +class DeviceQueue { + public: + virtual ~DeviceQueue(); + + /* Number of concurrent states to process for integrator, + * based on number of cores and/or available memory. */ + virtual int num_concurrent_states(const size_t state_size) const = 0; + + /* Number of states which keeps the device occupied with work without losing performance. + * The renderer will add more work (when available) when number of active paths falls below this + * value. */ + virtual int num_concurrent_busy_states() const = 0; + + /* Initialize execution of kernels on this queue. + * + * Will, for example, load all data required by the kernels from Device to global or path state. + * + * Use this method after device synchronization has finished before enqueueing any kernels.
 */ + virtual void init_execution() = 0; + + /* Test if an optional device kernel is available. */ + virtual bool kernel_available(DeviceKernel kernel) const = 0; + + /* Enqueue kernel execution. + * + * Execute the kernel work_size times on the device. + * Supported argument types: + * - int: pass pointer to the int + * - device memory: pass pointer to device_memory.device_pointer + * Return false if there was an error executing this or a previous kernel. */ + virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) = 0; + + /* Wait until all enqueued kernels have finished execution. + * Return false if there was an error executing any of the enqueued kernels. */ + virtual bool synchronize() = 0; + + /* Copy memory to/from device as part of the command queue, to ensure + * operations are done in order without having to synchronize. */ + virtual void zero_to_device(device_memory &mem) = 0; + virtual void copy_to_device(device_memory &mem) = 0; + virtual void copy_from_device(device_memory &mem) = 0; + + /* Graphics resources interoperability. + * + * The interoperability comes here by the meaning that the device is capable of computing result + * directly into an OpenGL (or other graphics library) buffer. */ + + /* Create graphics interoperability context which will be taking care of mapping graphics + * resource as a buffer writable by kernels of this device. */ + virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create() + { + LOG(FATAL) << "Request of GPU interop of a device which does not support it."; + return nullptr; + } + + /* Device this queue has been created for. */ + Device *device; + + protected: + /* Hide construction so that allocation via `Device` API is enforced. */ + explicit DeviceQueue(Device *device); + + /* Implementations call these from the corresponding methods to generate debugging logs.
 */ + void debug_init_execution(); + void debug_enqueue(DeviceKernel kernel, const int work_size); + void debug_synchronize(); + + /* Combination of kernels enqueued together since last synchronize. */ + DeviceKernelMask last_kernels_enqueued_; + /* Time of synchronize call. */ + double last_sync_time_; + /* Accumulated execution time for combinations of kernels launched together. */ + map<DeviceKernelMask, double> stats_kernel_time_; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp deleted file mode 100644 index 9889f688aaa..00000000000 --- a/intern/cycles/device/device_split_kernel.cpp +++ /dev/null @@ -1,389 +0,0 @@ -/* - * Copyright 2011-2016 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "device/device_split_kernel.h" - -#include "kernel/kernel_types.h" -#include "kernel/split/kernel_split_data_types.h" - -#include "util/util_logging.h" -#include "util/util_time.h" - -CCL_NAMESPACE_BEGIN - -static const double alpha = 0.1; /* alpha for rolling average */ - -DeviceSplitKernel::DeviceSplitKernel(Device *device) - : device(device), - split_data(device, "split_data"), - ray_state(device, "ray_state", MEM_READ_WRITE), - queue_index(device, "queue_index"), - use_queues_flag(device, "use_queues_flag"), - work_pool_wgs(device, "work_pool_wgs"), - kernel_data_initialized(false) -{ - avg_time_per_sample = 0.0; - - kernel_path_init = NULL; - kernel_scene_intersect = NULL; - kernel_lamp_emission = NULL; - kernel_do_volume = NULL; - kernel_queue_enqueue = NULL; - kernel_indirect_background = NULL; - kernel_shader_setup = NULL; - kernel_shader_sort = NULL; - kernel_shader_eval = NULL; - kernel_holdout_emission_blurring_pathtermination_ao = NULL; - kernel_subsurface_scatter = NULL; - kernel_direct_lighting = NULL; - kernel_shadow_blocked_ao = NULL; - kernel_shadow_blocked_dl = NULL; - kernel_enqueue_inactive = NULL; - kernel_next_iteration_setup = NULL; - kernel_indirect_subsurface = NULL; - kernel_buffer_update = NULL; - kernel_adaptive_stopping = NULL; - kernel_adaptive_filter_x = NULL; - kernel_adaptive_filter_y = NULL; - kernel_adaptive_adjust_samples = NULL; -} - -DeviceSplitKernel::~DeviceSplitKernel() -{ - split_data.free(); - ray_state.free(); - use_queues_flag.free(); - queue_index.free(); - work_pool_wgs.free(); - - delete kernel_path_init; - delete kernel_scene_intersect; - delete kernel_lamp_emission; - delete kernel_do_volume; - delete kernel_queue_enqueue; - delete kernel_indirect_background; - delete kernel_shader_setup; - delete kernel_shader_sort; - delete kernel_shader_eval; - delete kernel_holdout_emission_blurring_pathtermination_ao; - delete kernel_subsurface_scatter; - delete kernel_direct_lighting; - delete 
kernel_shadow_blocked_ao; - delete kernel_shadow_blocked_dl; - delete kernel_enqueue_inactive; - delete kernel_next_iteration_setup; - delete kernel_indirect_subsurface; - delete kernel_buffer_update; - delete kernel_adaptive_stopping; - delete kernel_adaptive_filter_x; - delete kernel_adaptive_filter_y; - delete kernel_adaptive_adjust_samples; -} - -bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures &requested_features) -{ -#define LOAD_KERNEL(name) \ - kernel_##name = get_split_kernel_function(#name, requested_features); \ - if (!kernel_##name) { \ - device->set_error(string("Split kernel error: failed to load kernel_") + #name); \ - return false; \ - } - - LOAD_KERNEL(path_init); - LOAD_KERNEL(scene_intersect); - LOAD_KERNEL(lamp_emission); - if (requested_features.use_volume) { - LOAD_KERNEL(do_volume); - } - LOAD_KERNEL(queue_enqueue); - LOAD_KERNEL(indirect_background); - LOAD_KERNEL(shader_setup); - LOAD_KERNEL(shader_sort); - LOAD_KERNEL(shader_eval); - LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao); - LOAD_KERNEL(subsurface_scatter); - LOAD_KERNEL(direct_lighting); - LOAD_KERNEL(shadow_blocked_ao); - LOAD_KERNEL(shadow_blocked_dl); - LOAD_KERNEL(enqueue_inactive); - LOAD_KERNEL(next_iteration_setup); - LOAD_KERNEL(indirect_subsurface); - LOAD_KERNEL(buffer_update); - LOAD_KERNEL(adaptive_stopping); - LOAD_KERNEL(adaptive_filter_x); - LOAD_KERNEL(adaptive_filter_y); - LOAD_KERNEL(adaptive_adjust_samples); - -#undef LOAD_KERNEL - - /* Re-initialiaze kernel-dependent data when kernels change. */ - kernel_data_initialized = false; - - return true; -} - -size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory &kg, - device_memory &data, - uint64_t max_buffer_size) -{ - uint64_t size_per_element = state_buffer_size(kg, data, 1024) / 1024; - VLOG(1) << "Split state element size: " << string_human_readable_number(size_per_element) - << " bytes. 
(" << string_human_readable_size(size_per_element) << ")."; - return max_buffer_size / size_per_element; -} - -bool DeviceSplitKernel::path_trace(DeviceTask &task, - RenderTile &tile, - device_memory &kgbuffer, - device_memory &kernel_data) -{ - if (device->have_error()) { - return false; - } - - /* Allocate all required global memory once. */ - if (!kernel_data_initialized) { - kernel_data_initialized = true; - - /* Set local size */ - int2 lsize = split_kernel_local_size(); - local_size[0] = lsize[0]; - local_size[1] = lsize[1]; - - /* Set global size */ - int2 gsize = split_kernel_global_size(kgbuffer, kernel_data, task); - - /* Make sure that set work size is a multiple of local - * work size dimensions. - */ - global_size[0] = round_up(gsize[0], local_size[0]); - global_size[1] = round_up(gsize[1], local_size[1]); - - int num_global_elements = global_size[0] * global_size[1]; - assert(num_global_elements % WORK_POOL_SIZE == 0); - - /* Calculate max groups */ - - /* Denotes the maximum work groups possible w.r.t. current requested tile size. */ - unsigned int work_pool_size = (device->info.type == DEVICE_CPU) ? WORK_POOL_SIZE_CPU : - WORK_POOL_SIZE_GPU; - unsigned int max_work_groups = num_global_elements / work_pool_size + 1; - - /* Allocate work_pool_wgs memory. 
*/ - work_pool_wgs.alloc_to_device(max_work_groups); - queue_index.alloc_to_device(NUM_QUEUES); - use_queues_flag.alloc_to_device(1); - split_data.alloc_to_device(state_buffer_size(kgbuffer, kernel_data, num_global_elements)); - ray_state.alloc(num_global_elements); - } - - /* Number of elements in the global state buffer */ - int num_global_elements = global_size[0] * global_size[1]; - -#define ENQUEUE_SPLIT_KERNEL(name, global_size, local_size) \ - if (device->have_error()) { \ - return false; \ - } \ - if (!kernel_##name->enqueue( \ - KernelDimensions(global_size, local_size), kgbuffer, kernel_data)) { \ - return false; \ - } - - tile.sample = tile.start_sample; - - /* for exponential increase between tile updates */ - int time_multiplier = 1; - - while (tile.sample < tile.start_sample + tile.num_samples) { - /* to keep track of how long it takes to run a number of samples */ - double start_time = time_dt(); - - /* initial guess to start rolling average */ - const int initial_num_samples = 1; - /* approx number of samples per second */ - const int samples_per_second = (avg_time_per_sample > 0.0) ? - int(double(time_multiplier) / avg_time_per_sample) + 1 : - initial_num_samples; - - RenderTile subtile = tile; - subtile.start_sample = tile.sample; - subtile.num_samples = samples_per_second; - - if (task.adaptive_sampling.use) { - subtile.num_samples = task.adaptive_sampling.align_samples(subtile.start_sample, - subtile.num_samples); - } - - /* Don't go beyond requested number of samples. 
*/ - subtile.num_samples = min(subtile.num_samples, - tile.start_sample + tile.num_samples - tile.sample); - - if (device->have_error()) { - return false; - } - - /* reset state memory here as global size for data_init - * kernel might not be large enough to do in kernel - */ - work_pool_wgs.zero_to_device(); - split_data.zero_to_device(); - ray_state.zero_to_device(); - - if (!enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size), - subtile, - num_global_elements, - kgbuffer, - kernel_data, - split_data, - ray_state, - queue_index, - use_queues_flag, - work_pool_wgs)) { - return false; - } - - ENQUEUE_SPLIT_KERNEL(path_init, global_size, local_size); - - bool activeRaysAvailable = true; - double cancel_time = DBL_MAX; - - while (activeRaysAvailable) { - /* Do path-iteration in host [Enqueue Path-iteration kernels. */ - for (int PathIter = 0; PathIter < 16; PathIter++) { - ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size); - if (kernel_do_volume) { - ENQUEUE_SPLIT_KERNEL(do_volume, global_size, local_size); - } - ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(indirect_background, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(shader_setup, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(shader_sort, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size); - ENQUEUE_SPLIT_KERNEL( - holdout_emission_blurring_pathtermination_ao, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(subsurface_scatter, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(shadow_blocked_ao, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(shadow_blocked_dl, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(enqueue_inactive, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(next_iteration_setup, 
global_size, local_size); - ENQUEUE_SPLIT_KERNEL(indirect_subsurface, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(buffer_update, global_size, local_size); - - if (task.get_cancel() && cancel_time == DBL_MAX) { - /* Wait up to twice as many seconds for current samples to finish - * to avoid artifacts in render result from ending too soon. - */ - cancel_time = time_dt() + 2.0 * time_multiplier; - } - - if (time_dt() > cancel_time) { - return true; - } - } - - /* Decide if we should exit path-iteration in host. */ - ray_state.copy_from_device(0, global_size[0] * global_size[1], 1); - - activeRaysAvailable = false; - - for (int rayStateIter = 0; rayStateIter < global_size[0] * global_size[1]; ++rayStateIter) { - if (!IS_STATE(ray_state.data(), rayStateIter, RAY_INACTIVE)) { - if (IS_STATE(ray_state.data(), rayStateIter, RAY_INVALID)) { - /* Something went wrong, abort to avoid looping endlessly. */ - device->set_error("Split kernel error: invalid ray state"); - return false; - } - - /* Not all rays are RAY_INACTIVE. 
*/ - activeRaysAvailable = true; - break; - } - } - - if (time_dt() > cancel_time) { - return true; - } - } - - int filter_sample = tile.sample + subtile.num_samples - 1; - if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) { - size_t buffer_size[2]; - buffer_size[0] = round_up(tile.w, local_size[0]); - buffer_size[1] = round_up(tile.h, local_size[1]); - kernel_adaptive_stopping->enqueue( - KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data); - buffer_size[0] = round_up(tile.h, local_size[0]); - buffer_size[1] = round_up(1, local_size[1]); - kernel_adaptive_filter_x->enqueue( - KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data); - buffer_size[0] = round_up(tile.w, local_size[0]); - buffer_size[1] = round_up(1, local_size[1]); - kernel_adaptive_filter_y->enqueue( - KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data); - } - - double time_per_sample = ((time_dt() - start_time) / subtile.num_samples); - - if (avg_time_per_sample == 0.0) { - /* start rolling average */ - avg_time_per_sample = time_per_sample; - } - else { - avg_time_per_sample = alpha * time_per_sample + (1.0 - alpha) * avg_time_per_sample; - } - -#undef ENQUEUE_SPLIT_KERNEL - - tile.sample += subtile.num_samples; - task.update_progress(&tile, tile.w * tile.h * subtile.num_samples); - - time_multiplier = min(time_multiplier << 1, 10); - - if (task.get_cancel()) { - return true; - } - } - - if (task.adaptive_sampling.use) { - /* Reset the start samples. 
*/ - RenderTile subtile = tile; - subtile.start_sample = tile.start_sample; - subtile.num_samples = tile.sample - tile.start_sample; - enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size), - subtile, - num_global_elements, - kgbuffer, - kernel_data, - split_data, - ray_state, - queue_index, - use_queues_flag, - work_pool_wgs); - size_t buffer_size[2]; - buffer_size[0] = round_up(tile.w, local_size[0]); - buffer_size[1] = round_up(tile.h, local_size[1]); - kernel_adaptive_adjust_samples->enqueue( - KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data); - } - - return true; -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h deleted file mode 100644 index 07a21b10299..00000000000 --- a/intern/cycles/device/device_split_kernel.h +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright 2011-2016 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __DEVICE_SPLIT_KERNEL_H__ -#define __DEVICE_SPLIT_KERNEL_H__ - -#include "device/device.h" -#include "render/buffers.h" - -CCL_NAMESPACE_BEGIN - -/* When allocate global memory in chunks. We may not be able to - * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes in chunks; - * Since some bytes may be needed for aligning chunks of memory; - * This is the amount of memory that we dedicate for that purpose. 
- */ -#define DATA_ALLOCATION_MEM_FACTOR 5000000 // 5MB - -/* Types used for split kernel */ - -class KernelDimensions { - public: - size_t global_size[2]; - size_t local_size[2]; - - KernelDimensions(size_t global_size_[2], size_t local_size_[2]) - { - memcpy(global_size, global_size_, sizeof(global_size)); - memcpy(local_size, local_size_, sizeof(local_size)); - } -}; - -class SplitKernelFunction { - public: - virtual ~SplitKernelFunction() - { - } - - /* enqueue the kernel, returns false if there is an error */ - virtual bool enqueue(const KernelDimensions &dim, device_memory &kg, device_memory &data) = 0; -}; - -class DeviceSplitKernel { - private: - Device *device; - - SplitKernelFunction *kernel_path_init; - SplitKernelFunction *kernel_scene_intersect; - SplitKernelFunction *kernel_lamp_emission; - SplitKernelFunction *kernel_do_volume; - SplitKernelFunction *kernel_queue_enqueue; - SplitKernelFunction *kernel_indirect_background; - SplitKernelFunction *kernel_shader_setup; - SplitKernelFunction *kernel_shader_sort; - SplitKernelFunction *kernel_shader_eval; - SplitKernelFunction *kernel_holdout_emission_blurring_pathtermination_ao; - SplitKernelFunction *kernel_subsurface_scatter; - SplitKernelFunction *kernel_direct_lighting; - SplitKernelFunction *kernel_shadow_blocked_ao; - SplitKernelFunction *kernel_shadow_blocked_dl; - SplitKernelFunction *kernel_enqueue_inactive; - SplitKernelFunction *kernel_next_iteration_setup; - SplitKernelFunction *kernel_indirect_subsurface; - SplitKernelFunction *kernel_buffer_update; - SplitKernelFunction *kernel_adaptive_stopping; - SplitKernelFunction *kernel_adaptive_filter_x; - SplitKernelFunction *kernel_adaptive_filter_y; - SplitKernelFunction *kernel_adaptive_adjust_samples; - - /* Global memory variables [porting]; These memory is used for - * co-operation between different kernels; Data written by one - * kernel will be available to another kernel via this global - * memory. 
- */ - device_only_memory<uchar> split_data; - device_vector<uchar> ray_state; - device_only_memory<int> - queue_index; /* Array of size num_queues that tracks the size of each queue. */ - - /* Flag to make sceneintersect and lampemission kernel use queues. */ - device_only_memory<char> use_queues_flag; - - /* Approximate time it takes to complete one sample */ - double avg_time_per_sample; - - /* Work pool with respect to each work group. */ - device_only_memory<unsigned int> work_pool_wgs; - - /* Cached kernel-dependent data, initialized once. */ - bool kernel_data_initialized; - size_t local_size[2]; - size_t global_size[2]; - - public: - explicit DeviceSplitKernel(Device *device); - virtual ~DeviceSplitKernel(); - - bool load_kernels(const DeviceRequestedFeatures &requested_features); - bool path_trace(DeviceTask &task, - RenderTile &rtile, - device_memory &kgbuffer, - device_memory &kernel_data); - - virtual uint64_t state_buffer_size(device_memory &kg, - device_memory &data, - size_t num_threads) = 0; - size_t max_elements_for_max_buffer_size(device_memory &kg, - device_memory &data, - uint64_t max_buffer_size); - - virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim, - RenderTile &rtile, - int num_global_elements, - device_memory &kernel_globals, - device_memory &kernel_data_, - device_memory &split_data, - device_memory &ray_state, - device_memory &queue_index, - device_memory &use_queues_flag, - device_memory &work_pool_wgs) = 0; - - virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name, - const DeviceRequestedFeatures &) = 0; - virtual int2 split_kernel_local_size() = 0; - virtual int2 split_kernel_global_size(device_memory &kg, - device_memory &data, - DeviceTask &task) = 0; -}; - -CCL_NAMESPACE_END - -#endif /* __DEVICE_SPLIT_KERNEL_H__ */ diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp deleted file mode 100644 index 55fbaa31e42..00000000000 --- 
a/intern/cycles/device/device_task.cpp +++ /dev/null @@ -1,182 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <stdlib.h> -#include <string.h> - -#include "device/device_task.h" - -#include "render/buffers.h" - -#include "util/util_algorithm.h" -#include "util/util_time.h" - -CCL_NAMESPACE_BEGIN - -/* Device Task */ - -DeviceTask::DeviceTask(Type type_) - : type(type_), - x(0), - y(0), - w(0), - h(0), - rgba_byte(0), - rgba_half(0), - buffer(0), - sample(0), - num_samples(1), - shader_input(0), - shader_output(0), - shader_eval_type(0), - shader_filter(0), - shader_x(0), - shader_w(0), - buffers(nullptr), - tile_types(0), - denoising_from_render(false), - pass_stride(0), - frame_stride(0), - target_pass_stride(0), - pass_denoising_data(0), - pass_denoising_clean(0), - need_finish_queue(false), - integrator_branched(false) -{ - last_update_time = time_dt(); -} - -int DeviceTask::get_subtask_count(int num, int max_size) const -{ - if (max_size != 0) { - int max_size_num; - - if (type == SHADER) { - max_size_num = (shader_w + max_size - 1) / max_size; - } - else { - max_size = max(1, max_size / w); - max_size_num = (h + max_size - 1) / max_size; - } - - num = max(max_size_num, num); - } - - if (type == SHADER) { - num = min(shader_w, num); - } - else if (type == RENDER) { - } - else { - num = min(h, num); - } - - return num; -} - -void DeviceTask::split(list<DeviceTask> &tasks, int num, 
int max_size) const -{ - num = get_subtask_count(num, max_size); - - if (type == SHADER) { - for (int i = 0; i < num; i++) { - int tx = shader_x + (shader_w / num) * i; - int tw = (i == num - 1) ? shader_w - i * (shader_w / num) : shader_w / num; - - DeviceTask task = *this; - - task.shader_x = tx; - task.shader_w = tw; - - tasks.push_back(task); - } - } - else if (type == RENDER) { - for (int i = 0; i < num; i++) - tasks.push_back(*this); - } - else { - for (int i = 0; i < num; i++) { - int ty = y + (h / num) * i; - int th = (i == num - 1) ? h - i * (h / num) : h / num; - - DeviceTask task = *this; - - task.y = ty; - task.h = th; - - tasks.push_back(task); - } - } -} - -void DeviceTask::update_progress(RenderTile *rtile, int pixel_samples) -{ - if (type == FILM_CONVERT) - return; - - if (update_progress_sample) { - if (pixel_samples == -1) { - pixel_samples = shader_w; - } - update_progress_sample(pixel_samples, rtile ? rtile->sample : 0); - } - - if (update_tile_sample) { - double current_time = time_dt(); - - if (current_time - last_update_time >= 1.0) { - update_tile_sample(*rtile); - - last_update_time = current_time; - } - } -} - -/* Adaptive Sampling */ - -AdaptiveSampling::AdaptiveSampling() : use(true), adaptive_step(0), min_samples(0) -{ -} - -/* Render samples in steps that align with the adaptive filtering. */ -int AdaptiveSampling::align_samples(int sample, int num_samples) const -{ - int end_sample = sample + num_samples; - - /* Round down end sample to the nearest sample that needs filtering. */ - end_sample &= ~(adaptive_step - 1); - - if (end_sample <= sample) { - /* In order to reach the next sample that needs filtering, we'd need - * to increase num_samples. We don't do that in this function, so - * just keep it as is and don't filter this time around. 
*/ - return num_samples; - } - return end_sample - sample; -} - -bool AdaptiveSampling::need_filter(int sample) const -{ - if (sample > min_samples) { - return (sample & (adaptive_step - 1)) == (adaptive_step - 1); - } - else { - return false; - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h deleted file mode 100644 index 3f7cf47b692..00000000000 --- a/intern/cycles/device/device_task.h +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __DEVICE_TASK_H__ -#define __DEVICE_TASK_H__ - -#include "device/device_memory.h" - -#include "util/util_function.h" -#include "util/util_list.h" - -CCL_NAMESPACE_BEGIN - -/* Device Task */ - -class Device; -class RenderBuffers; -class RenderTile; -class RenderTileNeighbors; -class Tile; - -enum DenoiserType { - DENOISER_NLM = 1, - DENOISER_OPTIX = 2, - DENOISER_OPENIMAGEDENOISE = 4, - DENOISER_NUM, - - DENOISER_NONE = 0, - DENOISER_ALL = ~0, -}; - -enum DenoiserInput { - DENOISER_INPUT_RGB = 1, - DENOISER_INPUT_RGB_ALBEDO = 2, - DENOISER_INPUT_RGB_ALBEDO_NORMAL = 3, - - DENOISER_INPUT_NUM, -}; - -typedef int DenoiserTypeMask; - -class DenoiseParams { - public: - /* Apply denoiser to image. */ - bool use; - /* Output denoising data passes (possibly without applying the denoiser). */ - bool store_passes; - - /* Denoiser type. 
*/ - DenoiserType type; - - /* Viewport start sample. */ - int start_sample; - - /** Native Denoiser. */ - - /* Pixel radius for neighboring pixels to take into account. */ - int radius; - /* Controls neighbor pixel weighting for the denoising filter. */ - float strength; - /* Preserve more or less detail based on feature passes. */ - float feature_strength; - /* When removing pixels that don't carry information, - * use a relative threshold instead of an absolute one. */ - bool relative_pca; - /* How many frames before and after the current center frame are included. */ - int neighbor_frames; - /* Clamp the input to the range of +-1e8. Should be enough for any legitimate data. */ - bool clamp_input; - - /** OIDN/Optix Denoiser. */ - - /* Passes handed over to the OIDN/OptiX denoiser (default to color + albedo). */ - DenoiserInput input_passes; - - DenoiseParams() - { - use = false; - store_passes = false; - - type = DENOISER_NLM; - - radius = 8; - strength = 0.5f; - feature_strength = 0.5f; - relative_pca = false; - neighbor_frames = 2; - clamp_input = true; - - /* Default to color + albedo only, since normal input does not always have the desired effect - * when denoising with OptiX. */ - input_passes = DENOISER_INPUT_RGB_ALBEDO; - - start_sample = 0; - } - - /* Test if a denoising task needs to run, also to prefilter passes for the native - * denoiser when we are not applying denoising to the combined image. 
*/ - bool need_denoising_task() const - { - return (use || (store_passes && type == DENOISER_NLM)); - } -}; - -class AdaptiveSampling { - public: - AdaptiveSampling(); - - int align_samples(int sample, int num_samples) const; - bool need_filter(int sample) const; - - bool use; - int adaptive_step; - int min_samples; -}; - -class DeviceTask { - public: - typedef enum { RENDER, FILM_CONVERT, SHADER, DENOISE_BUFFER } Type; - Type type; - - int x, y, w, h; - device_ptr rgba_byte; - device_ptr rgba_half; - device_ptr buffer; - int sample; - int num_samples; - int offset, stride; - - device_ptr shader_input; - device_ptr shader_output; - int shader_eval_type; - int shader_filter; - int shader_x, shader_w; - - RenderBuffers *buffers; - - explicit DeviceTask(Type type = RENDER); - - int get_subtask_count(int num, int max_size = 0) const; - void split(list<DeviceTask> &tasks, int num, int max_size = 0) const; - - void update_progress(RenderTile *rtile, int pixel_samples = -1); - - function<bool(Device *device, RenderTile &, uint)> acquire_tile; - function<void(long, int)> update_progress_sample; - function<void(RenderTile &)> update_tile_sample; - function<void(RenderTile &)> release_tile; - function<bool()> get_cancel; - function<bool()> get_tile_stolen; - function<void(RenderTileNeighbors &, Device *)> map_neighbor_tiles; - function<void(RenderTileNeighbors &, Device *)> unmap_neighbor_tiles; - - uint tile_types; - DenoiseParams denoising; - bool denoising_from_render; - vector<int> denoising_frames; - - int pass_stride; - int frame_stride; - int target_pass_stride; - int pass_denoising_data; - int pass_denoising_clean; - - bool need_finish_queue; - bool integrator_branched; - AdaptiveSampling adaptive_sampling; - - protected: - double last_update_time; -}; - -CCL_NAMESPACE_END - -#endif /* __DEVICE_TASK_H__ */ diff --git a/intern/cycles/device/device_dummy.cpp b/intern/cycles/device/dummy/device.cpp index 5112fc152e5..678276ed025 100644 --- 
a/intern/cycles/device/device_dummy.cpp +++ b/intern/cycles/device/dummy/device.cpp @@ -14,8 +14,10 @@ * limitations under the License. */ +#include "device/dummy/device.h" + #include "device/device.h" -#include "device/device_intern.h" +#include "device/device_queue.h" CCL_NAMESPACE_BEGIN @@ -23,8 +25,8 @@ CCL_NAMESPACE_BEGIN class DummyDevice : public Device { public: - DummyDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_) - : Device(info_, stats_, profiler_, background_) + DummyDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_) + : Device(info_, stats_, profiler_) { error_msg = info.error_msg; } @@ -61,23 +63,11 @@ class DummyDevice : public Device { virtual void const_copy_to(const char *, void *, size_t) override { } - - virtual void task_add(DeviceTask &) override - { - } - - virtual void task_wait() override - { - } - - virtual void task_cancel() override - { - } }; -Device *device_dummy_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) +Device *device_dummy_create(const DeviceInfo &info, Stats &stats, Profiler &profiler) { - return new DummyDevice(info, stats, profiler, background); + return new DummyDevice(info, stats, profiler); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl b/intern/cycles/device/dummy/device.h index 8afaa686e28..832a9568129 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl +++ b/intern/cycles/device/dummy/device.h @@ -1,5 +1,5 @@ /* - * Copyright 2011-2017 Blender Foundation + * Copyright 2011-2021 Blender Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,11 +14,18 @@ * limitations under the License. 
*/ -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_do_volume.h" +#pragma once -#define KERNEL_NAME do_volume -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME +#include "util/util_string.h" +#include "util/util_vector.h" +CCL_NAMESPACE_BEGIN + +class Device; +class DeviceInfo; +class Profiler; +class Stats; + +Device *device_dummy_create(const DeviceInfo &info, Stats &stats, Profiler &profiler); + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/multi/device.cpp b/intern/cycles/device/multi/device.cpp new file mode 100644 index 00000000000..6dbcce2d9a5 --- /dev/null +++ b/intern/cycles/device/multi/device.cpp @@ -0,0 +1,423 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "device/multi/device.h" + +#include <sstream> +#include <stdlib.h> + +#include "bvh/bvh_multi.h" + +#include "device/device.h" +#include "device/device_queue.h" + +#include "render/buffers.h" +#include "render/geometry.h" + +#include "util/util_foreach.h" +#include "util/util_list.h" +#include "util/util_logging.h" +#include "util/util_map.h" +#include "util/util_time.h" + +CCL_NAMESPACE_BEGIN + +class MultiDevice : public Device { + public: + struct SubDevice { + Stats stats; + Device *device; + map<device_ptr, device_ptr> ptr_map; + int peer_island_index = -1; + }; + + list<SubDevice> devices; + device_ptr unique_key; + vector<vector<SubDevice *>> peer_islands; + + MultiDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler) + : Device(info, stats, profiler), unique_key(1) + { + foreach (const DeviceInfo &subinfo, info.multi_devices) { + /* Always add CPU devices at the back since GPU devices can change + * host memory pointers, which CPU uses as device pointer. */ + SubDevice *sub; + if (subinfo.type == DEVICE_CPU) { + devices.emplace_back(); + sub = &devices.back(); + } + else { + devices.emplace_front(); + sub = &devices.front(); + } + + /* The pointer to 'sub->stats' will stay valid even after new devices + * are added, since 'devices' is a linked list. 
*/ + sub->device = Device::create(subinfo, sub->stats, profiler); + } + + /* Build a list of peer islands for the available render devices */ + foreach (SubDevice &sub, devices) { + /* First ensure that every device is in at least once peer island */ + if (sub.peer_island_index < 0) { + peer_islands.emplace_back(); + sub.peer_island_index = (int)peer_islands.size() - 1; + peer_islands[sub.peer_island_index].push_back(&sub); + } + + if (!info.has_peer_memory) { + continue; + } + + /* Second check peer access between devices and fill up the islands accordingly */ + foreach (SubDevice &peer_sub, devices) { + if (peer_sub.peer_island_index < 0 && + peer_sub.device->info.type == sub.device->info.type && + peer_sub.device->check_peer_access(sub.device)) { + peer_sub.peer_island_index = sub.peer_island_index; + peer_islands[sub.peer_island_index].push_back(&peer_sub); + } + } + } + } + + ~MultiDevice() + { + foreach (SubDevice &sub, devices) + delete sub.device; + } + + const string &error_message() override + { + error_msg.clear(); + + foreach (SubDevice &sub, devices) + error_msg += sub.device->error_message(); + + return error_msg; + } + + virtual bool show_samples() const override + { + if (devices.size() > 1) { + return false; + } + return devices.front().device->show_samples(); + } + + virtual BVHLayoutMask get_bvh_layout_mask() const override + { + BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_ALL; + BVHLayoutMask bvh_layout_mask_all = BVH_LAYOUT_NONE; + foreach (const SubDevice &sub_device, devices) { + BVHLayoutMask device_bvh_layout_mask = sub_device.device->get_bvh_layout_mask(); + bvh_layout_mask &= device_bvh_layout_mask; + bvh_layout_mask_all |= device_bvh_layout_mask; + } + + /* With multiple OptiX devices, every device needs its own acceleration structure */ + if (bvh_layout_mask == BVH_LAYOUT_OPTIX) { + return BVH_LAYOUT_MULTI_OPTIX; + } + + /* When devices do not share a common BVH layout, fall back to creating one for each */ + const BVHLayoutMask 
BVH_LAYOUT_OPTIX_EMBREE = (BVH_LAYOUT_OPTIX | BVH_LAYOUT_EMBREE); + if ((bvh_layout_mask_all & BVH_LAYOUT_OPTIX_EMBREE) == BVH_LAYOUT_OPTIX_EMBREE) { + return BVH_LAYOUT_MULTI_OPTIX_EMBREE; + } + + return bvh_layout_mask; + } + + bool load_kernels(const uint kernel_features) override + { + foreach (SubDevice &sub, devices) + if (!sub.device->load_kernels(kernel_features)) + return false; + + return true; + } + + void build_bvh(BVH *bvh, Progress &progress, bool refit) override + { + /* Try to build and share a single acceleration structure, if possible */ + if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2 || bvh->params.bvh_layout == BVH_LAYOUT_EMBREE) { + devices.back().device->build_bvh(bvh, progress, refit); + return; + } + + assert(bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX || + bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE); + + BVHMulti *const bvh_multi = static_cast<BVHMulti *>(bvh); + bvh_multi->sub_bvhs.resize(devices.size()); + + vector<BVHMulti *> geom_bvhs; + geom_bvhs.reserve(bvh->geometry.size()); + foreach (Geometry *geom, bvh->geometry) { + geom_bvhs.push_back(static_cast<BVHMulti *>(geom->bvh)); + } + + /* Broadcast acceleration structure build to all render devices */ + size_t i = 0; + foreach (SubDevice &sub, devices) { + /* Change geometry BVH pointers to the sub BVH */ + for (size_t k = 0; k < bvh->geometry.size(); ++k) { + bvh->geometry[k]->bvh = geom_bvhs[k]->sub_bvhs[i]; + } + + if (!bvh_multi->sub_bvhs[i]) { + BVHParams params = bvh->params; + if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX) + params.bvh_layout = BVH_LAYOUT_OPTIX; + else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE) + params.bvh_layout = sub.device->info.type == DEVICE_OPTIX ? 
BVH_LAYOUT_OPTIX : + BVH_LAYOUT_EMBREE; + + /* Skip building a bottom level acceleration structure for non-instanced geometry on Embree + * (since they are put into the top level directly, see bvh_embree.cpp) */ + if (!params.top_level && params.bvh_layout == BVH_LAYOUT_EMBREE && + !bvh->geometry[0]->is_instanced()) { + i++; + continue; + } + + bvh_multi->sub_bvhs[i] = BVH::create(params, bvh->geometry, bvh->objects, sub.device); + } + + sub.device->build_bvh(bvh_multi->sub_bvhs[i], progress, refit); + i++; + } + + /* Change geometry BVH pointers back to the multi BVH. */ + for (size_t k = 0; k < bvh->geometry.size(); ++k) { + bvh->geometry[k]->bvh = geom_bvhs[k]; + } + } + + virtual void *get_cpu_osl_memory() override + { + if (devices.size() > 1) { + return NULL; + } + return devices.front().device->get_cpu_osl_memory(); + } + + bool is_resident(device_ptr key, Device *sub_device) override + { + foreach (SubDevice &sub, devices) { + if (sub.device == sub_device) { + return find_matching_mem_device(key, sub)->device == sub_device; + } + } + return false; + } + + SubDevice *find_matching_mem_device(device_ptr key, SubDevice &sub) + { + assert(key != 0 && (sub.peer_island_index >= 0 || sub.ptr_map.find(key) != sub.ptr_map.end())); + + /* Get the memory owner of this key (first try current device, then peer devices) */ + SubDevice *owner_sub = ⊂ + if (owner_sub->ptr_map.find(key) == owner_sub->ptr_map.end()) { + foreach (SubDevice *island_sub, peer_islands[sub.peer_island_index]) { + if (island_sub != owner_sub && + island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) { + owner_sub = island_sub; + } + } + } + return owner_sub; + } + + SubDevice *find_suitable_mem_device(device_ptr key, const vector<SubDevice *> &island) + { + assert(!island.empty()); + + /* Get the memory owner of this key or the device with the lowest memory usage when new */ + SubDevice *owner_sub = island.front(); + foreach (SubDevice *island_sub, island) { + if (key ? 
(island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) : + (island_sub->device->stats.mem_used < owner_sub->device->stats.mem_used)) { + owner_sub = island_sub; + } + } + return owner_sub; + } + + inline device_ptr find_matching_mem(device_ptr key, SubDevice &sub) + { + return find_matching_mem_device(key, sub)->ptr_map[key]; + } + + void mem_alloc(device_memory &mem) override + { + device_ptr key = unique_key++; + + assert(mem.type == MEM_READ_ONLY || mem.type == MEM_READ_WRITE || mem.type == MEM_DEVICE_ONLY); + /* The remaining memory types can be distributed across devices */ + foreach (const vector<SubDevice *> &island, peer_islands) { + SubDevice *owner_sub = find_suitable_mem_device(key, island); + mem.device = owner_sub->device; + mem.device_pointer = 0; + mem.device_size = 0; + + owner_sub->device->mem_alloc(mem); + owner_sub->ptr_map[key] = mem.device_pointer; + } + + mem.device = this; + mem.device_pointer = key; + stats.mem_alloc(mem.device_size); + } + + void mem_copy_to(device_memory &mem) override + { + device_ptr existing_key = mem.device_pointer; + device_ptr key = (existing_key) ? existing_key : unique_key++; + size_t existing_size = mem.device_size; + + /* The tile buffers are allocated on each device (see below), so copy to all of them */ + foreach (const vector<SubDevice *> &island, peer_islands) { + SubDevice *owner_sub = find_suitable_mem_device(existing_key, island); + mem.device = owner_sub->device; + mem.device_pointer = (existing_key) ? 
owner_sub->ptr_map[existing_key] : 0; + mem.device_size = existing_size; + + owner_sub->device->mem_copy_to(mem); + owner_sub->ptr_map[key] = mem.device_pointer; + + if (mem.type == MEM_GLOBAL || mem.type == MEM_TEXTURE) { + /* Need to create texture objects and update pointer in kernel globals on all devices */ + foreach (SubDevice *island_sub, island) { + if (island_sub != owner_sub) { + island_sub->device->mem_copy_to(mem); + } + } + } + } + + mem.device = this; + mem.device_pointer = key; + stats.mem_alloc(mem.device_size - existing_size); + } + + void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override + { + device_ptr key = mem.device_pointer; + int i = 0, sub_h = h / devices.size(); + + foreach (SubDevice &sub, devices) { + int sy = y + i * sub_h; + int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h; + + SubDevice *owner_sub = find_matching_mem_device(key, sub); + mem.device = owner_sub->device; + mem.device_pointer = owner_sub->ptr_map[key]; + + owner_sub->device->mem_copy_from(mem, sy, w, sh, elem); + i++; + } + + mem.device = this; + mem.device_pointer = key; + } + + void mem_zero(device_memory &mem) override + { + device_ptr existing_key = mem.device_pointer; + device_ptr key = (existing_key) ? existing_key : unique_key++; + size_t existing_size = mem.device_size; + + foreach (const vector<SubDevice *> &island, peer_islands) { + SubDevice *owner_sub = find_suitable_mem_device(existing_key, island); + mem.device = owner_sub->device; + mem.device_pointer = (existing_key) ? 
owner_sub->ptr_map[existing_key] : 0; + mem.device_size = existing_size; + + owner_sub->device->mem_zero(mem); + owner_sub->ptr_map[key] = mem.device_pointer; + } + + mem.device = this; + mem.device_pointer = key; + stats.mem_alloc(mem.device_size - existing_size); + } + + void mem_free(device_memory &mem) override + { + device_ptr key = mem.device_pointer; + size_t existing_size = mem.device_size; + + /* Free memory that was allocated for all devices (see above) on each device */ + foreach (const vector<SubDevice *> &island, peer_islands) { + SubDevice *owner_sub = find_matching_mem_device(key, *island.front()); + mem.device = owner_sub->device; + mem.device_pointer = owner_sub->ptr_map[key]; + mem.device_size = existing_size; + + owner_sub->device->mem_free(mem); + owner_sub->ptr_map.erase(owner_sub->ptr_map.find(key)); + + if (mem.type == MEM_TEXTURE) { + /* Free texture objects on all devices */ + foreach (SubDevice *island_sub, island) { + if (island_sub != owner_sub) { + island_sub->device->mem_free(mem); + } + } + } + } + + mem.device = this; + mem.device_pointer = 0; + mem.device_size = 0; + stats.mem_free(existing_size); + } + + void const_copy_to(const char *name, void *host, size_t size) override + { + foreach (SubDevice &sub, devices) + sub.device->const_copy_to(name, host, size); + } + + int device_number(Device *sub_device) override + { + int i = 0; + + foreach (SubDevice &sub, devices) { + if (sub.device == sub_device) + return i; + i++; + } + + return -1; + } + + virtual void foreach_device(const function<void(Device *)> &callback) override + { + foreach (SubDevice &sub, devices) { + sub.device->foreach_device(callback); + } + } +}; + +Device *device_multi_create(const DeviceInfo &info, Stats &stats, Profiler &profiler) +{ + return new MultiDevice(info, stats, profiler); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl b/intern/cycles/device/multi/device.h index 192d01444ba..6e121014a1f 100644 
--- a/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl +++ b/intern/cycles/device/multi/device.h @@ -1,5 +1,5 @@ /* - * Copyright 2011-2017 Blender Foundation + * Copyright 2011-2021 Blender Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,11 +14,18 @@ * limitations under the License. */ -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_indirect_background.h" +#pragma once -#define KERNEL_NAME indirect_background -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME +#include "util/util_string.h" +#include "util/util_vector.h" +CCL_NAMESPACE_BEGIN + +class Device; +class DeviceInfo; +class Profiler; +class Stats; + +Device *device_multi_create(const DeviceInfo &info, Stats &stats, Profiler &profiler); + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/opencl/device_opencl.h b/intern/cycles/device/opencl/device_opencl.h deleted file mode 100644 index a65e764b0d4..00000000000 --- a/intern/cycles/device/opencl/device_opencl.h +++ /dev/null @@ -1,658 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifdef WITH_OPENCL - -# include "device/device.h" -# include "device/device_denoising.h" -# include "device/device_split_kernel.h" - -# include "util/util_map.h" -# include "util/util_param.h" -# include "util/util_string.h" -# include "util/util_task.h" - -# include "clew.h" - -# include "device/opencl/memory_manager.h" - -CCL_NAMESPACE_BEGIN - -/* Disable workarounds, seems to be working fine on latest drivers. */ -# define CYCLES_DISABLE_DRIVER_WORKAROUNDS - -/* Define CYCLES_DISABLE_DRIVER_WORKAROUNDS to disable workarounds for testing. */ -# ifndef CYCLES_DISABLE_DRIVER_WORKAROUNDS -/* Work around AMD driver hangs by ensuring each command is finished before doing anything else. */ -# undef clEnqueueNDRangeKernel -# define clEnqueueNDRangeKernel(a, b, c, d, e, f, g, h, i) \ - CLEW_GET_FUN(__clewEnqueueNDRangeKernel)(a, b, c, d, e, f, g, h, i); \ - clFinish(a); - -# undef clEnqueueWriteBuffer -# define clEnqueueWriteBuffer(a, b, c, d, e, f, g, h, i) \ - CLEW_GET_FUN(__clewEnqueueWriteBuffer)(a, b, c, d, e, f, g, h, i); \ - clFinish(a); - -# undef clEnqueueReadBuffer -# define clEnqueueReadBuffer(a, b, c, d, e, f, g, h, i) \ - CLEW_GET_FUN(__clewEnqueueReadBuffer)(a, b, c, d, e, f, g, h, i); \ - clFinish(a); -# endif /* CYCLES_DISABLE_DRIVER_WORKAROUNDS */ - -# define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p)) - -struct OpenCLPlatformDevice { - OpenCLPlatformDevice(cl_platform_id platform_id, - const string &platform_name, - cl_device_id device_id, - cl_device_type device_type, - const string &device_name, - const string &hardware_id, - const string &device_extensions) - : platform_id(platform_id), - platform_name(platform_name), - device_id(device_id), - device_type(device_type), - device_name(device_name), - hardware_id(hardware_id), - device_extensions(device_extensions) - { - } - cl_platform_id platform_id; - string platform_name; - cl_device_id device_id; - cl_device_type device_type; - string device_name; - string hardware_id; - string 
device_extensions; -}; - -/* Contains all static OpenCL helper functions. */ -class OpenCLInfo { - public: - static cl_device_type device_type(); - static bool use_debug(); - static bool device_supported(const string &platform_name, const cl_device_id device_id); - static bool platform_version_check(cl_platform_id platform, string *error = NULL); - static bool device_version_check(cl_device_id device, string *error = NULL); - static bool get_device_version(cl_device_id device, - int *r_major, - int *r_minor, - string *error = NULL); - static string get_hardware_id(const string &platform_name, cl_device_id device_id); - static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices); - - /* ** Some handy shortcuts to low level cl*GetInfo() functions. ** */ - - /* Platform information. */ - static bool get_num_platforms(cl_uint *num_platforms, cl_int *error = NULL); - static cl_uint get_num_platforms(); - - static bool get_platforms(vector<cl_platform_id> *platform_ids, cl_int *error = NULL); - static vector<cl_platform_id> get_platforms(); - - static bool get_platform_name(cl_platform_id platform_id, string *platform_name); - static string get_platform_name(cl_platform_id platform_id); - - static bool get_num_platform_devices(cl_platform_id platform_id, - cl_device_type device_type, - cl_uint *num_devices, - cl_int *error = NULL); - static cl_uint get_num_platform_devices(cl_platform_id platform_id, cl_device_type device_type); - - static bool get_platform_devices(cl_platform_id platform_id, - cl_device_type device_type, - vector<cl_device_id> *device_ids, - cl_int *error = NULL); - static vector<cl_device_id> get_platform_devices(cl_platform_id platform_id, - cl_device_type device_type); - - /* Device information. 
*/ - static bool get_device_name(cl_device_id device_id, string *device_name, cl_int *error = NULL); - - static string get_device_name(cl_device_id device_id); - - static bool get_device_extensions(cl_device_id device_id, - string *device_extensions, - cl_int *error = NULL); - - static string get_device_extensions(cl_device_id device_id); - - static bool get_device_type(cl_device_id device_id, - cl_device_type *device_type, - cl_int *error = NULL); - static cl_device_type get_device_type(cl_device_id device_id); - - static bool get_driver_version(cl_device_id device_id, - int *major, - int *minor, - cl_int *error = NULL); - - static int mem_sub_ptr_alignment(cl_device_id device_id); - - /* Get somewhat more readable device name. - * Main difference is AMD OpenCL here which only gives code name - * for the regular device name. This will give more sane device - * name using some extensions. - */ - static string get_readable_device_name(cl_device_id device_id); -}; - -/* Thread safe cache for contexts and programs. - */ -class OpenCLCache { - struct Slot { - struct ProgramEntry { - ProgramEntry(); - ProgramEntry(const ProgramEntry &rhs); - ~ProgramEntry(); - cl_program program; - thread_mutex *mutex; - }; - - Slot(); - Slot(const Slot &rhs); - ~Slot(); - - thread_mutex *context_mutex; - cl_context context; - typedef map<ustring, ProgramEntry> EntryMap; - EntryMap programs; - }; - - /* key is combination of platform ID and device ID */ - typedef pair<cl_platform_id, cl_device_id> PlatformDevicePair; - - /* map of Slot objects */ - typedef map<PlatformDevicePair, Slot> CacheMap; - CacheMap cache; - - /* MD5 hash of the kernel source. */ - string kernel_md5; - - thread_mutex cache_lock; - thread_mutex kernel_md5_lock; - - /* lazy instantiate */ - static OpenCLCache &global_instance(); - - public: - enum ProgramName { - OCL_DEV_BASE_PROGRAM, - OCL_DEV_MEGAKERNEL_PROGRAM, - }; - - /* Lookup context in the cache. 
If this returns NULL, slot_locker - * will be holding a lock for the cache. slot_locker should refer to a - * default constructed thread_scoped_lock. */ - static cl_context get_context(cl_platform_id platform, - cl_device_id device, - thread_scoped_lock &slot_locker); - /* Same as above. */ - static cl_program get_program(cl_platform_id platform, - cl_device_id device, - ustring key, - thread_scoped_lock &slot_locker); - - /* Store context in the cache. You MUST have tried to get the item before storing to it. */ - static void store_context(cl_platform_id platform, - cl_device_id device, - cl_context context, - thread_scoped_lock &slot_locker); - /* Same as above. */ - static void store_program(cl_platform_id platform, - cl_device_id device, - cl_program program, - ustring key, - thread_scoped_lock &slot_locker); - - static string get_kernel_md5(); -}; - -# define opencl_device_assert(device, stmt) \ - { \ - cl_int err = stmt; \ -\ - if (err != CL_SUCCESS) { \ - string message = string_printf( \ - "OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \ - if ((device)->error_message() == "") { \ - (device)->set_error(message); \ - } \ - fprintf(stderr, "%s\n", message.c_str()); \ - } \ - } \ - (void)0 - -# define opencl_assert(stmt) \ - { \ - cl_int err = stmt; \ -\ - if (err != CL_SUCCESS) { \ - string message = string_printf( \ - "OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \ - if (error_msg == "") { \ - error_msg = message; \ - } \ - fprintf(stderr, "%s\n", message.c_str()); \ - } \ - } \ - (void)0 - -class OpenCLDevice : public Device { - public: - DedicatedTaskPool task_pool; - - /* Task pool for required kernels (base, AO kernels during foreground rendering) */ - TaskPool load_required_kernel_task_pool; - /* Task pool for optional kernels (feature kernels during foreground rendering) */ - TaskPool load_kernel_task_pool; - std::atomic<int> load_kernel_num_compiling; - - cl_context cxContext; - 
cl_command_queue cqCommandQueue; - cl_platform_id cpPlatform; - cl_device_id cdDevice; - cl_int ciErr; - int device_num; - - class OpenCLProgram { - public: - OpenCLProgram() : loaded(false), needs_compiling(true), program(NULL), device(NULL) - { - } - OpenCLProgram(OpenCLDevice *device, - const string &program_name, - const string &kernel_name, - const string &kernel_build_options, - bool use_stdout = true); - ~OpenCLProgram(); - - void add_kernel(ustring name); - - /* Try to load the program from device cache or disk */ - bool load(); - /* Compile the kernel (first separate, fail-back to local). */ - void compile(); - /* Create the OpenCL kernels after loading or compiling */ - void create_kernels(); - - bool is_loaded() const - { - return loaded; - } - const string &get_log() const - { - return log; - } - void report_error(); - - /* Wait until this kernel is available to be used - * It will return true when the kernel is available. - * It will return false when the kernel is not available - * or could not be loaded. */ - bool wait_for_availability(); - - cl_kernel operator()(); - cl_kernel operator()(ustring name); - - void release(); - - private: - bool build_kernel(const string *debug_src); - /* Build the program by calling the own process. - * This is required for multithreaded OpenCL compilation, since most Frameworks serialize - * build calls internally if they come from the same process. - * If that is not supported, this function just returns false. - */ - bool compile_separate(const string &clbin); - /* Build the program by calling OpenCL directly. */ - bool compile_kernel(const string *debug_src); - /* Loading and saving the program from/to disk. 
*/ - bool load_binary(const string &clbin, const string *debug_src = NULL); - bool save_binary(const string &clbin); - - void add_log(const string &msg, bool is_debug); - void add_error(const string &msg); - - bool loaded; - bool needs_compiling; - - cl_program program; - OpenCLDevice *device; - - /* Used for the OpenCLCache key. */ - string program_name; - - string kernel_file, kernel_build_options, device_md5; - - bool use_stdout; - string log, error_msg; - string compile_output; - - map<ustring, cl_kernel> kernels; - }; - - /* Container for all types of split programs. */ - class OpenCLSplitPrograms { - public: - OpenCLDevice *device; - OpenCLProgram program_split; - OpenCLProgram program_lamp_emission; - OpenCLProgram program_do_volume; - OpenCLProgram program_indirect_background; - OpenCLProgram program_shader_eval; - OpenCLProgram program_holdout_emission_blurring_pathtermination_ao; - OpenCLProgram program_subsurface_scatter; - OpenCLProgram program_direct_lighting; - OpenCLProgram program_shadow_blocked_ao; - OpenCLProgram program_shadow_blocked_dl; - - OpenCLSplitPrograms(OpenCLDevice *device); - ~OpenCLSplitPrograms(); - - /* Load the kernels and put the created kernels in the given - * `programs` parameter. 
*/ - void load_kernels(vector<OpenCLProgram *> &programs, - const DeviceRequestedFeatures &requested_features); - }; - - DeviceSplitKernel *split_kernel; - - OpenCLProgram base_program; - OpenCLProgram bake_program; - OpenCLProgram displace_program; - OpenCLProgram background_program; - OpenCLProgram denoising_program; - - OpenCLSplitPrograms kernel_programs; - - typedef map<string, device_vector<uchar> *> ConstMemMap; - typedef map<string, device_ptr> MemMap; - - ConstMemMap const_mem_map; - MemMap mem_map; - - bool device_initialized; - string platform_name; - string device_name; - - bool opencl_error(cl_int err); - void opencl_error(const string &message); - void opencl_assert_err(cl_int err, const char *where); - - OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); - ~OpenCLDevice(); - - static void CL_CALLBACK context_notify_callback(const char *err_info, - const void * /*private_info*/, - size_t /*cb*/, - void *user_data); - - bool opencl_version_check(); - OpenCLSplitPrograms *get_split_programs(); - - string device_md5_hash(string kernel_custom_build_options = ""); - bool load_kernels(const DeviceRequestedFeatures &requested_features); - void load_required_kernels(const DeviceRequestedFeatures &requested_features); - - bool wait_for_availability(const DeviceRequestedFeatures &requested_features); - DeviceKernelStatus get_active_kernel_switch_state(); - - /* Get the name of the opencl program for the given kernel */ - const string get_opencl_program_name(const string &kernel_name); - /* Get the program file name to compile (*.cl) for the given kernel */ - const string get_opencl_program_filename(const string &kernel_name); - string get_build_options(const DeviceRequestedFeatures &requested_features, - const string &opencl_program_name); - /* Enable the default features to reduce recompilation events */ - void enable_default_features(DeviceRequestedFeatures &features); - - void mem_alloc(device_memory &mem); - void 
mem_copy_to(device_memory &mem); - void mem_copy_from(device_memory &mem, int y, int w, int h, int elem); - void mem_zero(device_memory &mem); - void mem_free(device_memory &mem); - - int mem_sub_ptr_alignment(); - - void const_copy_to(const char *name, void *host, size_t size); - void global_alloc(device_memory &mem); - void global_free(device_memory &mem); - void tex_alloc(device_texture &mem); - void tex_free(device_texture &mem); - - size_t global_size_round_up(int group_size, int global_size); - void enqueue_kernel(cl_kernel kernel, - size_t w, - size_t h, - bool x_workgroups = false, - size_t max_workgroup_size = -1); - void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name); - void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg); - - void film_convert(DeviceTask &task, - device_ptr buffer, - device_ptr rgba_byte, - device_ptr rgba_half); - void shader(DeviceTask &task); - void update_adaptive(DeviceTask &task, RenderTile &tile, int sample); - void bake(DeviceTask &task, RenderTile &tile); - - void denoise(RenderTile &tile, DenoisingTask &denoising); - - int get_split_task_count(DeviceTask & /*task*/) - { - return 1; - } - - void task_add(DeviceTask &task) - { - task_pool.push([=] { - DeviceTask task_copy = task; - thread_run(task_copy); - }); - } - - void task_wait() - { - task_pool.wait(); - } - - void task_cancel() - { - task_pool.cancel(); - } - - void thread_run(DeviceTask &task); - - virtual BVHLayoutMask get_bvh_layout_mask() const - { - return BVH_LAYOUT_BVH2; - } - - virtual bool show_samples() const - { - return true; - } - - protected: - string kernel_build_options(const string *debug_src = NULL); - - void mem_zero_kernel(device_ptr ptr, size_t size); - - bool denoising_non_local_means(device_ptr image_ptr, - device_ptr guide_ptr, - device_ptr variance_ptr, - device_ptr out_ptr, - DenoisingTask *task); - bool denoising_construct_transform(DenoisingTask *task); - bool denoising_accumulate(device_ptr color_ptr, - device_ptr 
color_variance_ptr, - device_ptr scale_ptr, - int frame, - DenoisingTask *task); - bool denoising_solve(device_ptr output_ptr, DenoisingTask *task); - bool denoising_combine_halves(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr mean_ptr, - device_ptr variance_ptr, - int r, - int4 rect, - DenoisingTask *task); - bool denoising_divide_shadow(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr sample_variance_ptr, - device_ptr sv_variance_ptr, - device_ptr buffer_variance_ptr, - DenoisingTask *task); - bool denoising_get_feature(int mean_offset, - int variance_offset, - device_ptr mean_ptr, - device_ptr variance_ptr, - float scale, - DenoisingTask *task); - bool denoising_write_feature(int to_offset, - device_ptr from_ptr, - device_ptr buffer_ptr, - DenoisingTask *task); - bool denoising_detect_outliers(device_ptr image_ptr, - device_ptr variance_ptr, - device_ptr depth_ptr, - device_ptr output_ptr, - DenoisingTask *task); - - device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int size); - void mem_free_sub_ptr(device_ptr ptr); - - class ArgumentWrapper { - public: - ArgumentWrapper() : size(0), pointer(NULL) - { - } - - ArgumentWrapper(device_memory &argument) - : size(sizeof(void *)), pointer((void *)(&argument.device_pointer)) - { - } - - template<typename T> - ArgumentWrapper(device_vector<T> &argument) - : size(sizeof(void *)), pointer((void *)(&argument.device_pointer)) - { - } - - template<typename T> - ArgumentWrapper(device_only_memory<T> &argument) - : size(sizeof(void *)), pointer((void *)(&argument.device_pointer)) - { - } - template<typename T> ArgumentWrapper(T &argument) : size(sizeof(argument)), pointer(&argument) - { - } - - ArgumentWrapper(int argument) : size(sizeof(int)), int_value(argument), pointer(&int_value) - { - } - - ArgumentWrapper(float argument) - : size(sizeof(float)), float_value(argument), pointer(&float_value) - { - } - - size_t size; - int int_value; - float float_value; - void *pointer; - }; - - /* TODO(sergey): In the 
future we can use variadic templates, once - * C++0x is allowed. Should allow to clean this up a bit. - */ - int kernel_set_args(cl_kernel kernel, - int start_argument_index, - const ArgumentWrapper &arg1 = ArgumentWrapper(), - const ArgumentWrapper &arg2 = ArgumentWrapper(), - const ArgumentWrapper &arg3 = ArgumentWrapper(), - const ArgumentWrapper &arg4 = ArgumentWrapper(), - const ArgumentWrapper &arg5 = ArgumentWrapper(), - const ArgumentWrapper &arg6 = ArgumentWrapper(), - const ArgumentWrapper &arg7 = ArgumentWrapper(), - const ArgumentWrapper &arg8 = ArgumentWrapper(), - const ArgumentWrapper &arg9 = ArgumentWrapper(), - const ArgumentWrapper &arg10 = ArgumentWrapper(), - const ArgumentWrapper &arg11 = ArgumentWrapper(), - const ArgumentWrapper &arg12 = ArgumentWrapper(), - const ArgumentWrapper &arg13 = ArgumentWrapper(), - const ArgumentWrapper &arg14 = ArgumentWrapper(), - const ArgumentWrapper &arg15 = ArgumentWrapper(), - const ArgumentWrapper &arg16 = ArgumentWrapper(), - const ArgumentWrapper &arg17 = ArgumentWrapper(), - const ArgumentWrapper &arg18 = ArgumentWrapper(), - const ArgumentWrapper &arg19 = ArgumentWrapper(), - const ArgumentWrapper &arg20 = ArgumentWrapper(), - const ArgumentWrapper &arg21 = ArgumentWrapper(), - const ArgumentWrapper &arg22 = ArgumentWrapper(), - const ArgumentWrapper &arg23 = ArgumentWrapper(), - const ArgumentWrapper &arg24 = ArgumentWrapper(), - const ArgumentWrapper &arg25 = ArgumentWrapper(), - const ArgumentWrapper &arg26 = ArgumentWrapper(), - const ArgumentWrapper &arg27 = ArgumentWrapper(), - const ArgumentWrapper &arg28 = ArgumentWrapper(), - const ArgumentWrapper &arg29 = ArgumentWrapper(), - const ArgumentWrapper &arg30 = ArgumentWrapper(), - const ArgumentWrapper &arg31 = ArgumentWrapper(), - const ArgumentWrapper &arg32 = ArgumentWrapper(), - const ArgumentWrapper &arg33 = ArgumentWrapper()); - - void release_kernel_safe(cl_kernel kernel); - void release_mem_object_safe(cl_mem mem); - void 
release_program_safe(cl_program program); - - /* ** Those guys are for working around some compiler-specific bugs ** */ - - cl_program load_cached_kernel(ustring key, thread_scoped_lock &cache_locker); - - void store_cached_kernel(cl_program program, ustring key, thread_scoped_lock &cache_locker); - - private: - MemoryManager memory_manager; - friend class MemoryManager; - - static_assert_align(TextureInfo, 16); - device_vector<TextureInfo> texture_info; - - typedef map<string, device_memory *> TexturesMap; - TexturesMap textures; - - bool textures_need_update; - - protected: - void flush_texture_buffers(); - - friend class OpenCLSplitKernel; - friend class OpenCLSplitKernelFunction; -}; - -Device *opencl_create_split_device(DeviceInfo &info, - Stats &stats, - Profiler &profiler, - bool background); - -CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/device/opencl/device_opencl_impl.cpp b/intern/cycles/device/opencl/device_opencl_impl.cpp deleted file mode 100644 index 31a2265700c..00000000000 --- a/intern/cycles/device/opencl/device_opencl_impl.cpp +++ /dev/null @@ -1,2113 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifdef WITH_OPENCL - -# include "device/opencl/device_opencl.h" - -# include "kernel/kernel_types.h" -# include "kernel/split/kernel_split_data_types.h" - -# include "util/util_algorithm.h" -# include "util/util_debug.h" -# include "util/util_foreach.h" -# include "util/util_logging.h" -# include "util/util_md5.h" -# include "util/util_path.h" -# include "util/util_time.h" - -CCL_NAMESPACE_BEGIN - -struct texture_slot_t { - texture_slot_t(const string &name, int slot) : name(name), slot(slot) - { - } - string name; - int slot; -}; - -static const string NON_SPLIT_KERNELS = - "denoising " - "base " - "background " - "displace "; - -static const string SPLIT_BUNDLE_KERNELS = - "data_init " - "path_init " - "state_buffer_size " - "scene_intersect " - "queue_enqueue " - "shader_setup " - "shader_sort " - "enqueue_inactive " - "next_iteration_setup " - "indirect_subsurface " - "buffer_update " - "adaptive_stopping " - "adaptive_filter_x " - "adaptive_filter_y " - "adaptive_adjust_samples"; - -const string OpenCLDevice::get_opencl_program_name(const string &kernel_name) -{ - if (NON_SPLIT_KERNELS.find(kernel_name) != std::string::npos) { - return kernel_name; - } - else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) { - return "split_bundle"; - } - else { - return "split_" + kernel_name; - } -} - -const string OpenCLDevice::get_opencl_program_filename(const string &kernel_name) -{ - if (kernel_name == "denoising") { - return "filter.cl"; - } - else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) { - return "kernel_split_bundle.cl"; - } - else { - return "kernel_" + kernel_name + ".cl"; - } -} - -/* Enable features that we always want to compile to reduce recompilation events */ -void OpenCLDevice::enable_default_features(DeviceRequestedFeatures &features) -{ - features.use_transparent = true; - features.use_shadow_tricks = true; - features.use_principled = true; - features.use_denoising = true; - - if (!background) { - 
features.max_nodes_group = NODE_GROUP_LEVEL_MAX; - features.nodes_features = NODE_FEATURE_ALL; - features.use_hair = true; - features.use_subsurface = true; - features.use_camera_motion = false; - features.use_object_motion = false; - } -} - -string OpenCLDevice::get_build_options(const DeviceRequestedFeatures &requested_features, - const string &opencl_program_name) -{ - /* first check for non-split kernel programs */ - if (opencl_program_name == "base" || opencl_program_name == "denoising") { - return ""; - } - else if (opencl_program_name == "bake") { - /* Note: get_build_options for bake is only requested when baking is enabled. - * displace and background are always requested. - * `__SPLIT_KERNEL__` must not be present in the compile directives for bake */ - DeviceRequestedFeatures features(requested_features); - enable_default_features(features); - features.use_denoising = false; - features.use_object_motion = false; - features.use_camera_motion = false; - features.use_hair = true; - features.use_subsurface = true; - features.max_nodes_group = NODE_GROUP_LEVEL_MAX; - features.nodes_features = NODE_FEATURE_ALL; - features.use_integrator_branched = false; - return features.get_build_options(); - } - else if (opencl_program_name == "displace") { - /* As displacement does not use any nodes from the Shading group (eg BSDF). - * We disable all features that are related to shading. 
*/ - DeviceRequestedFeatures features(requested_features); - enable_default_features(features); - features.use_denoising = false; - features.use_object_motion = false; - features.use_camera_motion = false; - features.use_baking = false; - features.use_transparent = false; - features.use_shadow_tricks = false; - features.use_subsurface = false; - features.use_volume = false; - features.nodes_features &= ~NODE_FEATURE_VOLUME; - features.use_denoising = false; - features.use_principled = false; - features.use_integrator_branched = false; - return features.get_build_options(); - } - else if (opencl_program_name == "background") { - /* Background uses Background shading - * It is save to disable shadow features, subsurface and volumetric. */ - DeviceRequestedFeatures features(requested_features); - enable_default_features(features); - features.use_baking = false; - features.use_object_motion = false; - features.use_camera_motion = false; - features.use_transparent = false; - features.use_shadow_tricks = false; - features.use_denoising = false; - /* NOTE: currently possible to use surface nodes like `Hair Info`, `Bump` node. - * Perhaps we should remove them in UI as it does not make any sense when - * rendering background. */ - features.nodes_features &= ~NODE_FEATURE_VOLUME; - features.use_subsurface = false; - features.use_volume = false; - features.use_shader_raytrace = false; - features.use_patch_evaluation = false; - features.use_integrator_branched = false; - return features.get_build_options(); - } - - string build_options = "-D__SPLIT_KERNEL__ "; - /* Set compute device build option. 
*/ - cl_device_type device_type; - OpenCLInfo::get_device_type(this->cdDevice, &device_type, &this->ciErr); - assert(this->ciErr == CL_SUCCESS); - if (device_type == CL_DEVICE_TYPE_GPU) { - build_options += "-D__COMPUTE_DEVICE_GPU__ "; - } - - DeviceRequestedFeatures nofeatures; - enable_default_features(nofeatures); - - /* Add program specific optimized compile directives */ - if (opencl_program_name == "split_do_volume" && !requested_features.use_volume) { - build_options += nofeatures.get_build_options(); - } - else { - DeviceRequestedFeatures features(requested_features); - enable_default_features(features); - - /* Always turn off baking at this point. Baking is only useful when building the bake kernel. - * this also makes sure that the kernels that are build during baking can be reused - * when not doing any baking. */ - features.use_baking = false; - - /* Do not vary on shaders when program doesn't do any shading. - * We have bundled them in a single program. */ - if (opencl_program_name == "split_bundle") { - features.max_nodes_group = 0; - features.nodes_features = 0; - features.use_shader_raytrace = false; - } - - /* No specific settings, just add the regular ones */ - build_options += features.get_build_options(); - } - - return build_options; -} - -OpenCLDevice::OpenCLSplitPrograms::OpenCLSplitPrograms(OpenCLDevice *device_) -{ - device = device_; -} - -OpenCLDevice::OpenCLSplitPrograms::~OpenCLSplitPrograms() -{ - program_split.release(); - program_lamp_emission.release(); - program_do_volume.release(); - program_indirect_background.release(); - program_shader_eval.release(); - program_holdout_emission_blurring_pathtermination_ao.release(); - program_subsurface_scatter.release(); - program_direct_lighting.release(); - program_shadow_blocked_ao.release(); - program_shadow_blocked_dl.release(); -} - -void OpenCLDevice::OpenCLSplitPrograms::load_kernels( - vector<OpenCLProgram *> &programs, const DeviceRequestedFeatures &requested_features) -{ - if 
(!requested_features.use_baking) { -# define ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(kernel_name) \ - program_split.add_kernel(ustring("path_trace_" #kernel_name)); -# define ADD_SPLIT_KERNEL_PROGRAM(kernel_name) \ - const string program_name_##kernel_name = "split_" #kernel_name; \ - program_##kernel_name = OpenCLDevice::OpenCLProgram( \ - device, \ - program_name_##kernel_name, \ - "kernel_" #kernel_name ".cl", \ - device->get_build_options(requested_features, program_name_##kernel_name)); \ - program_##kernel_name.add_kernel(ustring("path_trace_" #kernel_name)); \ - programs.push_back(&program_##kernel_name); - - /* Ordered with most complex kernels first, to reduce overall compile time. */ - ADD_SPLIT_KERNEL_PROGRAM(subsurface_scatter); - ADD_SPLIT_KERNEL_PROGRAM(direct_lighting); - ADD_SPLIT_KERNEL_PROGRAM(indirect_background); - if (requested_features.use_volume) { - ADD_SPLIT_KERNEL_PROGRAM(do_volume); - } - ADD_SPLIT_KERNEL_PROGRAM(shader_eval); - ADD_SPLIT_KERNEL_PROGRAM(lamp_emission); - ADD_SPLIT_KERNEL_PROGRAM(holdout_emission_blurring_pathtermination_ao); - ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_dl); - ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_ao); - - /* Quick kernels bundled in a single program to reduce overhead of starting - * Blender processes. 
*/ - program_split = OpenCLDevice::OpenCLProgram( - device, - "split_bundle", - "kernel_split_bundle.cl", - device->get_build_options(requested_features, "split_bundle")); - - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(data_init); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(state_buffer_size); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(path_init); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(scene_intersect); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(queue_enqueue); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_setup); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_sort); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(enqueue_inactive); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(next_iteration_setup); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(indirect_subsurface); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(buffer_update); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_stopping); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_filter_x); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_filter_y); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_adjust_samples); - programs.push_back(&program_split); - -# undef ADD_SPLIT_KERNEL_PROGRAM -# undef ADD_SPLIT_KERNEL_BUNDLE_PROGRAM - } -} - -namespace { - -/* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to - * fetch its size. 
- */ -typedef struct KernelGlobalsDummy { - ccl_constant KernelData *data; - ccl_global char *buffers[8]; - -# define KERNEL_TEX(type, name) TextureInfo name; -# include "kernel/kernel_textures.h" -# undef KERNEL_TEX - SplitData split_data; - SplitParams split_param_data; -} KernelGlobalsDummy; - -} // namespace - -struct CachedSplitMemory { - int id; - device_memory *split_data; - device_memory *ray_state; - device_memory *queue_index; - device_memory *use_queues_flag; - device_memory *work_pools; - device_ptr *buffer; -}; - -class OpenCLSplitKernelFunction : public SplitKernelFunction { - public: - OpenCLDevice *device; - OpenCLDevice::OpenCLProgram program; - CachedSplitMemory &cached_memory; - int cached_id; - - OpenCLSplitKernelFunction(OpenCLDevice *device, CachedSplitMemory &cached_memory) - : device(device), cached_memory(cached_memory), cached_id(cached_memory.id - 1) - { - } - - ~OpenCLSplitKernelFunction() - { - program.release(); - } - - virtual bool enqueue(const KernelDimensions &dim, device_memory &kg, device_memory &data) - { - if (cached_id != cached_memory.id) { - cl_uint start_arg_index = device->kernel_set_args( - program(), 0, kg, data, *cached_memory.split_data, *cached_memory.ray_state); - - device->set_kernel_arg_buffers(program(), &start_arg_index); - - start_arg_index += device->kernel_set_args(program(), - start_arg_index, - *cached_memory.queue_index, - *cached_memory.use_queues_flag, - *cached_memory.work_pools, - *cached_memory.buffer); - - cached_id = cached_memory.id; - } - - device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, - program(), - 2, - NULL, - dim.global_size, - dim.local_size, - 0, - NULL, - NULL); - - device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); - - if (device->ciErr != CL_SUCCESS) { - string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", - clewErrorString(device->ciErr)); - device->opencl_error(message); - return false; - } - - return true; - } -}; - -class 
OpenCLSplitKernel : public DeviceSplitKernel { - OpenCLDevice *device; - CachedSplitMemory cached_memory; - - public: - explicit OpenCLSplitKernel(OpenCLDevice *device) : DeviceSplitKernel(device), device(device) - { - } - - virtual SplitKernelFunction *get_split_kernel_function( - const string &kernel_name, const DeviceRequestedFeatures &requested_features) - { - OpenCLSplitKernelFunction *kernel = new OpenCLSplitKernelFunction(device, cached_memory); - - const string program_name = device->get_opencl_program_name(kernel_name); - kernel->program = OpenCLDevice::OpenCLProgram( - device, - program_name, - device->get_opencl_program_filename(kernel_name), - device->get_build_options(requested_features, program_name)); - - kernel->program.add_kernel(ustring("path_trace_" + kernel_name)); - kernel->program.load(); - - if (!kernel->program.is_loaded()) { - delete kernel; - return NULL; - } - - return kernel; - } - - virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads) - { - device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE); - size_buffer.alloc(1); - size_buffer.zero_to_device(); - - uint threads = num_threads; - OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs(); - cl_kernel kernel_state_buffer_size = programs->program_split( - ustring("path_trace_state_buffer_size")); - device->kernel_set_args(kernel_state_buffer_size, 0, kg, data, threads, size_buffer); - - size_t global_size = 64; - device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, - kernel_state_buffer_size, - 1, - NULL, - &global_size, - NULL, - 0, - NULL, - NULL); - - device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); - - size_buffer.copy_from_device(0, 1, 1); - size_t size = size_buffer[0]; - size_buffer.free(); - - if (device->ciErr != CL_SUCCESS) { - string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", - clewErrorString(device->ciErr)); - 
device->opencl_error(message); - return 0; - } - - return size; - } - - virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim, - RenderTile &rtile, - int num_global_elements, - device_memory &kernel_globals, - device_memory &kernel_data, - device_memory &split_data, - device_memory &ray_state, - device_memory &queue_index, - device_memory &use_queues_flag, - device_memory &work_pool_wgs) - { - cl_int dQueue_size = dim.global_size[0] * dim.global_size[1]; - - /* Set the range of samples to be processed for every ray in - * path-regeneration logic. - */ - cl_int start_sample = rtile.start_sample; - cl_int end_sample = rtile.start_sample + rtile.num_samples; - - OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs(); - cl_kernel kernel_data_init = programs->program_split(ustring("path_trace_data_init")); - - cl_uint start_arg_index = device->kernel_set_args(kernel_data_init, - 0, - kernel_globals, - kernel_data, - split_data, - num_global_elements, - ray_state); - - device->set_kernel_arg_buffers(kernel_data_init, &start_arg_index); - - start_arg_index += device->kernel_set_args(kernel_data_init, - start_arg_index, - start_sample, - end_sample, - rtile.x, - rtile.y, - rtile.w, - rtile.h, - rtile.offset, - rtile.stride, - queue_index, - dQueue_size, - use_queues_flag, - work_pool_wgs, - rtile.num_samples, - rtile.buffer); - - /* Enqueue ckPathTraceKernel_data_init kernel. 
*/ - device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, - kernel_data_init, - 2, - NULL, - dim.global_size, - dim.local_size, - 0, - NULL, - NULL); - - device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); - - if (device->ciErr != CL_SUCCESS) { - string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", - clewErrorString(device->ciErr)); - device->opencl_error(message); - return false; - } - - cached_memory.split_data = &split_data; - cached_memory.ray_state = &ray_state; - cached_memory.queue_index = &queue_index; - cached_memory.use_queues_flag = &use_queues_flag; - cached_memory.work_pools = &work_pool_wgs; - cached_memory.buffer = &rtile.buffer; - cached_memory.id++; - - return true; - } - - virtual int2 split_kernel_local_size() - { - return make_int2(64, 1); - } - - virtual int2 split_kernel_global_size(device_memory &kg, - device_memory &data, - DeviceTask & /*task*/) - { - cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice); - /* Use small global size on CPU devices as it seems to be much faster. */ - if (type == CL_DEVICE_TYPE_CPU) { - VLOG(1) << "Global size: (64, 64)."; - return make_int2(64, 64); - } - - cl_ulong max_buffer_size; - clGetDeviceInfo( - device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL); - - if (DebugFlags().opencl.mem_limit) { - max_buffer_size = min(max_buffer_size, - cl_ulong(DebugFlags().opencl.mem_limit - device->stats.mem_used)); - } - - VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(max_buffer_size) - << " bytes. (" << string_human_readable_size(max_buffer_size) << ")."; - - /* Limit to 2gb, as we shouldn't need more than that and some devices may support much more. 
*/ - max_buffer_size = min(max_buffer_size / 2, (cl_ulong)2l * 1024 * 1024 * 1024); - - size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size); - int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64), - (int)sqrt(num_elements)); - - if (device->info.description.find("Intel") != string::npos) { - global_size = make_int2(min(512, global_size.x), min(512, global_size.y)); - } - - VLOG(1) << "Global size: " << global_size << "."; - return global_size; - } -}; - -bool OpenCLDevice::opencl_error(cl_int err) -{ - if (err != CL_SUCCESS) { - string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err)); - if (error_msg == "") - error_msg = message; - fprintf(stderr, "%s\n", message.c_str()); - return true; - } - - return false; -} - -void OpenCLDevice::opencl_error(const string &message) -{ - if (error_msg == "") - error_msg = message; - fprintf(stderr, "%s\n", message.c_str()); -} - -void OpenCLDevice::opencl_assert_err(cl_int err, const char *where) -{ - if (err != CL_SUCCESS) { - string message = string_printf( - "OpenCL error (%d): %s in %s", err, clewErrorString(err), where); - if (error_msg == "") - error_msg = message; - fprintf(stderr, "%s\n", message.c_str()); -# ifndef NDEBUG - abort(); -# endif - } -} - -OpenCLDevice::OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) - : Device(info, stats, profiler, background), - load_kernel_num_compiling(0), - kernel_programs(this), - memory_manager(this), - texture_info(this, "__texture_info", MEM_GLOBAL) -{ - cpPlatform = NULL; - cdDevice = NULL; - cxContext = NULL; - cqCommandQueue = NULL; - device_initialized = false; - textures_need_update = true; - - vector<OpenCLPlatformDevice> usable_devices; - OpenCLInfo::get_usable_devices(&usable_devices); - if (usable_devices.size() == 0) { - opencl_error("OpenCL: no devices found."); - return; - } - assert(info.num < usable_devices.size()); - OpenCLPlatformDevice 
&platform_device = usable_devices[info.num]; - device_num = info.num; - cpPlatform = platform_device.platform_id; - cdDevice = platform_device.device_id; - platform_name = platform_device.platform_name; - device_name = platform_device.device_name; - VLOG(2) << "Creating new Cycles device for OpenCL platform " << platform_name << ", device " - << device_name << "."; - - { - /* try to use cached context */ - thread_scoped_lock cache_locker; - cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker); - - if (cxContext == NULL) { - /* create context properties array to specify platform */ - const cl_context_properties context_props[] = { - CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform, 0, 0}; - - /* create context */ - cxContext = clCreateContext( - context_props, 1, &cdDevice, context_notify_callback, cdDevice, &ciErr); - - if (opencl_error(ciErr)) { - opencl_error("OpenCL: clCreateContext failed"); - return; - } - - /* cache it */ - OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker); - } - } - - cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr); - if (opencl_error(ciErr)) { - opencl_error("OpenCL: Error creating command queue"); - return; - } - - /* Allocate this right away so that texture_info - * is placed at offset 0 in the device memory buffers. 
*/ - texture_info.resize(1); - memory_manager.alloc("texture_info", texture_info); - - device_initialized = true; - - split_kernel = new OpenCLSplitKernel(this); -} - -OpenCLDevice::~OpenCLDevice() -{ - task_pool.cancel(); - load_required_kernel_task_pool.cancel(); - load_kernel_task_pool.cancel(); - - memory_manager.free(); - - ConstMemMap::iterator mt; - for (mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) { - delete mt->second; - } - - base_program.release(); - bake_program.release(); - displace_program.release(); - background_program.release(); - denoising_program.release(); - - if (cqCommandQueue) - clReleaseCommandQueue(cqCommandQueue); - if (cxContext) - clReleaseContext(cxContext); - - delete split_kernel; -} - -void CL_CALLBACK OpenCLDevice::context_notify_callback(const char *err_info, - const void * /*private_info*/, - size_t /*cb*/, - void *user_data) -{ - string device_name = OpenCLInfo::get_device_name((cl_device_id)user_data); - fprintf(stderr, "OpenCL error (%s): %s\n", device_name.c_str(), err_info); -} - -bool OpenCLDevice::opencl_version_check() -{ - string error; - if (!OpenCLInfo::platform_version_check(cpPlatform, &error)) { - opencl_error(error); - return false; - } - if (!OpenCLInfo::device_version_check(cdDevice, &error)) { - opencl_error(error); - return false; - } - return true; -} - -string OpenCLDevice::device_md5_hash(string kernel_custom_build_options) -{ - MD5Hash md5; - char version[256], driver[256], name[256], vendor[256]; - - clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL); - clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL); - clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL); - clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL); - - md5.append((uint8_t *)vendor, strlen(vendor)); - md5.append((uint8_t *)version, strlen(version)); - md5.append((uint8_t *)name, strlen(name)); - md5.append((uint8_t *)driver, 
strlen(driver)); - - string options = kernel_build_options(); - options += kernel_custom_build_options; - md5.append((uint8_t *)options.c_str(), options.size()); - - return md5.get_hex(); -} - -bool OpenCLDevice::load_kernels(const DeviceRequestedFeatures &requested_features) -{ - VLOG(2) << "Loading kernels for platform " << platform_name << ", device " << device_name << "."; - /* Verify if device was initialized. */ - if (!device_initialized) { - fprintf(stderr, "OpenCL: failed to initialize device.\n"); - return false; - } - - /* Verify we have right opencl version. */ - if (!opencl_version_check()) - return false; - - load_required_kernels(requested_features); - - vector<OpenCLProgram *> programs; - kernel_programs.load_kernels(programs, requested_features); - - if (!requested_features.use_baking && requested_features.use_denoising) { - denoising_program = OpenCLProgram( - this, "denoising", "filter.cl", get_build_options(requested_features, "denoising")); - denoising_program.add_kernel(ustring("filter_divide_shadow")); - denoising_program.add_kernel(ustring("filter_get_feature")); - denoising_program.add_kernel(ustring("filter_write_feature")); - denoising_program.add_kernel(ustring("filter_detect_outliers")); - denoising_program.add_kernel(ustring("filter_combine_halves")); - denoising_program.add_kernel(ustring("filter_construct_transform")); - denoising_program.add_kernel(ustring("filter_nlm_calc_difference")); - denoising_program.add_kernel(ustring("filter_nlm_blur")); - denoising_program.add_kernel(ustring("filter_nlm_calc_weight")); - denoising_program.add_kernel(ustring("filter_nlm_update_output")); - denoising_program.add_kernel(ustring("filter_nlm_normalize")); - denoising_program.add_kernel(ustring("filter_nlm_construct_gramian")); - denoising_program.add_kernel(ustring("filter_finalize")); - programs.push_back(&denoising_program); - } - - load_required_kernel_task_pool.wait_work(); - - /* Parallel compilation of Cycles kernels, this launches 
multiple - * processes to workaround OpenCL frameworks serializing the calls - * internally within a single process. */ - foreach (OpenCLProgram *program, programs) { - if (!program->load()) { - load_kernel_num_compiling++; - load_kernel_task_pool.push([=] { - program->compile(); - load_kernel_num_compiling--; - }); - } - } - return true; -} - -void OpenCLDevice::load_required_kernels(const DeviceRequestedFeatures &requested_features) -{ - vector<OpenCLProgram *> programs; - base_program = OpenCLProgram( - this, "base", "kernel_base.cl", get_build_options(requested_features, "base")); - base_program.add_kernel(ustring("convert_to_byte")); - base_program.add_kernel(ustring("convert_to_half_float")); - base_program.add_kernel(ustring("zero_buffer")); - programs.push_back(&base_program); - - if (requested_features.use_true_displacement) { - displace_program = OpenCLProgram( - this, "displace", "kernel_displace.cl", get_build_options(requested_features, "displace")); - displace_program.add_kernel(ustring("displace")); - programs.push_back(&displace_program); - } - - if (requested_features.use_background_light) { - background_program = OpenCLProgram(this, - "background", - "kernel_background.cl", - get_build_options(requested_features, "background")); - background_program.add_kernel(ustring("background")); - programs.push_back(&background_program); - } - - if (requested_features.use_baking) { - bake_program = OpenCLProgram( - this, "bake", "kernel_bake.cl", get_build_options(requested_features, "bake")); - bake_program.add_kernel(ustring("bake")); - programs.push_back(&bake_program); - } - - foreach (OpenCLProgram *program, programs) { - if (!program->load()) { - load_required_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program)); - } - } -} - -bool OpenCLDevice::wait_for_availability(const DeviceRequestedFeatures &requested_features) -{ - if (requested_features.use_baking) { - /* For baking, kernels have already been loaded in load_required_kernels(). 
*/ - return true; - } - - load_kernel_task_pool.wait_work(); - return split_kernel->load_kernels(requested_features); -} - -OpenCLDevice::OpenCLSplitPrograms *OpenCLDevice::get_split_programs() -{ - return &kernel_programs; -} - -DeviceKernelStatus OpenCLDevice::get_active_kernel_switch_state() -{ - return DEVICE_KERNEL_USING_FEATURE_KERNEL; -} - -void OpenCLDevice::mem_alloc(device_memory &mem) -{ - if (mem.name) { - VLOG(1) << "Buffer allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - } - - size_t size = mem.memory_size(); - - /* check there is enough memory available for the allocation */ - cl_ulong max_alloc_size = 0; - clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL); - - if (DebugFlags().opencl.mem_limit) { - max_alloc_size = min(max_alloc_size, cl_ulong(DebugFlags().opencl.mem_limit - stats.mem_used)); - } - - if (size > max_alloc_size) { - string error = "Scene too complex to fit in available memory."; - if (mem.name != NULL) { - error += string_printf(" (allocating buffer %s failed.)", mem.name); - } - set_error(error); - - return; - } - - cl_mem_flags mem_flag; - void *mem_ptr = NULL; - - if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) - mem_flag = CL_MEM_READ_ONLY; - else - mem_flag = CL_MEM_READ_WRITE; - - /* Zero-size allocation might be invoked by render, but not really - * supported by OpenCL. Using NULL as device pointer also doesn't really - * work for some reason, so for the time being we'll use special case - * will null_mem buffer. 
- */ - if (size != 0) { - mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, mem_flag, size, mem_ptr, &ciErr); - opencl_assert_err(ciErr, "clCreateBuffer"); - } - else { - mem.device_pointer = 0; - } - - stats.mem_alloc(size); - mem.device_size = size; -} - -void OpenCLDevice::mem_copy_to(device_memory &mem) -{ - if (mem.type == MEM_GLOBAL) { - global_free(mem); - global_alloc(mem); - } - else if (mem.type == MEM_TEXTURE) { - tex_free((device_texture &)mem); - tex_alloc((device_texture &)mem); - } - else { - if (!mem.device_pointer) { - mem_alloc(mem); - } - - /* this is blocking */ - size_t size = mem.memory_size(); - if (size != 0) { - opencl_assert(clEnqueueWriteBuffer(cqCommandQueue, - CL_MEM_PTR(mem.device_pointer), - CL_TRUE, - 0, - size, - mem.host_pointer, - 0, - NULL, - NULL)); - } - } -} - -void OpenCLDevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem) -{ - size_t offset = elem * y * w; - size_t size = elem * w * h; - assert(size != 0); - opencl_assert(clEnqueueReadBuffer(cqCommandQueue, - CL_MEM_PTR(mem.device_pointer), - CL_TRUE, - offset, - size, - (uchar *)mem.host_pointer + offset, - 0, - NULL, - NULL)); -} - -void OpenCLDevice::mem_zero_kernel(device_ptr mem, size_t size) -{ - base_program.wait_for_availability(); - cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer")); - - size_t global_size[] = {1024, 1024}; - size_t num_threads = global_size[0] * global_size[1]; - - cl_mem d_buffer = CL_MEM_PTR(mem); - cl_ulong d_offset = 0; - cl_ulong d_size = 0; - - while (d_offset < size) { - d_size = std::min<cl_ulong>(num_threads * sizeof(float4), size - d_offset); - - kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset); - - ciErr = clEnqueueNDRangeKernel( - cqCommandQueue, ckZeroBuffer, 2, NULL, global_size, NULL, 0, NULL, NULL); - opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); - - d_offset += d_size; - } -} - -void OpenCLDevice::mem_zero(device_memory &mem) -{ - if (!mem.device_pointer) { - mem_alloc(mem); 
- } - - if (mem.device_pointer) { - if (base_program.is_loaded()) { - mem_zero_kernel(mem.device_pointer, mem.memory_size()); - } - - if (mem.host_pointer) { - memset(mem.host_pointer, 0, mem.memory_size()); - } - - if (!base_program.is_loaded()) { - void *zero = mem.host_pointer; - - if (!mem.host_pointer) { - zero = util_aligned_malloc(mem.memory_size(), 16); - memset(zero, 0, mem.memory_size()); - } - - opencl_assert(clEnqueueWriteBuffer(cqCommandQueue, - CL_MEM_PTR(mem.device_pointer), - CL_TRUE, - 0, - mem.memory_size(), - zero, - 0, - NULL, - NULL)); - - if (!mem.host_pointer) { - util_aligned_free(zero); - } - } - } -} - -void OpenCLDevice::mem_free(device_memory &mem) -{ - if (mem.type == MEM_GLOBAL) { - global_free(mem); - } - else if (mem.type == MEM_TEXTURE) { - tex_free((device_texture &)mem); - } - else { - if (mem.device_pointer) { - if (mem.device_pointer != 0) { - opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer))); - } - mem.device_pointer = 0; - - stats.mem_free(mem.device_size); - mem.device_size = 0; - } - } -} - -int OpenCLDevice::mem_sub_ptr_alignment() -{ - return OpenCLInfo::mem_sub_ptr_alignment(cdDevice); -} - -device_ptr OpenCLDevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int size) -{ - cl_mem_flags mem_flag; - if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) - mem_flag = CL_MEM_READ_ONLY; - else - mem_flag = CL_MEM_READ_WRITE; - - cl_buffer_region info; - info.origin = mem.memory_elements_size(offset); - info.size = mem.memory_elements_size(size); - - device_ptr sub_buf = (device_ptr)clCreateSubBuffer( - CL_MEM_PTR(mem.device_pointer), mem_flag, CL_BUFFER_CREATE_TYPE_REGION, &info, &ciErr); - opencl_assert_err(ciErr, "clCreateSubBuffer"); - return sub_buf; -} - -void OpenCLDevice::mem_free_sub_ptr(device_ptr device_pointer) -{ - if (device_pointer != 0) { - opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer))); - } -} - -void OpenCLDevice::const_copy_to(const char 
*name, void *host, size_t size) -{ - ConstMemMap::iterator i = const_mem_map.find(name); - device_vector<uchar> *data; - - if (i == const_mem_map.end()) { - data = new device_vector<uchar>(this, name, MEM_READ_ONLY); - data->alloc(size); - const_mem_map.insert(ConstMemMap::value_type(name, data)); - } - else { - data = i->second; - } - - memcpy(data->data(), host, size); - data->copy_to_device(); -} - -void OpenCLDevice::global_alloc(device_memory &mem) -{ - VLOG(1) << "Global memory allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - - memory_manager.alloc(mem.name, mem); - /* Set the pointer to non-null to keep code that inspects its value from thinking its - * unallocated. */ - mem.device_pointer = 1; - textures[mem.name] = &mem; - textures_need_update = true; -} - -void OpenCLDevice::global_free(device_memory &mem) -{ - if (mem.device_pointer) { - mem.device_pointer = 0; - - if (memory_manager.free(mem)) { - textures_need_update = true; - } - - foreach (TexturesMap::value_type &value, textures) { - if (value.second == &mem) { - textures.erase(value.first); - break; - } - } - } -} - -void OpenCLDevice::tex_alloc(device_texture &mem) -{ - VLOG(1) << "Texture allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - - memory_manager.alloc(mem.name, mem); - /* Set the pointer to non-null to keep code that inspects its value from thinking its - * unallocated. */ - mem.device_pointer = 1; - textures[mem.name] = &mem; - textures_need_update = true; -} - -void OpenCLDevice::tex_free(device_texture &mem) -{ - global_free(mem); -} - -size_t OpenCLDevice::global_size_round_up(int group_size, int global_size) -{ - int r = global_size % group_size; - return global_size + ((r == 0) ? 
0 : group_size - r); -} - -void OpenCLDevice::enqueue_kernel( - cl_kernel kernel, size_t w, size_t h, bool x_workgroups, size_t max_workgroup_size) -{ - size_t workgroup_size, max_work_items[3]; - - clGetKernelWorkGroupInfo( - kernel, cdDevice, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL); - clGetDeviceInfo( - cdDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, max_work_items, NULL); - - if (max_workgroup_size > 0 && workgroup_size > max_workgroup_size) { - workgroup_size = max_workgroup_size; - } - - /* Try to divide evenly over 2 dimensions. */ - size_t local_size[2]; - if (x_workgroups) { - local_size[0] = workgroup_size; - local_size[1] = 1; - } - else { - size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1); - local_size[0] = local_size[1] = sqrt_workgroup_size; - } - - /* Some implementations have max size 1 on 2nd dimension. */ - if (local_size[1] > max_work_items[1]) { - local_size[0] = workgroup_size / max_work_items[1]; - local_size[1] = max_work_items[1]; - } - - size_t global_size[2] = {global_size_round_up(local_size[0], w), - global_size_round_up(local_size[1], h)}; - - /* Vertical size of 1 is coming from bake/shade kernels where we should - * not round anything up because otherwise we'll either be doing too - * much work per pixel (if we don't check global ID on Y axis) or will - * be checking for global ID to always have Y of 0. 
- */ - if (h == 1) { - global_size[h] = 1; - } - - /* run kernel */ - opencl_assert( - clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL)); - opencl_assert(clFlush(cqCommandQueue)); -} - -void OpenCLDevice::set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name) -{ - cl_mem ptr; - - MemMap::iterator i = mem_map.find(name); - if (i != mem_map.end()) { - ptr = CL_MEM_PTR(i->second); - } - else { - ptr = 0; - } - - opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void *)&ptr)); -} - -void OpenCLDevice::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg) -{ - flush_texture_buffers(); - - memory_manager.set_kernel_arg_buffers(kernel, narg); -} - -void OpenCLDevice::flush_texture_buffers() -{ - if (!textures_need_update) { - return; - } - textures_need_update = false; - - /* Setup slots for textures. */ - int num_slots = 0; - - vector<texture_slot_t> texture_slots; - -# define KERNEL_TEX(type, name) \ - if (textures.find(#name) != textures.end()) { \ - texture_slots.push_back(texture_slot_t(#name, num_slots)); \ - } \ - num_slots++; -# include "kernel/kernel_textures.h" - - int num_data_slots = num_slots; - - foreach (TexturesMap::value_type &tex, textures) { - string name = tex.first; - device_memory *mem = tex.second; - - if (mem->type == MEM_TEXTURE) { - const uint id = ((device_texture *)mem)->slot; - texture_slots.push_back(texture_slot_t(name, num_data_slots + id)); - num_slots = max(num_slots, num_data_slots + id + 1); - } - } - - /* Realloc texture descriptors buffer. 
*/ - memory_manager.free(texture_info); - texture_info.resize(num_slots); - memory_manager.alloc("texture_info", texture_info); - - /* Fill in descriptors */ - foreach (texture_slot_t &slot, texture_slots) { - device_memory *mem = textures[slot.name]; - TextureInfo &info = texture_info[slot.slot]; - - MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name); - - if (mem->type == MEM_TEXTURE) { - info = ((device_texture *)mem)->info; - } - else { - memset(&info, 0, sizeof(TextureInfo)); - } - - info.data = desc.offset; - info.cl_buffer = desc.device_buffer; - } - - /* Force write of descriptors. */ - memory_manager.free(texture_info); - memory_manager.alloc("texture_info", texture_info); -} - -void OpenCLDevice::thread_run(DeviceTask &task) -{ - flush_texture_buffers(); - - if (task.type == DeviceTask::RENDER) { - RenderTile tile; - DenoisingTask denoising(this, task); - - /* Allocate buffer for kernel globals */ - device_only_memory<KernelGlobalsDummy> kgbuffer(this, "kernel_globals"); - kgbuffer.alloc_to_device(1); - - /* Keep rendering tiles until done. */ - while (task.acquire_tile(this, tile, task.tile_types)) { - if (tile.task == RenderTile::PATH_TRACE) { - assert(tile.task == RenderTile::PATH_TRACE); - scoped_timer timer(&tile.buffers->render_time); - - split_kernel->path_trace(task, tile, kgbuffer, *const_mem_map["__data"]); - - /* Complete kernel execution before release tile. */ - /* This helps in multi-device render; - * The device that reaches the critical-section function - * release_tile waits (stalling other devices from entering - * release_tile) for all kernels to complete. If device1 (a - * slow-render device) reaches release_tile first then it would - * stall device2 (a fast-render device) from proceeding to render - * next tile. 
- */ - clFinish(cqCommandQueue); - } - else if (tile.task == RenderTile::BAKE) { - bake(task, tile); - } - else if (tile.task == RenderTile::DENOISE) { - tile.sample = tile.start_sample + tile.num_samples; - denoise(tile, denoising); - task.update_progress(&tile, tile.w * tile.h); - } - - task.release_tile(tile); - } - - kgbuffer.free(); - } - else if (task.type == DeviceTask::SHADER) { - shader(task); - } - else if (task.type == DeviceTask::FILM_CONVERT) { - film_convert(task, task.buffer, task.rgba_byte, task.rgba_half); - } - else if (task.type == DeviceTask::DENOISE_BUFFER) { - RenderTile tile; - tile.x = task.x; - tile.y = task.y; - tile.w = task.w; - tile.h = task.h; - tile.buffer = task.buffer; - tile.sample = task.sample + task.num_samples; - tile.num_samples = task.num_samples; - tile.start_sample = task.sample; - tile.offset = task.offset; - tile.stride = task.stride; - tile.buffers = task.buffers; - - DenoisingTask denoising(this, task); - denoise(tile, denoising); - task.update_progress(&tile, tile.w * tile.h); - } -} - -void OpenCLDevice::film_convert(DeviceTask &task, - device_ptr buffer, - device_ptr rgba_byte, - device_ptr rgba_half) -{ - /* cast arguments to cl types */ - cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); - cl_mem d_rgba = (rgba_byte) ? CL_MEM_PTR(rgba_byte) : CL_MEM_PTR(rgba_half); - cl_mem d_buffer = CL_MEM_PTR(buffer); - cl_int d_x = task.x; - cl_int d_y = task.y; - cl_int d_w = task.w; - cl_int d_h = task.h; - cl_float d_sample_scale = 1.0f / (task.sample + 1); - cl_int d_offset = task.offset; - cl_int d_stride = task.stride; - - cl_kernel ckFilmConvertKernel = (rgba_byte) ? 
base_program(ustring("convert_to_byte")) : - base_program(ustring("convert_to_half_float")); - - cl_uint start_arg_index = kernel_set_args(ckFilmConvertKernel, 0, d_data, d_rgba, d_buffer); - - set_kernel_arg_buffers(ckFilmConvertKernel, &start_arg_index); - - start_arg_index += kernel_set_args(ckFilmConvertKernel, - start_arg_index, - d_sample_scale, - d_x, - d_y, - d_w, - d_h, - d_offset, - d_stride); - - enqueue_kernel(ckFilmConvertKernel, d_w, d_h); -} - -bool OpenCLDevice::denoising_non_local_means(device_ptr image_ptr, - device_ptr guide_ptr, - device_ptr variance_ptr, - device_ptr out_ptr, - DenoisingTask *task) -{ - int stride = task->buffer.stride; - int w = task->buffer.width; - int h = task->buffer.h; - int r = task->nlm_state.r; - int f = task->nlm_state.f; - float a = task->nlm_state.a; - float k_2 = task->nlm_state.k_2; - - int pass_stride = task->buffer.pass_stride; - int num_shifts = (2 * r + 1) * (2 * r + 1); - int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0; - - device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride * num_shifts); - device_sub_ptr blurDifference( - task->buffer.temporary_mem, pass_stride * num_shifts, pass_stride * num_shifts); - device_sub_ptr weightAccum( - task->buffer.temporary_mem, 2 * pass_stride * num_shifts, pass_stride); - cl_mem weightAccum_mem = CL_MEM_PTR(*weightAccum); - cl_mem difference_mem = CL_MEM_PTR(*difference); - cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference); - - cl_mem image_mem = CL_MEM_PTR(image_ptr); - cl_mem guide_mem = CL_MEM_PTR(guide_ptr); - cl_mem variance_mem = CL_MEM_PTR(variance_ptr); - cl_mem out_mem = CL_MEM_PTR(out_ptr); - cl_mem scale_mem = NULL; - - mem_zero_kernel(*weightAccum, sizeof(float) * pass_stride); - mem_zero_kernel(out_ptr, sizeof(float) * pass_stride); - - cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); - cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); - cl_kernel 
ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); - cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output")); - cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize")); - - kernel_set_args(ckNLMCalcDifference, - 0, - guide_mem, - variance_mem, - scale_mem, - difference_mem, - w, - h, - stride, - pass_stride, - r, - channel_offset, - 0, - a, - k_2); - kernel_set_args( - ckNLMBlur, 0, difference_mem, blurDifference_mem, w, h, stride, pass_stride, r, f); - kernel_set_args( - ckNLMCalcWeight, 0, blurDifference_mem, difference_mem, w, h, stride, pass_stride, r, f); - kernel_set_args(ckNLMUpdateOutput, - 0, - blurDifference_mem, - image_mem, - out_mem, - weightAccum_mem, - w, - h, - stride, - pass_stride, - channel_offset, - r, - f); - - enqueue_kernel(ckNLMCalcDifference, w * h, num_shifts, true); - enqueue_kernel(ckNLMBlur, w * h, num_shifts, true); - enqueue_kernel(ckNLMCalcWeight, w * h, num_shifts, true); - enqueue_kernel(ckNLMBlur, w * h, num_shifts, true); - enqueue_kernel(ckNLMUpdateOutput, w * h, num_shifts, true); - - kernel_set_args(ckNLMNormalize, 0, out_mem, weightAccum_mem, w, h, stride); - enqueue_kernel(ckNLMNormalize, w, h); - - return true; -} - -bool OpenCLDevice::denoising_construct_transform(DenoisingTask *task) -{ - cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); - cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); - cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); - cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); - - char use_time = task->buffer.use_time ? 
1 : 0; - - cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform")); - - int arg_ofs = kernel_set_args(ckFilterConstructTransform, 0, buffer_mem, tile_info_mem); - cl_mem buffers[9]; - for (int i = 0; i < 9; i++) { - buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); - arg_ofs += kernel_set_args(ckFilterConstructTransform, arg_ofs, buffers[i]); - } - kernel_set_args(ckFilterConstructTransform, - arg_ofs, - transform_mem, - rank_mem, - task->filter_area, - task->rect, - task->buffer.pass_stride, - task->buffer.frame_stride, - use_time, - task->radius, - task->pca_threshold); - - enqueue_kernel(ckFilterConstructTransform, task->storage.w, task->storage.h, 256); - - return true; -} - -bool OpenCLDevice::denoising_accumulate(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr scale_ptr, - int frame, - DenoisingTask *task) -{ - cl_mem color_mem = CL_MEM_PTR(color_ptr); - cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr); - cl_mem scale_mem = CL_MEM_PTR(scale_ptr); - - cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); - cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); - cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); - cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer); - cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer); - - cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); - cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); - cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); - cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian")); - - int w = task->reconstruction_state.source_w; - int h = task->reconstruction_state.source_h; - int stride = task->buffer.stride; - int frame_offset = frame * task->buffer.frame_stride; - int t = task->tile_info->frames[frame]; - char use_time = 
task->buffer.use_time ? 1 : 0; - - int r = task->radius; - int pass_stride = task->buffer.pass_stride; - int num_shifts = (2 * r + 1) * (2 * r + 1); - - device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride * num_shifts); - device_sub_ptr blurDifference( - task->buffer.temporary_mem, pass_stride * num_shifts, pass_stride * num_shifts); - cl_mem difference_mem = CL_MEM_PTR(*difference); - cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference); - - kernel_set_args(ckNLMCalcDifference, - 0, - color_mem, - color_variance_mem, - scale_mem, - difference_mem, - w, - h, - stride, - pass_stride, - r, - pass_stride, - frame_offset, - 1.0f, - task->nlm_k_2); - kernel_set_args( - ckNLMBlur, 0, difference_mem, blurDifference_mem, w, h, stride, pass_stride, r, 4); - kernel_set_args( - ckNLMCalcWeight, 0, blurDifference_mem, difference_mem, w, h, stride, pass_stride, r, 4); - kernel_set_args(ckNLMConstructGramian, - 0, - t, - blurDifference_mem, - buffer_mem, - transform_mem, - rank_mem, - XtWX_mem, - XtWY_mem, - task->reconstruction_state.filter_window, - w, - h, - stride, - pass_stride, - r, - 4, - frame_offset, - use_time); - - enqueue_kernel(ckNLMCalcDifference, w * h, num_shifts, true); - enqueue_kernel(ckNLMBlur, w * h, num_shifts, true); - enqueue_kernel(ckNLMCalcWeight, w * h, num_shifts, true); - enqueue_kernel(ckNLMBlur, w * h, num_shifts, true); - enqueue_kernel(ckNLMConstructGramian, w * h, num_shifts, true, 256); - - return true; -} - -bool OpenCLDevice::denoising_solve(device_ptr output_ptr, DenoisingTask *task) -{ - cl_kernel ckFinalize = denoising_program(ustring("filter_finalize")); - - cl_mem output_mem = CL_MEM_PTR(output_ptr); - cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); - cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer); - cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer); - - int w = task->reconstruction_state.source_w; - int h = task->reconstruction_state.source_h; - - 
kernel_set_args(ckFinalize, - 0, - output_mem, - rank_mem, - XtWX_mem, - XtWY_mem, - task->filter_area, - task->reconstruction_state.buffer_params, - task->render_buffer.samples); - enqueue_kernel(ckFinalize, w, h); - - return true; -} - -bool OpenCLDevice::denoising_combine_halves(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr mean_ptr, - device_ptr variance_ptr, - int r, - int4 rect, - DenoisingTask *task) -{ - cl_mem a_mem = CL_MEM_PTR(a_ptr); - cl_mem b_mem = CL_MEM_PTR(b_ptr); - cl_mem mean_mem = CL_MEM_PTR(mean_ptr); - cl_mem variance_mem = CL_MEM_PTR(variance_ptr); - - cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves")); - - kernel_set_args(ckFilterCombineHalves, 0, mean_mem, variance_mem, a_mem, b_mem, rect, r); - enqueue_kernel(ckFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y); - - return true; -} - -bool OpenCLDevice::denoising_divide_shadow(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr sample_variance_ptr, - device_ptr sv_variance_ptr, - device_ptr buffer_variance_ptr, - DenoisingTask *task) -{ - cl_mem a_mem = CL_MEM_PTR(a_ptr); - cl_mem b_mem = CL_MEM_PTR(b_ptr); - cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr); - cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr); - cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr); - - cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); - - cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow")); - - int arg_ofs = kernel_set_args( - ckFilterDivideShadow, 0, task->render_buffer.samples, tile_info_mem); - cl_mem buffers[9]; - for (int i = 0; i < 9; i++) { - buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); - arg_ofs += kernel_set_args(ckFilterDivideShadow, arg_ofs, buffers[i]); - } - kernel_set_args(ckFilterDivideShadow, - arg_ofs, - a_mem, - b_mem, - sample_variance_mem, - sv_variance_mem, - buffer_variance_mem, - task->rect, - task->render_buffer.pass_stride, - 
task->render_buffer.offset); - enqueue_kernel(ckFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y); - - return true; -} - -bool OpenCLDevice::denoising_get_feature(int mean_offset, - int variance_offset, - device_ptr mean_ptr, - device_ptr variance_ptr, - float scale, - DenoisingTask *task) -{ - cl_mem mean_mem = CL_MEM_PTR(mean_ptr); - cl_mem variance_mem = CL_MEM_PTR(variance_ptr); - - cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); - - cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature")); - - int arg_ofs = kernel_set_args(ckFilterGetFeature, 0, task->render_buffer.samples, tile_info_mem); - cl_mem buffers[9]; - for (int i = 0; i < 9; i++) { - buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); - arg_ofs += kernel_set_args(ckFilterGetFeature, arg_ofs, buffers[i]); - } - kernel_set_args(ckFilterGetFeature, - arg_ofs, - mean_offset, - variance_offset, - mean_mem, - variance_mem, - scale, - task->rect, - task->render_buffer.pass_stride, - task->render_buffer.offset); - enqueue_kernel(ckFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y); - - return true; -} - -bool OpenCLDevice::denoising_write_feature(int out_offset, - device_ptr from_ptr, - device_ptr buffer_ptr, - DenoisingTask *task) -{ - cl_mem from_mem = CL_MEM_PTR(from_ptr); - cl_mem buffer_mem = CL_MEM_PTR(buffer_ptr); - - cl_kernel ckFilterWriteFeature = denoising_program(ustring("filter_write_feature")); - - kernel_set_args(ckFilterWriteFeature, - 0, - task->render_buffer.samples, - task->reconstruction_state.buffer_params, - task->filter_area, - from_mem, - buffer_mem, - out_offset, - task->rect); - enqueue_kernel(ckFilterWriteFeature, task->filter_area.z, task->filter_area.w); - - return true; -} - -bool OpenCLDevice::denoising_detect_outliers(device_ptr image_ptr, - device_ptr variance_ptr, - device_ptr depth_ptr, - device_ptr output_ptr, - DenoisingTask *task) -{ - cl_mem image_mem = 
CL_MEM_PTR(image_ptr); - cl_mem variance_mem = CL_MEM_PTR(variance_ptr); - cl_mem depth_mem = CL_MEM_PTR(depth_ptr); - cl_mem output_mem = CL_MEM_PTR(output_ptr); - - cl_kernel ckFilterDetectOutliers = denoising_program(ustring("filter_detect_outliers")); - - kernel_set_args(ckFilterDetectOutliers, - 0, - image_mem, - variance_mem, - depth_mem, - output_mem, - task->rect, - task->buffer.pass_stride); - enqueue_kernel(ckFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y); - - return true; -} - -void OpenCLDevice::denoise(RenderTile &rtile, DenoisingTask &denoising) -{ - denoising.functions.construct_transform = function_bind( - &OpenCLDevice::denoising_construct_transform, this, &denoising); - denoising.functions.accumulate = function_bind( - &OpenCLDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); - denoising.functions.solve = function_bind(&OpenCLDevice::denoising_solve, this, _1, &denoising); - denoising.functions.divide_shadow = function_bind( - &OpenCLDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.non_local_means = function_bind( - &OpenCLDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); - denoising.functions.combine_halves = function_bind( - &OpenCLDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); - denoising.functions.get_feature = function_bind( - &OpenCLDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.write_feature = function_bind( - &OpenCLDevice::denoising_write_feature, this, _1, _2, _3, &denoising); - denoising.functions.detect_outliers = function_bind( - &OpenCLDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); - - denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); - denoising.render_buffer.samples = rtile.sample; - denoising.buffer.gpu_temporary_mem = true; - - denoising.run_denoising(rtile); -} - -void OpenCLDevice::shader(DeviceTask 
&task) -{ - /* cast arguments to cl types */ - cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); - cl_mem d_input = CL_MEM_PTR(task.shader_input); - cl_mem d_output = CL_MEM_PTR(task.shader_output); - cl_int d_shader_eval_type = task.shader_eval_type; - cl_int d_shader_filter = task.shader_filter; - cl_int d_shader_x = task.shader_x; - cl_int d_shader_w = task.shader_w; - cl_int d_offset = task.offset; - - OpenCLDevice::OpenCLProgram *program = &background_program; - if (task.shader_eval_type == SHADER_EVAL_DISPLACE) { - program = &displace_program; - } - program->wait_for_availability(); - cl_kernel kernel = (*program)(); - - cl_uint start_arg_index = kernel_set_args(kernel, 0, d_data, d_input, d_output); - - set_kernel_arg_buffers(kernel, &start_arg_index); - - start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_eval_type); - if (task.shader_eval_type >= SHADER_EVAL_BAKE) { - start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_filter); - } - start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_x, d_shader_w, d_offset); - - for (int sample = 0; sample < task.num_samples; sample++) { - - if (task.get_cancel()) - break; - - kernel_set_args(kernel, start_arg_index, sample); - - enqueue_kernel(kernel, task.shader_w, 1); - - clFinish(cqCommandQueue); - - task.update_progress(NULL); - } -} - -void OpenCLDevice::bake(DeviceTask &task, RenderTile &rtile) -{ - scoped_timer timer(&rtile.buffers->render_time); - - /* Cast arguments to cl types. 
*/ - cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); - cl_mem d_buffer = CL_MEM_PTR(rtile.buffer); - cl_int d_x = rtile.x; - cl_int d_y = rtile.y; - cl_int d_w = rtile.w; - cl_int d_h = rtile.h; - cl_int d_offset = rtile.offset; - cl_int d_stride = rtile.stride; - - bake_program.wait_for_availability(); - cl_kernel kernel = bake_program(); - - cl_uint start_arg_index = kernel_set_args(kernel, 0, d_data, d_buffer); - - set_kernel_arg_buffers(kernel, &start_arg_index); - - start_arg_index += kernel_set_args( - kernel, start_arg_index, d_x, d_y, d_w, d_h, d_offset, d_stride); - - int start_sample = rtile.start_sample; - int end_sample = rtile.start_sample + rtile.num_samples; - - for (int sample = start_sample; sample < end_sample; sample++) { - if (task.get_cancel()) { - if (task.need_finish_queue == false) - break; - } - - kernel_set_args(kernel, start_arg_index, sample); - - enqueue_kernel(kernel, d_w, d_h); - clFinish(cqCommandQueue); - - rtile.sample = sample + 1; - - task.update_progress(&rtile, rtile.w * rtile.h); - } -} - -static bool kernel_build_opencl_2(cl_device_id cdDevice) -{ - /* Build with OpenCL 2.0 if available, this improves performance - * with AMD OpenCL drivers on Windows and Linux (legacy drivers). - * Note that OpenCL selects the highest 1.x version by default, - * only for 2.0 do we need the explicit compiler flag. */ - int version_major, version_minor; - if (OpenCLInfo::get_device_version(cdDevice, &version_major, &version_minor)) { - if (version_major >= 2) { - /* This appears to trigger a driver bug in Radeon RX cards with certain - * driver version, so don't use OpenCL 2.0 for those. 
*/ - string device_name = OpenCLInfo::get_readable_device_name(cdDevice); - if (string_startswith(device_name, "Radeon RX 4") || - string_startswith(device_name, "Radeon (TM) RX 4") || - string_startswith(device_name, "Radeon RX 5") || - string_startswith(device_name, "Radeon (TM) RX 5")) { - char version[256] = ""; - int driver_major, driver_minor; - clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL); - if (sscanf(version, "OpenCL 2.0 AMD-APP (%d.%d)", &driver_major, &driver_minor) == 2) { - return !(driver_major == 3075 && driver_minor <= 12); - } - } - - return true; - } - } - - return false; -} - -string OpenCLDevice::kernel_build_options(const string *debug_src) -{ - string build_options = "-cl-no-signed-zeros -cl-mad-enable "; - - if (kernel_build_opencl_2(cdDevice)) { - build_options += "-cl-std=CL2.0 "; - } - - if (platform_name == "NVIDIA CUDA") { - build_options += - "-D__KERNEL_OPENCL_NVIDIA__ " - "-cl-nv-maxrregcount=32 " - "-cl-nv-verbose "; - - uint compute_capability_major, compute_capability_minor; - clGetDeviceInfo(cdDevice, - CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, - sizeof(cl_uint), - &compute_capability_major, - NULL); - clGetDeviceInfo(cdDevice, - CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, - sizeof(cl_uint), - &compute_capability_minor, - NULL); - - build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ", - compute_capability_major * 100 + compute_capability_minor * 10); - } - - else if (platform_name == "Apple") - build_options += "-D__KERNEL_OPENCL_APPLE__ "; - - else if (platform_name == "AMD Accelerated Parallel Processing") - build_options += "-D__KERNEL_OPENCL_AMD__ "; - - else if (platform_name == "Intel(R) OpenCL") { - build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ "; - - /* Options for gdb source level kernel debugging. - * this segfaults on linux currently. 
- */ - if (OpenCLInfo::use_debug() && debug_src) - build_options += "-g -s \"" + *debug_src + "\" "; - } - - if (info.has_half_images) { - build_options += "-D__KERNEL_CL_KHR_FP16__ "; - } - - if (OpenCLInfo::use_debug()) { - build_options += "-D__KERNEL_OPENCL_DEBUG__ "; - } - -# ifdef WITH_NANOVDB - if (info.has_nanovdb) { - build_options += "-DWITH_NANOVDB "; - } -# endif - - return build_options; -} - -/* TODO(sergey): In the future we can use variadic templates, once - * C++0x is allowed. Should allow to clean this up a bit. - */ -int OpenCLDevice::kernel_set_args(cl_kernel kernel, - int start_argument_index, - const ArgumentWrapper &arg1, - const ArgumentWrapper &arg2, - const ArgumentWrapper &arg3, - const ArgumentWrapper &arg4, - const ArgumentWrapper &arg5, - const ArgumentWrapper &arg6, - const ArgumentWrapper &arg7, - const ArgumentWrapper &arg8, - const ArgumentWrapper &arg9, - const ArgumentWrapper &arg10, - const ArgumentWrapper &arg11, - const ArgumentWrapper &arg12, - const ArgumentWrapper &arg13, - const ArgumentWrapper &arg14, - const ArgumentWrapper &arg15, - const ArgumentWrapper &arg16, - const ArgumentWrapper &arg17, - const ArgumentWrapper &arg18, - const ArgumentWrapper &arg19, - const ArgumentWrapper &arg20, - const ArgumentWrapper &arg21, - const ArgumentWrapper &arg22, - const ArgumentWrapper &arg23, - const ArgumentWrapper &arg24, - const ArgumentWrapper &arg25, - const ArgumentWrapper &arg26, - const ArgumentWrapper &arg27, - const ArgumentWrapper &arg28, - const ArgumentWrapper &arg29, - const ArgumentWrapper &arg30, - const ArgumentWrapper &arg31, - const ArgumentWrapper &arg32, - const ArgumentWrapper &arg33) -{ - int current_arg_index = 0; -# define FAKE_VARARG_HANDLE_ARG(arg) \ - do { \ - if (arg.pointer != NULL) { \ - opencl_assert(clSetKernelArg( \ - kernel, start_argument_index + current_arg_index, arg.size, arg.pointer)); \ - ++current_arg_index; \ - } \ - else { \ - return current_arg_index; \ - } \ - } while (false) - 
FAKE_VARARG_HANDLE_ARG(arg1); - FAKE_VARARG_HANDLE_ARG(arg2); - FAKE_VARARG_HANDLE_ARG(arg3); - FAKE_VARARG_HANDLE_ARG(arg4); - FAKE_VARARG_HANDLE_ARG(arg5); - FAKE_VARARG_HANDLE_ARG(arg6); - FAKE_VARARG_HANDLE_ARG(arg7); - FAKE_VARARG_HANDLE_ARG(arg8); - FAKE_VARARG_HANDLE_ARG(arg9); - FAKE_VARARG_HANDLE_ARG(arg10); - FAKE_VARARG_HANDLE_ARG(arg11); - FAKE_VARARG_HANDLE_ARG(arg12); - FAKE_VARARG_HANDLE_ARG(arg13); - FAKE_VARARG_HANDLE_ARG(arg14); - FAKE_VARARG_HANDLE_ARG(arg15); - FAKE_VARARG_HANDLE_ARG(arg16); - FAKE_VARARG_HANDLE_ARG(arg17); - FAKE_VARARG_HANDLE_ARG(arg18); - FAKE_VARARG_HANDLE_ARG(arg19); - FAKE_VARARG_HANDLE_ARG(arg20); - FAKE_VARARG_HANDLE_ARG(arg21); - FAKE_VARARG_HANDLE_ARG(arg22); - FAKE_VARARG_HANDLE_ARG(arg23); - FAKE_VARARG_HANDLE_ARG(arg24); - FAKE_VARARG_HANDLE_ARG(arg25); - FAKE_VARARG_HANDLE_ARG(arg26); - FAKE_VARARG_HANDLE_ARG(arg27); - FAKE_VARARG_HANDLE_ARG(arg28); - FAKE_VARARG_HANDLE_ARG(arg29); - FAKE_VARARG_HANDLE_ARG(arg30); - FAKE_VARARG_HANDLE_ARG(arg31); - FAKE_VARARG_HANDLE_ARG(arg32); - FAKE_VARARG_HANDLE_ARG(arg33); -# undef FAKE_VARARG_HANDLE_ARG - return current_arg_index; -} - -void OpenCLDevice::release_kernel_safe(cl_kernel kernel) -{ - if (kernel) { - clReleaseKernel(kernel); - } -} - -void OpenCLDevice::release_mem_object_safe(cl_mem mem) -{ - if (mem != NULL) { - clReleaseMemObject(mem); - } -} - -void OpenCLDevice::release_program_safe(cl_program program) -{ - if (program) { - clReleaseProgram(program); - } -} - -/* ** Those guys are for working around some compiler-specific bugs ** */ - -cl_program OpenCLDevice::load_cached_kernel(ustring key, thread_scoped_lock &cache_locker) -{ - return OpenCLCache::get_program(cpPlatform, cdDevice, key, cache_locker); -} - -void OpenCLDevice::store_cached_kernel(cl_program program, - ustring key, - thread_scoped_lock &cache_locker) -{ - OpenCLCache::store_program(cpPlatform, cdDevice, program, key, cache_locker); -} - -Device *opencl_create_split_device(DeviceInfo &info, - 
Stats &stats, - Profiler &profiler, - bool background) -{ - return new OpenCLDevice(info, stats, profiler, background); -} - -CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/device/opencl/memory_manager.cpp b/intern/cycles/device/opencl/memory_manager.cpp deleted file mode 100644 index 4330e07cb37..00000000000 --- a/intern/cycles/device/opencl/memory_manager.cpp +++ /dev/null @@ -1,264 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifdef WITH_OPENCL - -# include "util/util_foreach.h" - -# include "device/opencl/device_opencl.h" -# include "device/opencl/memory_manager.h" - -CCL_NAMESPACE_BEGIN - -void MemoryManager::DeviceBuffer::add_allocation(Allocation &allocation) -{ - allocations.push_back(&allocation); -} - -void MemoryManager::DeviceBuffer::update_device_memory(OpenCLDevice *device) -{ - bool need_realloc = false; - - /* Calculate total size and remove any freed. */ - size_t total_size = 0; - - for (int i = allocations.size() - 1; i >= 0; i--) { - Allocation *allocation = allocations[i]; - - /* Remove allocations that have been freed. */ - if (!allocation->mem || allocation->mem->memory_size() == 0) { - allocation->device_buffer = NULL; - allocation->size = 0; - - allocations.erase(allocations.begin() + i); - - need_realloc = true; - - continue; - } - - /* Get actual size for allocation. 
*/ - size_t alloc_size = align_up(allocation->mem->memory_size(), 16); - - if (allocation->size != alloc_size) { - /* Allocation is either new or resized. */ - allocation->size = alloc_size; - allocation->needs_copy_to_device = true; - - need_realloc = true; - } - - total_size += alloc_size; - } - - /* Always allocate non-empty buffer, NULL pointers cause problems with some drivers. */ - total_size = std::max(total_size, (size_t)16); - - if (need_realloc) { - cl_ulong max_buffer_size; - clGetDeviceInfo( - device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL); - - if (total_size > max_buffer_size) { - device->set_error("Scene too complex to fit in available memory."); - return; - } - - device_only_memory<uchar> *new_buffer = new device_only_memory<uchar>(device, - "memory manager buffer"); - - new_buffer->alloc_to_device(total_size); - - size_t offset = 0; - - foreach (Allocation *allocation, allocations) { - if (allocation->needs_copy_to_device) { - /* Copy from host to device. */ - opencl_device_assert(device, - clEnqueueWriteBuffer(device->cqCommandQueue, - CL_MEM_PTR(new_buffer->device_pointer), - CL_FALSE, - offset, - allocation->mem->memory_size(), - allocation->mem->host_pointer, - 0, - NULL, - NULL)); - - allocation->needs_copy_to_device = false; - } - else { - /* Fast copy from memory already on device. */ - opencl_device_assert(device, - clEnqueueCopyBuffer(device->cqCommandQueue, - CL_MEM_PTR(buffer->device_pointer), - CL_MEM_PTR(new_buffer->device_pointer), - allocation->desc.offset, - offset, - allocation->mem->memory_size(), - 0, - NULL, - NULL)); - } - - allocation->desc.offset = offset; - offset += allocation->size; - } - - delete buffer; - - buffer = new_buffer; - } - else { - assert(total_size == buffer->data_size); - - size_t offset = 0; - - foreach (Allocation *allocation, allocations) { - if (allocation->needs_copy_to_device) { - /* Copy from host to device. 
*/ - opencl_device_assert(device, - clEnqueueWriteBuffer(device->cqCommandQueue, - CL_MEM_PTR(buffer->device_pointer), - CL_FALSE, - offset, - allocation->mem->memory_size(), - allocation->mem->host_pointer, - 0, - NULL, - NULL)); - - allocation->needs_copy_to_device = false; - } - - offset += allocation->size; - } - } - - /* Not really necessary, but seems to improve responsiveness for some reason. */ - clFinish(device->cqCommandQueue); -} - -void MemoryManager::DeviceBuffer::free(OpenCLDevice *) -{ - buffer->free(); -} - -MemoryManager::DeviceBuffer *MemoryManager::smallest_device_buffer() -{ - DeviceBuffer *smallest = device_buffers; - - foreach (DeviceBuffer &device_buffer, device_buffers) { - if (device_buffer.size < smallest->size) { - smallest = &device_buffer; - } - } - - return smallest; -} - -MemoryManager::MemoryManager(OpenCLDevice *device) : device(device), need_update(false) -{ - foreach (DeviceBuffer &device_buffer, device_buffers) { - device_buffer.buffer = new device_only_memory<uchar>(device, "memory manager buffer"); - } -} - -void MemoryManager::free() -{ - foreach (DeviceBuffer &device_buffer, device_buffers) { - device_buffer.free(device); - } -} - -void MemoryManager::alloc(const char *name, device_memory &mem) -{ - Allocation &allocation = allocations[name]; - - allocation.mem = &mem; - allocation.needs_copy_to_device = true; - - if (!allocation.device_buffer) { - DeviceBuffer *device_buffer = smallest_device_buffer(); - allocation.device_buffer = device_buffer; - - allocation.desc.device_buffer = device_buffer - device_buffers; - - device_buffer->add_allocation(allocation); - - device_buffer->size += mem.memory_size(); - } - - need_update = true; -} - -bool MemoryManager::free(device_memory &mem) -{ - foreach (AllocationsMap::value_type &value, allocations) { - Allocation &allocation = value.second; - if (allocation.mem == &mem) { - - allocation.device_buffer->size -= mem.memory_size(); - - allocation.mem = NULL; - 
allocation.needs_copy_to_device = false; - - need_update = true; - return true; - } - } - - return false; -} - -MemoryManager::BufferDescriptor MemoryManager::get_descriptor(string name) -{ - update_device_memory(); - - Allocation &allocation = allocations[name]; - return allocation.desc; -} - -void MemoryManager::update_device_memory() -{ - if (!need_update) { - return; - } - - need_update = false; - - foreach (DeviceBuffer &device_buffer, device_buffers) { - device_buffer.update_device_memory(device); - } -} - -void MemoryManager::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg) -{ - update_device_memory(); - - foreach (DeviceBuffer &device_buffer, device_buffers) { - if (device_buffer.buffer->device_pointer) { - device->kernel_set_args(kernel, (*narg)++, *device_buffer.buffer); - } - else { - device->kernel_set_args(kernel, (*narg)++); - } - } -} - -CCL_NAMESPACE_END - -#endif /* WITH_OPENCL */ diff --git a/intern/cycles/device/opencl/memory_manager.h b/intern/cycles/device/opencl/memory_manager.h deleted file mode 100644 index 23624f837a6..00000000000 --- a/intern/cycles/device/opencl/memory_manager.h +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include "device/device.h" - -#include "util/util_map.h" -#include "util/util_string.h" -#include "util/util_vector.h" - -#include "clew.h" - -CCL_NAMESPACE_BEGIN - -class OpenCLDevice; - -class MemoryManager { - public: - static const int NUM_DEVICE_BUFFERS = 8; - - struct BufferDescriptor { - uint device_buffer; - cl_ulong offset; - }; - - private: - struct DeviceBuffer; - - struct Allocation { - device_memory *mem; - - DeviceBuffer *device_buffer; - size_t size; /* Size of actual allocation, may be larger than requested. */ - - BufferDescriptor desc; - - bool needs_copy_to_device; - - Allocation() : mem(NULL), device_buffer(NULL), size(0), needs_copy_to_device(false) - { - } - }; - - struct DeviceBuffer { - device_only_memory<uchar> *buffer; - vector<Allocation *> allocations; - size_t size; /* Size of all allocations. */ - - DeviceBuffer() : buffer(NULL), size(0) - { - } - - ~DeviceBuffer() - { - delete buffer; - buffer = NULL; - } - - void add_allocation(Allocation &allocation); - - void update_device_memory(OpenCLDevice *device); - - void free(OpenCLDevice *device); - }; - - OpenCLDevice *device; - - DeviceBuffer device_buffers[NUM_DEVICE_BUFFERS]; - - typedef unordered_map<string, Allocation> AllocationsMap; - AllocationsMap allocations; - - bool need_update; - - DeviceBuffer *smallest_device_buffer(); - - public: - MemoryManager(OpenCLDevice *device); - - void free(); /* Free all memory. 
*/ - - void alloc(const char *name, device_memory &mem); - bool free(device_memory &mem); - - BufferDescriptor get_descriptor(string name); - - void update_device_memory(); - void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg); -}; - -CCL_NAMESPACE_END diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp deleted file mode 100644 index 3929cf77f15..00000000000 --- a/intern/cycles/device/opencl/opencl_util.cpp +++ /dev/null @@ -1,1326 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifdef WITH_OPENCL - -# include "device/device_intern.h" -# include "device/opencl/device_opencl.h" - -# include "util/util_debug.h" -# include "util/util_logging.h" -# include "util/util_md5.h" -# include "util/util_path.h" -# include "util/util_semaphore.h" -# include "util/util_system.h" -# include "util/util_time.h" - -using std::cerr; -using std::endl; - -CCL_NAMESPACE_BEGIN - -OpenCLCache::Slot::ProgramEntry::ProgramEntry() : program(NULL), mutex(NULL) -{ -} - -OpenCLCache::Slot::ProgramEntry::ProgramEntry(const ProgramEntry &rhs) - : program(rhs.program), mutex(NULL) -{ -} - -OpenCLCache::Slot::ProgramEntry::~ProgramEntry() -{ - delete mutex; -} - -OpenCLCache::Slot::Slot() : context_mutex(NULL), context(NULL) -{ -} - -OpenCLCache::Slot::Slot(const Slot &rhs) - : context_mutex(NULL), context(NULL), programs(rhs.programs) -{ -} - -OpenCLCache::Slot::~Slot() -{ - delete context_mutex; -} - -OpenCLCache &OpenCLCache::global_instance() -{ - static OpenCLCache instance; - return instance; -} - -cl_context OpenCLCache::get_context(cl_platform_id platform, - cl_device_id device, - thread_scoped_lock &slot_locker) -{ - assert(platform != NULL); - - OpenCLCache &self = global_instance(); - - thread_scoped_lock cache_lock(self.cache_lock); - - pair<CacheMap::iterator, bool> ins = self.cache.insert( - CacheMap::value_type(PlatformDevicePair(platform, device), Slot())); - - Slot &slot = ins.first->second; - - /* create slot lock only while holding cache lock */ - if (!slot.context_mutex) - slot.context_mutex = new thread_mutex; - - /* need to unlock cache before locking slot, to allow store to complete */ - cache_lock.unlock(); - - /* lock the slot */ - slot_locker = thread_scoped_lock(*slot.context_mutex); - - /* If the thing isn't cached */ - if (slot.context == NULL) { - /* return with the caller's lock holder holding the slot lock */ - return NULL; - } - - /* the item was already cached, release the slot lock */ - slot_locker.unlock(); - - cl_int ciErr = 
clRetainContext(slot.context); - assert(ciErr == CL_SUCCESS); - (void)ciErr; - - return slot.context; -} - -cl_program OpenCLCache::get_program(cl_platform_id platform, - cl_device_id device, - ustring key, - thread_scoped_lock &slot_locker) -{ - assert(platform != NULL); - - OpenCLCache &self = global_instance(); - - thread_scoped_lock cache_lock(self.cache_lock); - - pair<CacheMap::iterator, bool> ins = self.cache.insert( - CacheMap::value_type(PlatformDevicePair(platform, device), Slot())); - - Slot &slot = ins.first->second; - - pair<Slot::EntryMap::iterator, bool> ins2 = slot.programs.insert( - Slot::EntryMap::value_type(key, Slot::ProgramEntry())); - - Slot::ProgramEntry &entry = ins2.first->second; - - /* create slot lock only while holding cache lock */ - if (!entry.mutex) - entry.mutex = new thread_mutex; - - /* need to unlock cache before locking slot, to allow store to complete */ - cache_lock.unlock(); - - /* lock the slot */ - slot_locker = thread_scoped_lock(*entry.mutex); - - /* If the thing isn't cached */ - if (entry.program == NULL) { - /* return with the caller's lock holder holding the slot lock */ - return NULL; - } - - /* the item was already cached, release the slot lock */ - slot_locker.unlock(); - - cl_int ciErr = clRetainProgram(entry.program); - assert(ciErr == CL_SUCCESS); - (void)ciErr; - - return entry.program; -} - -void OpenCLCache::store_context(cl_platform_id platform, - cl_device_id device, - cl_context context, - thread_scoped_lock &slot_locker) -{ - assert(platform != NULL); - assert(device != NULL); - assert(context != NULL); - - OpenCLCache &self = global_instance(); - - thread_scoped_lock cache_lock(self.cache_lock); - CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device)); - cache_lock.unlock(); - - Slot &slot = i->second; - - /* sanity check */ - assert(i != self.cache.end()); - assert(slot.context == NULL); - - slot.context = context; - - /* unlock the slot */ - slot_locker.unlock(); - - /* increment 
reference count in OpenCL. - * The caller is going to release the object when done with it. */ - cl_int ciErr = clRetainContext(context); - assert(ciErr == CL_SUCCESS); - (void)ciErr; -} - -void OpenCLCache::store_program(cl_platform_id platform, - cl_device_id device, - cl_program program, - ustring key, - thread_scoped_lock &slot_locker) -{ - assert(platform != NULL); - assert(device != NULL); - assert(program != NULL); - - OpenCLCache &self = global_instance(); - - thread_scoped_lock cache_lock(self.cache_lock); - - CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device)); - assert(i != self.cache.end()); - Slot &slot = i->second; - - Slot::EntryMap::iterator i2 = slot.programs.find(key); - assert(i2 != slot.programs.end()); - Slot::ProgramEntry &entry = i2->second; - - assert(entry.program == NULL); - - cache_lock.unlock(); - - entry.program = program; - - /* unlock the slot */ - slot_locker.unlock(); - - /* Increment reference count in OpenCL. - * The caller is going to release the object when done with it. - */ - cl_int ciErr = clRetainProgram(program); - assert(ciErr == CL_SUCCESS); - (void)ciErr; -} - -string OpenCLCache::get_kernel_md5() -{ - OpenCLCache &self = global_instance(); - thread_scoped_lock lock(self.kernel_md5_lock); - - if (self.kernel_md5.empty()) { - self.kernel_md5 = path_files_md5_hash(path_get("source")); - } - return self.kernel_md5; -} - -static string get_program_source(const string &kernel_file) -{ - string source = "#include \"kernel/kernels/opencl/" + kernel_file + "\"\n"; - /* We compile kernels consisting of many files. unfortunately OpenCL - * kernel caches do not seem to recognize changes in included files. - * so we force recompile on changes by adding the md5 hash of all files. 
- */ - source = path_source_replace_includes(source, path_get("source")); - source += "\n// " + util_md5_string(source) + "\n"; - return source; -} - -OpenCLDevice::OpenCLProgram::OpenCLProgram(OpenCLDevice *device, - const string &program_name, - const string &kernel_file, - const string &kernel_build_options, - bool use_stdout) - : device(device), - program_name(program_name), - kernel_file(kernel_file), - kernel_build_options(kernel_build_options), - use_stdout(use_stdout) -{ - loaded = false; - needs_compiling = true; - program = NULL; -} - -OpenCLDevice::OpenCLProgram::~OpenCLProgram() -{ - release(); -} - -void OpenCLDevice::OpenCLProgram::release() -{ - for (map<ustring, cl_kernel>::iterator kernel = kernels.begin(); kernel != kernels.end(); - ++kernel) { - if (kernel->second) { - clReleaseKernel(kernel->second); - kernel->second = NULL; - } - } - if (program) { - clReleaseProgram(program); - program = NULL; - } -} - -void OpenCLDevice::OpenCLProgram::add_log(const string &msg, bool debug) -{ - if (!use_stdout) { - log += msg + "\n"; - } - else if (!debug) { - printf("%s\n", msg.c_str()); - fflush(stdout); - } - else { - VLOG(2) << msg; - } -} - -void OpenCLDevice::OpenCLProgram::add_error(const string &msg) -{ - if (use_stdout) { - fprintf(stderr, "%s\n", msg.c_str()); - } - if (error_msg == "") { - error_msg += "\n"; - } - error_msg += msg; -} - -void OpenCLDevice::OpenCLProgram::add_kernel(ustring name) -{ - if (!kernels.count(name)) { - kernels[name] = NULL; - } -} - -bool OpenCLDevice::OpenCLProgram::build_kernel(const string *debug_src) -{ - string build_options; - build_options = device->kernel_build_options(debug_src) + kernel_build_options; - - VLOG(1) << "Build options passed to clBuildProgram: '" << build_options << "'."; - cl_int ciErr = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL); - - /* show warnings even if build is successful */ - size_t ret_val_size = 0; - - clGetProgramBuildInfo(program, device->cdDevice, 
CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size); - - if (ciErr != CL_SUCCESS) { - add_error(string("OpenCL build failed with error ") + clewErrorString(ciErr) + - ", errors in console."); - } - - if (ret_val_size > 1) { - vector<char> build_log(ret_val_size + 1); - clGetProgramBuildInfo( - program, device->cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], NULL); - - build_log[ret_val_size] = '\0'; - /* Skip meaningless empty output from the NVidia compiler. */ - if (!(ret_val_size == 2 && build_log[0] == '\n')) { - add_log(string("OpenCL program ") + program_name + " build output: " + string(&build_log[0]), - ciErr == CL_SUCCESS); - } - } - - return (ciErr == CL_SUCCESS); -} - -bool OpenCLDevice::OpenCLProgram::compile_kernel(const string *debug_src) -{ - string source = get_program_source(kernel_file); - - if (debug_src) { - path_write_text(*debug_src, source); - } - - size_t source_len = source.size(); - const char *source_str = source.c_str(); - cl_int ciErr; - - program = clCreateProgramWithSource(device->cxContext, 1, &source_str, &source_len, &ciErr); - - if (ciErr != CL_SUCCESS) { - add_error(string("OpenCL program creation failed: ") + clewErrorString(ciErr)); - return false; - } - - double starttime = time_dt(); - add_log(string("Cycles: compiling OpenCL program ") + program_name + "...", false); - add_log(string("Build flags: ") + kernel_build_options, true); - - if (!build_kernel(debug_src)) - return false; - - double elapsed = time_dt() - starttime; - add_log( - string_printf("Kernel compilation of %s finished in %.2lfs.", program_name.c_str(), elapsed), - false); - - return true; -} - -static void escape_python_string(string &str) -{ - /* Escape string to be passed as a Python raw string with '' quotes'. 
*/ - string_replace(str, "'", "\'"); -} - -static int opencl_compile_process_limit() -{ - /* Limit number of concurrent processes compiling, with a heuristic based - * on total physical RAM and estimate of memory usage needed when compiling - * with all Cycles features enabled. - * - * This is somewhat arbitrary as we don't know the actual available RAM or - * how much the kernel compilation will needed depending on the features, but - * better than not limiting at all. */ - static const int64_t GB = 1024LL * 1024LL * 1024LL; - static const int64_t process_memory = 2 * GB; - static const int64_t base_memory = 2 * GB; - static const int64_t system_memory = system_physical_ram(); - static const int64_t process_limit = (system_memory - base_memory) / process_memory; - - return max((int)process_limit, 1); -} - -bool OpenCLDevice::OpenCLProgram::compile_separate(const string &clbin) -{ - /* Construct arguments. */ - vector<string> args; - args.push_back("--background"); - args.push_back("--factory-startup"); - args.push_back("--python-expr"); - - int device_platform_id = device->device_num; - string device_name = device->device_name; - string platform_name = device->platform_name; - string build_options = device->kernel_build_options(NULL) + kernel_build_options; - string kernel_file_escaped = kernel_file; - string clbin_escaped = clbin; - - escape_python_string(device_name); - escape_python_string(platform_name); - escape_python_string(build_options); - escape_python_string(kernel_file_escaped); - escape_python_string(clbin_escaped); - - args.push_back(string_printf( - "import _cycles; _cycles.opencl_compile(r'%d', r'%s', r'%s', r'%s', r'%s', r'%s')", - device_platform_id, - device_name.c_str(), - platform_name.c_str(), - build_options.c_str(), - kernel_file_escaped.c_str(), - clbin_escaped.c_str())); - - /* Limit number of concurrent processes compiling. 
*/ - static thread_counting_semaphore semaphore(opencl_compile_process_limit()); - semaphore.acquire(); - - /* Compile. */ - const double starttime = time_dt(); - add_log(string("Cycles: compiling OpenCL program ") + program_name + "...", false); - add_log(string("Build flags: ") + kernel_build_options, true); - const bool success = system_call_self(args); - const double elapsed = time_dt() - starttime; - - semaphore.release(); - - if (!success || !path_exists(clbin)) { - return false; - } - - add_log( - string_printf("Kernel compilation of %s finished in %.2lfs.", program_name.c_str(), elapsed), - false); - - return load_binary(clbin); -} - -/* Compile opencl kernel. This method is called from the _cycles Python - * module compile kernels. Parameters must match function above. */ -bool device_opencl_compile_kernel(const vector<string> ¶meters) -{ - int device_platform_id = std::stoi(parameters[0]); - const string &device_name = parameters[1]; - const string &platform_name = parameters[2]; - const string &build_options = parameters[3]; - const string &kernel_file = parameters[4]; - const string &binary_path = parameters[5]; - - if (clewInit() != CLEW_SUCCESS) { - return false; - } - - vector<OpenCLPlatformDevice> usable_devices; - OpenCLInfo::get_usable_devices(&usable_devices); - if (device_platform_id >= usable_devices.size()) { - return false; - } - - OpenCLPlatformDevice &platform_device = usable_devices[device_platform_id]; - if (platform_device.platform_name != platform_name || - platform_device.device_name != device_name) { - return false; - } - - cl_platform_id platform = platform_device.platform_id; - cl_device_id device = platform_device.device_id; - const cl_context_properties context_props[] = { - CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0, 0}; - - cl_int err; - cl_context context = clCreateContext(context_props, 1, &device, NULL, NULL, &err); - if (err != CL_SUCCESS) { - return false; - } - - string source = 
get_program_source(kernel_file); - size_t source_len = source.size(); - const char *source_str = source.c_str(); - cl_program program = clCreateProgramWithSource(context, 1, &source_str, &source_len, &err); - bool result = false; - - if (err == CL_SUCCESS) { - err = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL); - - if (err == CL_SUCCESS) { - size_t size = 0; - clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL); - if (size > 0) { - vector<uint8_t> binary(size); - uint8_t *bytes = &binary[0]; - clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(uint8_t *), &bytes, NULL); - result = path_write_binary(binary_path, binary); - } - } - clReleaseProgram(program); - } - - clReleaseContext(context); - - return result; -} - -bool OpenCLDevice::OpenCLProgram::load_binary(const string &clbin, const string *debug_src) -{ - /* read binary into memory */ - vector<uint8_t> binary; - - if (!path_read_binary(clbin, binary)) { - add_error(string_printf("OpenCL failed to read cached binary %s.", clbin.c_str())); - return false; - } - - /* create program */ - cl_int status, ciErr; - size_t size = binary.size(); - const uint8_t *bytes = &binary[0]; - - program = clCreateProgramWithBinary( - device->cxContext, 1, &device->cdDevice, &size, &bytes, &status, &ciErr); - - if (status != CL_SUCCESS || ciErr != CL_SUCCESS) { - add_error(string("OpenCL failed create program from cached binary ") + clbin + ": " + - clewErrorString(status) + " " + clewErrorString(ciErr)); - return false; - } - - if (!build_kernel(debug_src)) - return false; - - return true; -} - -bool OpenCLDevice::OpenCLProgram::save_binary(const string &clbin) -{ - size_t size = 0; - clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL); - - if (!size) - return false; - - vector<uint8_t> binary(size); - uint8_t *bytes = &binary[0]; - - clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(uint8_t *), &bytes, NULL); - - return path_write_binary(clbin, 
binary); -} - -bool OpenCLDevice::OpenCLProgram::load() -{ - loaded = false; - string device_md5 = device->device_md5_hash(kernel_build_options); - - /* Try to use cached kernel. */ - thread_scoped_lock cache_locker; - ustring cache_key(program_name + device_md5); - program = device->load_cached_kernel(cache_key, cache_locker); - if (!program) { - add_log(string("OpenCL program ") + program_name + " not found in cache.", true); - - /* need to create source to get md5 */ - string source = get_program_source(kernel_file); - - string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + - util_md5_string(source); - basename = path_cache_get(path_join("kernels", basename)); - string clbin = basename + ".clbin"; - - /* If binary kernel exists already, try use it. */ - if (path_exists(clbin) && load_binary(clbin)) { - /* Kernel loaded from binary, nothing to do. */ - add_log(string("Loaded program from ") + clbin + ".", true); - - /* Cache the program. */ - device->store_cached_kernel(program, cache_key, cache_locker); - } - else { - add_log(string("OpenCL program ") + program_name + " not found on disk.", true); - cache_locker.unlock(); - } - } - - if (program) { - create_kernels(); - loaded = true; - needs_compiling = false; - } - - return loaded; -} - -void OpenCLDevice::OpenCLProgram::compile() -{ - assert(device); - - string device_md5 = device->device_md5_hash(kernel_build_options); - - /* Try to use cached kernel. 
*/ - thread_scoped_lock cache_locker; - ustring cache_key(program_name + device_md5); - program = device->load_cached_kernel(cache_key, cache_locker); - - if (!program) { - - add_log(string("OpenCL program ") + program_name + " not found in cache.", true); - - /* need to create source to get md5 */ - string source = get_program_source(kernel_file); - - string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + - util_md5_string(source); - basename = path_cache_get(path_join("kernels", basename)); - string clbin = basename + ".clbin"; - - /* path to preprocessed source for debugging */ - string clsrc, *debug_src = NULL; - - if (OpenCLInfo::use_debug()) { - clsrc = basename + ".cl"; - debug_src = &clsrc; - } - - if (DebugFlags().running_inside_blender && compile_separate(clbin)) { - add_log(string("Built and loaded program from ") + clbin + ".", true); - loaded = true; - } - else { - if (DebugFlags().running_inside_blender) { - add_log(string("Separate-process building of ") + clbin + - " failed, will fall back to regular building.", - true); - } - - /* If does not exist or loading binary failed, compile kernel. */ - if (!compile_kernel(debug_src)) { - needs_compiling = false; - return; - } - - /* Save binary for reuse. */ - if (!save_binary(clbin)) { - add_log(string("Saving compiled OpenCL kernel to ") + clbin + " failed!", true); - } - } - - /* Cache the program. 
*/ - device->store_cached_kernel(program, cache_key, cache_locker); - } - - create_kernels(); - needs_compiling = false; - loaded = true; -} - -void OpenCLDevice::OpenCLProgram::create_kernels() -{ - for (map<ustring, cl_kernel>::iterator kernel = kernels.begin(); kernel != kernels.end(); - ++kernel) { - assert(kernel->second == NULL); - cl_int ciErr; - string name = "kernel_ocl_" + kernel->first.string(); - kernel->second = clCreateKernel(program, name.c_str(), &ciErr); - if (device->opencl_error(ciErr)) { - add_error(string("Error getting kernel ") + name + " from program " + program_name + ": " + - clewErrorString(ciErr)); - return; - } - } -} - -bool OpenCLDevice::OpenCLProgram::wait_for_availability() -{ - add_log(string("Waiting for availability of ") + program_name + ".", true); - while (needs_compiling) { - time_sleep(0.1); - } - return loaded; -} - -void OpenCLDevice::OpenCLProgram::report_error() -{ - /* If loaded is true, there was no error. */ - if (loaded) - return; - /* if use_stdout is true, the error was already reported. 
*/ - if (use_stdout) - return; - - cerr << error_msg << endl; - if (!compile_output.empty()) { - cerr << "OpenCL kernel build output for " << program_name << ":" << endl; - cerr << compile_output << endl; - } -} - -cl_kernel OpenCLDevice::OpenCLProgram::operator()() -{ - assert(kernels.size() == 1); - return kernels.begin()->second; -} - -cl_kernel OpenCLDevice::OpenCLProgram::operator()(ustring name) -{ - assert(kernels.count(name)); - return kernels[name]; -} - -cl_device_type OpenCLInfo::device_type() -{ - switch (DebugFlags().opencl.device_type) { - case DebugFlags::OpenCL::DEVICE_NONE: - return 0; - case DebugFlags::OpenCL::DEVICE_ALL: - return CL_DEVICE_TYPE_ALL; - case DebugFlags::OpenCL::DEVICE_DEFAULT: - return CL_DEVICE_TYPE_DEFAULT; - case DebugFlags::OpenCL::DEVICE_CPU: - return CL_DEVICE_TYPE_CPU; - case DebugFlags::OpenCL::DEVICE_GPU: - return CL_DEVICE_TYPE_GPU; - case DebugFlags::OpenCL::DEVICE_ACCELERATOR: - return CL_DEVICE_TYPE_ACCELERATOR; - default: - return CL_DEVICE_TYPE_ALL; - } -} - -bool OpenCLInfo::use_debug() -{ - return DebugFlags().opencl.debug; -} - -bool OpenCLInfo::device_supported(const string &platform_name, const cl_device_id device_id) -{ - cl_device_type device_type; - if (!get_device_type(device_id, &device_type)) { - return false; - } - string device_name; - if (!get_device_name(device_id, &device_name)) { - return false; - } - - int driver_major = 0; - int driver_minor = 0; - if (!get_driver_version(device_id, &driver_major, &driver_minor)) { - return false; - } - VLOG(3) << "OpenCL driver version " << driver_major << "." << driver_minor; - - if (getenv("CYCLES_OPENCL_TEST")) { - return true; - } - - /* Allow Intel GPUs on Intel OpenCL platform. */ - if (platform_name.find("Intel") != string::npos) { - if (device_type != CL_DEVICE_TYPE_GPU) { - /* OpenCL on Intel CPU is not an officially supported configuration. - * Use hybrid CPU+GPU rendering to utilize both GPU and CPU. 
*/ - return false; - } - -# ifdef __APPLE__ - /* Apple uses own framework, which can also put Iris onto AMD frame-work. - * This isn't supported configuration. */ - return false; -# else - if (device_name.find("Iris") != string::npos || device_name.find("Xe") != string::npos) { - return true; - } -# endif - } - - if (platform_name == "AMD Accelerated Parallel Processing" && - device_type == CL_DEVICE_TYPE_GPU) { - if (driver_major < 2236) { - VLOG(1) << "AMD driver version " << driver_major << "." << driver_minor << " not supported."; - return false; - } - const char *blacklist[] = {/* GCN 1 */ - "Tahiti", - "Pitcairn", - "Capeverde", - "Oland", - "Hainan", - NULL}; - for (int i = 0; blacklist[i] != NULL; i++) { - if (device_name == blacklist[i]) { - VLOG(1) << "AMD device " << device_name << " not supported"; - return false; - } - } - return true; - } - if (platform_name == "Apple" && device_type == CL_DEVICE_TYPE_GPU) { - return false; - } - return false; -} - -bool OpenCLInfo::platform_version_check(cl_platform_id platform, string *error) -{ - const int req_major = 1, req_minor = 1; - int major, minor; - char version[256]; - clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(version), &version, NULL); - if (sscanf(version, "OpenCL %d.%d", &major, &minor) < 2) { - if (error != NULL) { - *error = string_printf("OpenCL: failed to parse platform version string (%s).", version); - } - return false; - } - if (!((major == req_major && minor >= req_minor) || (major > req_major))) { - if (error != NULL) { - *error = string_printf( - "OpenCL: platform version 1.1 or later required, found %d.%d", major, minor); - } - return false; - } - if (error != NULL) { - *error = ""; - } - return true; -} - -bool OpenCLInfo::get_device_version(cl_device_id device, int *r_major, int *r_minor, string *error) -{ - char version[256]; - clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, sizeof(version), &version, NULL); - if (sscanf(version, "OpenCL C %d.%d", r_major, r_minor) < 2) { 
- if (error != NULL) { - *error = string_printf("OpenCL: failed to parse OpenCL C version string (%s).", version); - } - return false; - } - if (error != NULL) { - *error = ""; - } - return true; -} - -bool OpenCLInfo::device_version_check(cl_device_id device, string *error) -{ - const int req_major = 1, req_minor = 1; - int major, minor; - if (!get_device_version(device, &major, &minor, error)) { - return false; - } - - if (!((major == req_major && minor >= req_minor) || (major > req_major))) { - if (error != NULL) { - *error = string_printf("OpenCL: C version 1.1 or later required, found %d.%d", major, minor); - } - return false; - } - if (error != NULL) { - *error = ""; - } - return true; -} - -string OpenCLInfo::get_hardware_id(const string &platform_name, cl_device_id device_id) -{ - if (platform_name == "AMD Accelerated Parallel Processing" || platform_name == "Apple") { - /* Use cl_amd_device_topology extension. */ - cl_char topology[24]; - if (clGetDeviceInfo(device_id, 0x4037, sizeof(topology), topology, NULL) == CL_SUCCESS && - topology[0] == 1) { - return string_printf("%02x:%02x.%01x", - (unsigned int)topology[21], - (unsigned int)topology[22], - (unsigned int)topology[23]); - } - } - else if (platform_name == "NVIDIA CUDA") { - /* Use two undocumented options of the cl_nv_device_attribute_query extension. */ - cl_int bus_id, slot_id; - if (clGetDeviceInfo(device_id, 0x4008, sizeof(cl_int), &bus_id, NULL) == CL_SUCCESS && - clGetDeviceInfo(device_id, 0x4009, sizeof(cl_int), &slot_id, NULL) == CL_SUCCESS) { - return string_printf("%02x:%02x.%01x", - (unsigned int)(bus_id), - (unsigned int)(slot_id >> 3), - (unsigned int)(slot_id & 0x7)); - } - } - /* No general way to get a hardware ID from OpenCL => give up. 
*/ - return ""; -} - -void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices) -{ - const cl_device_type device_type = OpenCLInfo::device_type(); - static bool first_time = true; -# define FIRST_VLOG(severity) \ - if (first_time) \ - VLOG(severity) - - usable_devices->clear(); - - if (device_type == 0) { - FIRST_VLOG(2) << "OpenCL devices are forced to be disabled."; - first_time = false; - return; - } - - cl_int error; - vector<cl_device_id> device_ids; - vector<cl_platform_id> platform_ids; - - /* Get platforms. */ - if (!get_platforms(&platform_ids, &error)) { - FIRST_VLOG(2) << "Error fetching platforms:" << string(clewErrorString(error)); - first_time = false; - return; - } - if (platform_ids.size() == 0) { - FIRST_VLOG(2) << "No OpenCL platforms were found."; - first_time = false; - return; - } - /* Devices are numbered consecutively across platforms. */ - for (int platform = 0; platform < platform_ids.size(); platform++) { - cl_platform_id platform_id = platform_ids[platform]; - string platform_name; - if (!get_platform_name(platform_id, &platform_name)) { - FIRST_VLOG(2) << "Failed to get platform name, ignoring."; - continue; - } - FIRST_VLOG(2) << "Enumerating devices for platform " << platform_name << "."; - if (!platform_version_check(platform_id)) { - FIRST_VLOG(2) << "Ignoring platform " << platform_name - << " due to too old compiler version."; - continue; - } - if (!get_platform_devices(platform_id, device_type, &device_ids, &error)) { - FIRST_VLOG(2) << "Ignoring platform " << platform_name - << ", failed to fetch of devices: " << string(clewErrorString(error)); - continue; - } - if (device_ids.size() == 0) { - FIRST_VLOG(2) << "Ignoring platform " << platform_name << ", it has no devices."; - continue; - } - for (int num = 0; num < device_ids.size(); num++) { - const cl_device_id device_id = device_ids[num]; - string device_name; - if (!get_device_name(device_id, &device_name, &error)) { - FIRST_VLOG(2) << "Failed to fetch 
device name: " << string(clewErrorString(error)) - << ", ignoring."; - continue; - } - if (!device_version_check(device_id)) { - FIRST_VLOG(2) << "Ignoring device " << device_name << " due to old compiler version."; - continue; - } - if (device_supported(platform_name, device_id)) { - cl_device_type device_type; - if (!get_device_type(device_id, &device_type, &error)) { - FIRST_VLOG(2) << "Ignoring device " << device_name - << ", failed to fetch device type:" << string(clewErrorString(error)); - continue; - } - string readable_device_name = get_readable_device_name(device_id); - if (readable_device_name != device_name) { - FIRST_VLOG(2) << "Using more readable device name: " << readable_device_name; - } - FIRST_VLOG(2) << "Adding new device " << readable_device_name << "."; - string hardware_id = get_hardware_id(platform_name, device_id); - string device_extensions = get_device_extensions(device_id); - usable_devices->push_back(OpenCLPlatformDevice(platform_id, - platform_name, - device_id, - device_type, - readable_device_name, - hardware_id, - device_extensions)); - } - else { - FIRST_VLOG(2) << "Ignoring device " << device_name << ", not officially supported yet."; - } - } - } - first_time = false; -} - -bool OpenCLInfo::get_platforms(vector<cl_platform_id> *platform_ids, cl_int *error) -{ - /* Reset from possible previous state. */ - platform_ids->resize(0); - cl_uint num_platforms; - if (!get_num_platforms(&num_platforms, error)) { - return false; - } - /* Get actual platforms. 
*/ - cl_int err; - platform_ids->resize(num_platforms); - if ((err = clGetPlatformIDs(num_platforms, &platform_ids->at(0), NULL)) != CL_SUCCESS) { - if (error != NULL) { - *error = err; - } - return false; - } - if (error != NULL) { - *error = CL_SUCCESS; - } - return true; -} - -vector<cl_platform_id> OpenCLInfo::get_platforms() -{ - vector<cl_platform_id> platform_ids; - get_platforms(&platform_ids); - return platform_ids; -} - -bool OpenCLInfo::get_num_platforms(cl_uint *num_platforms, cl_int *error) -{ - cl_int err; - if ((err = clGetPlatformIDs(0, NULL, num_platforms)) != CL_SUCCESS) { - if (error != NULL) { - *error = err; - } - *num_platforms = 0; - return false; - } - if (error != NULL) { - *error = CL_SUCCESS; - } - return true; -} - -cl_uint OpenCLInfo::get_num_platforms() -{ - cl_uint num_platforms; - if (!get_num_platforms(&num_platforms)) { - return 0; - } - return num_platforms; -} - -bool OpenCLInfo::get_platform_name(cl_platform_id platform_id, string *platform_name) -{ - char buffer[256]; - if (clGetPlatformInfo(platform_id, CL_PLATFORM_NAME, sizeof(buffer), &buffer, NULL) != - CL_SUCCESS) { - *platform_name = ""; - return false; - } - *platform_name = buffer; - return true; -} - -string OpenCLInfo::get_platform_name(cl_platform_id platform_id) -{ - string platform_name; - if (!get_platform_name(platform_id, &platform_name)) { - return ""; - } - return platform_name; -} - -bool OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id, - cl_device_type device_type, - cl_uint *num_devices, - cl_int *error) -{ - cl_int err; - if ((err = clGetDeviceIDs(platform_id, device_type, 0, NULL, num_devices)) != CL_SUCCESS) { - if (error != NULL) { - *error = err; - } - *num_devices = 0; - return false; - } - if (error != NULL) { - *error = CL_SUCCESS; - } - return true; -} - -cl_uint OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id, - cl_device_type device_type) -{ - cl_uint num_devices; - if (!get_num_platform_devices(platform_id, 
device_type, &num_devices)) { - return 0; - } - return num_devices; -} - -bool OpenCLInfo::get_platform_devices(cl_platform_id platform_id, - cl_device_type device_type, - vector<cl_device_id> *device_ids, - cl_int *error) -{ - /* Reset from possible previous state. */ - device_ids->resize(0); - /* Get number of devices to pre-allocate memory. */ - cl_uint num_devices; - if (!get_num_platform_devices(platform_id, device_type, &num_devices, error)) { - return false; - } - /* Get actual device list. */ - device_ids->resize(num_devices); - cl_int err; - if ((err = clGetDeviceIDs(platform_id, device_type, num_devices, &device_ids->at(0), NULL)) != - CL_SUCCESS) { - if (error != NULL) { - *error = err; - } - return false; - } - if (error != NULL) { - *error = CL_SUCCESS; - } - return true; -} - -vector<cl_device_id> OpenCLInfo::get_platform_devices(cl_platform_id platform_id, - cl_device_type device_type) -{ - vector<cl_device_id> devices; - get_platform_devices(platform_id, device_type, &devices); - return devices; -} - -bool OpenCLInfo::get_device_name(cl_device_id device_id, string *device_name, cl_int *error) -{ - char buffer[1024]; - cl_int err; - if ((err = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(buffer), &buffer, NULL)) != - CL_SUCCESS) { - if (error != NULL) { - *error = err; - } - *device_name = ""; - return false; - } - if (error != NULL) { - *error = CL_SUCCESS; - } - *device_name = buffer; - return true; -} - -string OpenCLInfo::get_device_name(cl_device_id device_id) -{ - string device_name; - if (!get_device_name(device_id, &device_name)) { - return ""; - } - return device_name; -} - -bool OpenCLInfo::get_device_extensions(cl_device_id device_id, - string *device_extensions, - cl_int *error) -{ - size_t extension_length = 0; - cl_int err; - /* Determine the size of the extension string. 
*/ - if ((err = clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, 0, 0, &extension_length)) != - CL_SUCCESS) { - if (error != NULL) { - *error = err; - } - *device_extensions = ""; - return false; - } - vector<char> buffer(extension_length); - if ((err = clGetDeviceInfo( - device_id, CL_DEVICE_EXTENSIONS, extension_length, buffer.data(), NULL)) != - CL_SUCCESS) { - if (error != NULL) { - *error = err; - } - *device_extensions = ""; - return false; - } - if (error != NULL) { - *error = CL_SUCCESS; - } - *device_extensions = string(buffer.data()); - return true; -} - -string OpenCLInfo::get_device_extensions(cl_device_id device_id) -{ - string device_extensions; - if (!get_device_extensions(device_id, &device_extensions)) { - return ""; - } - return device_extensions; -} - -bool OpenCLInfo::get_device_type(cl_device_id device_id, - cl_device_type *device_type, - cl_int *error) -{ - cl_int err; - if ((err = clGetDeviceInfo( - device_id, CL_DEVICE_TYPE, sizeof(cl_device_type), device_type, NULL)) != CL_SUCCESS) { - if (error != NULL) { - *error = err; - } - *device_type = 0; - return false; - } - if (error != NULL) { - *error = CL_SUCCESS; - } - return true; -} - -cl_device_type OpenCLInfo::get_device_type(cl_device_id device_id) -{ - cl_device_type device_type; - if (!get_device_type(device_id, &device_type)) { - return 0; - } - return device_type; -} - -string OpenCLInfo::get_readable_device_name(cl_device_id device_id) -{ - string name = ""; - char board_name[1024]; - size_t length = 0; - if (clGetDeviceInfo( - device_id, CL_DEVICE_BOARD_NAME_AMD, sizeof(board_name), &board_name, &length) == - CL_SUCCESS) { - if (length != 0 && board_name[0] != '\0') { - name = board_name; - } - } - - /* Fallback to standard device name API. */ - if (name.empty()) { - name = get_device_name(device_id); - } - - /* Special exception for AMD Vega, need to be able to tell - * Vega 56 from 64 apart. 
- */ - if (name == "Radeon RX Vega") { - cl_int max_compute_units = 0; - if (clGetDeviceInfo(device_id, - CL_DEVICE_MAX_COMPUTE_UNITS, - sizeof(max_compute_units), - &max_compute_units, - NULL) == CL_SUCCESS) { - name += " " + to_string(max_compute_units); - } - } - - /* Distinguish from our native CPU device. */ - if (get_device_type(device_id) & CL_DEVICE_TYPE_CPU) { - name += " (OpenCL)"; - } - - return name; -} - -bool OpenCLInfo::get_driver_version(cl_device_id device_id, int *major, int *minor, cl_int *error) -{ - char buffer[1024]; - cl_int err; - if ((err = clGetDeviceInfo(device_id, CL_DRIVER_VERSION, sizeof(buffer), &buffer, NULL)) != - CL_SUCCESS) { - if (error != NULL) { - *error = err; - } - return false; - } - if (error != NULL) { - *error = CL_SUCCESS; - } - if (sscanf(buffer, "%d.%d", major, minor) < 2) { - VLOG(1) << string_printf("OpenCL: failed to parse driver version string (%s).", buffer); - return false; - } - return true; -} - -int OpenCLInfo::mem_sub_ptr_alignment(cl_device_id device_id) -{ - int base_align_bits; - if (clGetDeviceInfo( - device_id, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(int), &base_align_bits, NULL) == - CL_SUCCESS) { - return base_align_bits / 8; - } - return 1; -} - -CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/device/optix/device.cpp b/intern/cycles/device/optix/device.cpp new file mode 100644 index 00000000000..13f23bd229a --- /dev/null +++ b/intern/cycles/device/optix/device.cpp @@ -0,0 +1,105 @@ +/* + * Copyright 2019, NVIDIA Corporation. + * Copyright 2019, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device/optix/device.h" + +#include "device/cuda/device.h" +#include "device/optix/device_impl.h" +#include "util/util_logging.h" + +#ifdef WITH_OPTIX +# include <optix_function_table_definition.h> +#endif + +CCL_NAMESPACE_BEGIN + +bool device_optix_init() +{ +#ifdef WITH_OPTIX + if (g_optixFunctionTable.optixDeviceContextCreate != NULL) { + /* Already initialized function table. */ + return true; + } + + /* Need to initialize CUDA as well. */ + if (!device_cuda_init()) { + return false; + } + + const OptixResult result = optixInit(); + + if (result == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) { + VLOG(1) << "OptiX initialization failed because the installed NVIDIA driver is too old. " + "Please update to the latest driver first!"; + return false; + } + else if (result != OPTIX_SUCCESS) { + VLOG(1) << "OptiX initialization failed with error code " << (unsigned int)result; + return false; + } + + /* Loaded OptiX successfully! */ + return true; +#else + return false; +#endif +} + +void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices) +{ +#ifdef WITH_OPTIX + devices.reserve(cuda_devices.size()); + + /* Simply add all supported CUDA devices as OptiX devices again. */ + for (DeviceInfo info : cuda_devices) { + assert(info.type == DEVICE_CUDA); + + int major; + cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, info.num); + if (major < 5) { + /* Only Maxwell and up are supported by OptiX. 
*/ + continue; + } + + info.type = DEVICE_OPTIX; + info.id += "_OptiX"; + info.denoisers |= DENOISER_OPTIX; + + devices.push_back(info); + } +#else + (void)cuda_devices; + (void)devices; +#endif +} + +Device *device_optix_create(const DeviceInfo &info, Stats &stats, Profiler &profiler) +{ +#ifdef WITH_OPTIX + return new OptiXDevice(info, stats, profiler); +#else + (void)info; + (void)stats; + (void)profiler; + + LOG(FATAL) << "Request to create OptiX device without compiled-in support. Should never happen."; + + return nullptr; +#endif +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/optix/device.h b/intern/cycles/device/optix/device.h new file mode 100644 index 00000000000..29fa729c2e4 --- /dev/null +++ b/intern/cycles/device/optix/device.h @@ -0,0 +1,35 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "util/util_string.h" +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +class Device; +class DeviceInfo; +class Profiler; +class Stats; + +bool device_optix_init(); + +Device *device_optix_create(const DeviceInfo &info, Stats &stats, Profiler &profiler); + +void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices); + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp new file mode 100644 index 00000000000..cd16b8c9f01 --- /dev/null +++ b/intern/cycles/device/optix/device_impl.cpp @@ -0,0 +1,1573 @@ +/* + * Copyright 2019, NVIDIA Corporation. + * Copyright 2019, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef WITH_OPTIX + +# include "device/optix/device_impl.h" + +# include "bvh/bvh.h" +# include "bvh/bvh_optix.h" +# include "integrator/pass_accessor_gpu.h" +# include "render/buffers.h" +# include "render/hair.h" +# include "render/mesh.h" +# include "render/object.h" +# include "render/pass.h" +# include "render/scene.h" + +# include "util/util_debug.h" +# include "util/util_logging.h" +# include "util/util_md5.h" +# include "util/util_path.h" +# include "util/util_progress.h" +# include "util/util_time.h" + +# undef __KERNEL_CPU__ +# define __KERNEL_OPTIX__ +# include "kernel/device/optix/globals.h" + +CCL_NAMESPACE_BEGIN + +OptiXDevice::Denoiser::Denoiser(OptiXDevice *device) + : device(device), queue(device), state(device, "__denoiser_state") +{ +} + +OptiXDevice::Denoiser::~Denoiser() +{ + const CUDAContextScope scope(device); + if (optix_denoiser != nullptr) { + optixDenoiserDestroy(optix_denoiser); + } +} + +OptiXDevice::OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler) + : CUDADevice(info, stats, profiler), + sbt_data(this, "__sbt", MEM_READ_ONLY), + launch_params(this, "__params"), + denoiser_(this) +{ + /* Make the CUDA context current. */ + if (!cuContext) { + /* Do not initialize if CUDA context creation failed already. */ + return; + } + const CUDAContextScope scope(this); + + /* Create OptiX context for this device. */ + OptixDeviceContextOptions options = {}; +# ifdef WITH_CYCLES_LOGGING + options.logCallbackLevel = 4; /* Fatal = 1, Error = 2, Warning = 3, Print = 4. 
*/ + options.logCallbackFunction = [](unsigned int level, const char *, const char *message, void *) { + switch (level) { + case 1: + LOG_IF(FATAL, VLOG_IS_ON(1)) << message; + break; + case 2: + LOG_IF(ERROR, VLOG_IS_ON(1)) << message; + break; + case 3: + LOG_IF(WARNING, VLOG_IS_ON(1)) << message; + break; + case 4: + LOG_IF(INFO, VLOG_IS_ON(1)) << message; + break; + } + }; +# endif + if (DebugFlags().optix.use_debug) { + options.validationMode = OPTIX_DEVICE_CONTEXT_VALIDATION_MODE_ALL; + } + optix_assert(optixDeviceContextCreate(cuContext, &options, &context)); +# ifdef WITH_CYCLES_LOGGING + optix_assert(optixDeviceContextSetLogCallback( + context, options.logCallbackFunction, options.logCallbackData, options.logCallbackLevel)); +# endif + + /* Fix weird compiler bug that assigns wrong size. */ + launch_params.data_elements = sizeof(KernelParamsOptiX); + + /* Allocate launch parameter buffer memory on device. */ + launch_params.alloc_to_device(1); +} + +OptiXDevice::~OptiXDevice() +{ + /* Make CUDA context current. */ + const CUDAContextScope scope(this); + + free_bvh_memory_delayed(); + + sbt_data.free(); + texture_info.free(); + launch_params.free(); + + /* Unload modules. */ + if (optix_module != NULL) { + optixModuleDestroy(optix_module); + } + for (unsigned int i = 0; i < 2; ++i) { + if (builtin_modules[i] != NULL) { + optixModuleDestroy(builtin_modules[i]); + } + } + for (unsigned int i = 0; i < NUM_PIPELINES; ++i) { + if (pipelines[i] != NULL) { + optixPipelineDestroy(pipelines[i]); + } + } + + optixDeviceContextDestroy(context); +} + +unique_ptr<DeviceQueue> OptiXDevice::gpu_queue_create() +{ + return make_unique<OptiXDeviceQueue>(this); +} + +BVHLayoutMask OptiXDevice::get_bvh_layout_mask() const +{ + /* OptiX has its own internal acceleration structure format. 
*/ + return BVH_LAYOUT_OPTIX; +} + +string OptiXDevice::compile_kernel_get_common_cflags(const uint kernel_features) +{ + string common_cflags = CUDADevice::compile_kernel_get_common_cflags(kernel_features); + + /* Add OptiX SDK include directory to include paths. */ + const char *optix_sdk_path = getenv("OPTIX_ROOT_DIR"); + if (optix_sdk_path) { + common_cflags += string_printf(" -I\"%s/include\"", optix_sdk_path); + } + + /* Specialization for shader raytracing. */ + if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) { + common_cflags += " --keep-device-functions"; + } + + return common_cflags; +} + +bool OptiXDevice::load_kernels(const uint kernel_features) +{ + if (have_error()) { + /* Abort early if context creation failed already. */ + return false; + } + + /* Load CUDA modules because we need some of the utility kernels. */ + if (!CUDADevice::load_kernels(kernel_features)) { + return false; + } + + /* Skip creating OptiX module if only doing denoising. */ + if (!(kernel_features & (KERNEL_FEATURE_PATH_TRACING | KERNEL_FEATURE_BAKING))) { + return true; + } + + const CUDAContextScope scope(this); + + /* Unload existing OptiX module and pipelines first. */ + if (optix_module != NULL) { + optixModuleDestroy(optix_module); + optix_module = NULL; + } + for (unsigned int i = 0; i < 2; ++i) { + if (builtin_modules[i] != NULL) { + optixModuleDestroy(builtin_modules[i]); + builtin_modules[i] = NULL; + } + } + for (unsigned int i = 0; i < NUM_PIPELINES; ++i) { + if (pipelines[i] != NULL) { + optixPipelineDestroy(pipelines[i]); + pipelines[i] = NULL; + } + } + + OptixModuleCompileOptions module_options = {}; + module_options.maxRegisterCount = 0; /* Do not set an explicit register limit. 
*/ + + if (DebugFlags().optix.use_debug) { + module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_0; + module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL; + } + else { + module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3; + module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO; + } + + module_options.boundValues = nullptr; + module_options.numBoundValues = 0; + + OptixPipelineCompileOptions pipeline_options = {}; + /* Default to no motion blur and two-level graph, since it is the fastest option. */ + pipeline_options.usesMotionBlur = false; + pipeline_options.traversableGraphFlags = + OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING; + pipeline_options.numPayloadValues = 6; + pipeline_options.numAttributeValues = 2; /* u, v */ + pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE; + pipeline_options.pipelineLaunchParamsVariableName = "__params"; /* See globals.h */ + + pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE; + if (kernel_features & KERNEL_FEATURE_HAIR) { + if (kernel_features & KERNEL_FEATURE_HAIR_THICK) { + pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE; + } + else + pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM; + } + + /* Keep track of whether motion blur is enabled, so to enable/disable motion in BVH builds + * This is necessary since objects may be reported to have motion if the Vector pass is + * active, but may still need to be rendered without motion blur if that isn't active as well. */ + motion_blur = (kernel_features & KERNEL_FEATURE_OBJECT_MOTION) != 0; + + if (motion_blur) { + pipeline_options.usesMotionBlur = true; + /* Motion blur can insert motion transforms into the traversal graph. + * It is no longer a two-level graph then, so need to set flags to allow any configuration. 
*/ + pipeline_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY; + } + + { /* Load and compile PTX module with OptiX kernels. */ + string ptx_data, ptx_filename = path_get((kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ? + "lib/kernel_optix_shader_raytrace.ptx" : + "lib/kernel_optix.ptx"); + if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) { + if (!getenv("OPTIX_ROOT_DIR")) { + set_error( + "Missing OPTIX_ROOT_DIR environment variable (which must be set with the path to " + "the Optix SDK to be able to compile Optix kernels on demand)."); + return false; + } + ptx_filename = compile_kernel( + kernel_features, + (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ? "kernel_shader_raytrace" : "kernel", + "optix", + true); + } + if (ptx_filename.empty() || !path_read_text(ptx_filename, ptx_data)) { + set_error(string_printf("Failed to load OptiX kernel from '%s'", ptx_filename.c_str())); + return false; + } + + const OptixResult result = optixModuleCreateFromPTX(context, + &module_options, + &pipeline_options, + ptx_data.data(), + ptx_data.size(), + nullptr, + 0, + &optix_module); + if (result != OPTIX_SUCCESS) { + set_error(string_printf("Failed to load OptiX kernel from '%s' (%s)", + ptx_filename.c_str(), + optixGetErrorName(result))); + return false; + } + } + + /* Create program groups. */ + OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {}; + OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {}; + OptixProgramGroupOptions group_options = {}; /* There are no options currently. 
*/ + group_descs[PG_RGEN_INTERSECT_CLOSEST].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_RGEN_INTERSECT_CLOSEST].raygen.module = optix_module; + group_descs[PG_RGEN_INTERSECT_CLOSEST].raygen.entryFunctionName = + "__raygen__kernel_optix_integrator_intersect_closest"; + group_descs[PG_RGEN_INTERSECT_SHADOW].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_RGEN_INTERSECT_SHADOW].raygen.module = optix_module; + group_descs[PG_RGEN_INTERSECT_SHADOW].raygen.entryFunctionName = + "__raygen__kernel_optix_integrator_intersect_shadow"; + group_descs[PG_RGEN_INTERSECT_SUBSURFACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_RGEN_INTERSECT_SUBSURFACE].raygen.module = optix_module; + group_descs[PG_RGEN_INTERSECT_SUBSURFACE].raygen.entryFunctionName = + "__raygen__kernel_optix_integrator_intersect_subsurface"; + group_descs[PG_RGEN_INTERSECT_VOLUME_STACK].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_RGEN_INTERSECT_VOLUME_STACK].raygen.module = optix_module; + group_descs[PG_RGEN_INTERSECT_VOLUME_STACK].raygen.entryFunctionName = + "__raygen__kernel_optix_integrator_intersect_volume_stack"; + group_descs[PG_MISS].kind = OPTIX_PROGRAM_GROUP_KIND_MISS; + group_descs[PG_MISS].miss.module = optix_module; + group_descs[PG_MISS].miss.entryFunctionName = "__miss__kernel_optix_miss"; + group_descs[PG_HITD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; + group_descs[PG_HITD].hitgroup.moduleCH = optix_module; + group_descs[PG_HITD].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit"; + group_descs[PG_HITD].hitgroup.moduleAH = optix_module; + group_descs[PG_HITD].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_visibility_test"; + group_descs[PG_HITS].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; + group_descs[PG_HITS].hitgroup.moduleAH = optix_module; + group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit"; + + if (kernel_features & KERNEL_FEATURE_HAIR) { + if (kernel_features & 
KERNEL_FEATURE_HAIR_THICK) { + /* Built-in thick curve intersection. */ + OptixBuiltinISOptions builtin_options = {}; + builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE; + builtin_options.usesMotionBlur = false; + + optix_assert(optixBuiltinISModuleGet( + context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[0])); + + group_descs[PG_HITD].hitgroup.moduleIS = builtin_modules[0]; + group_descs[PG_HITD].hitgroup.entryFunctionNameIS = nullptr; + group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0]; + group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr; + + if (motion_blur) { + builtin_options.usesMotionBlur = true; + + optix_assert(optixBuiltinISModuleGet( + context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[1])); + + group_descs[PG_HITD_MOTION] = group_descs[PG_HITD]; + group_descs[PG_HITD_MOTION].hitgroup.moduleIS = builtin_modules[1]; + group_descs[PG_HITS_MOTION] = group_descs[PG_HITS]; + group_descs[PG_HITS_MOTION].hitgroup.moduleIS = builtin_modules[1]; + } + } + else { + /* Custom ribbon intersection. */ + group_descs[PG_HITD].hitgroup.moduleIS = optix_module; + group_descs[PG_HITS].hitgroup.moduleIS = optix_module; + group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon"; + group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon"; + } + } + + if (kernel_features & (KERNEL_FEATURE_SUBSURFACE | KERNEL_FEATURE_NODE_RAYTRACE)) { + /* Add hit group for local intersections. */ + group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; + group_descs[PG_HITL].hitgroup.moduleAH = optix_module; + group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit"; + } + + /* Shader raytracing replaces some functions with direct callables. 
*/ + if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) { + group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.module = optix_module; + group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.entryFunctionName = + "__raygen__kernel_optix_integrator_shade_surface_raytrace"; + group_descs[PG_CALL_SVM_AO].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; + group_descs[PG_CALL_SVM_AO].callables.moduleDC = optix_module; + group_descs[PG_CALL_SVM_AO].callables.entryFunctionNameDC = "__direct_callable__svm_node_ao"; + group_descs[PG_CALL_SVM_BEVEL].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; + group_descs[PG_CALL_SVM_BEVEL].callables.moduleDC = optix_module; + group_descs[PG_CALL_SVM_BEVEL].callables.entryFunctionNameDC = + "__direct_callable__svm_node_bevel"; + group_descs[PG_CALL_AO_PASS].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; + group_descs[PG_CALL_AO_PASS].callables.moduleDC = optix_module; + group_descs[PG_CALL_AO_PASS].callables.entryFunctionNameDC = "__direct_callable__ao_pass"; + } + + optix_assert(optixProgramGroupCreate( + context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups)); + + /* Get program stack sizes. */ + OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {}; + /* Set up SBT, which in this case is used only to select between different programs. */ + sbt_data.alloc(NUM_PROGRAM_GROUPS); + memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS); + for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) { + optix_assert(optixSbtRecordPackHeader(groups[i], &sbt_data[i])); + optix_assert(optixProgramGroupGetStackSize(groups[i], &stack_size[i])); + } + sbt_data.copy_to_device(); /* Upload SBT to device. */ + + /* Calculate maximum trace continuation stack size. */ + unsigned int trace_css = stack_size[PG_HITD].cssCH; + /* This is based on the maximum of closest-hit and any-hit/intersection programs. 
*/ + trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH); + trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH); + trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH); + trace_css = std::max(trace_css, + stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH); + trace_css = std::max(trace_css, + stack_size[PG_HITS_MOTION].cssIS + stack_size[PG_HITS_MOTION].cssAH); + + OptixPipelineLinkOptions link_options = {}; + link_options.maxTraceDepth = 1; + + if (DebugFlags().optix.use_debug) { + link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL; + } + else { + link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO; + } + + if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) { + /* Create shader raytracing pipeline. */ + vector<OptixProgramGroup> pipeline_groups; + pipeline_groups.reserve(NUM_PROGRAM_GROUPS); + pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_RAYTRACE]); + pipeline_groups.push_back(groups[PG_MISS]); + pipeline_groups.push_back(groups[PG_HITD]); + pipeline_groups.push_back(groups[PG_HITS]); + pipeline_groups.push_back(groups[PG_HITL]); + if (motion_blur) { + pipeline_groups.push_back(groups[PG_HITD_MOTION]); + pipeline_groups.push_back(groups[PG_HITS_MOTION]); + } + pipeline_groups.push_back(groups[PG_CALL_SVM_AO]); + pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]); + + optix_assert(optixPipelineCreate(context, + &pipeline_options, + &link_options, + pipeline_groups.data(), + pipeline_groups.size(), + nullptr, + 0, + &pipelines[PIP_SHADE_RAYTRACE])); + + /* Combine ray generation and trace continuation stack size. */ + const unsigned int css = stack_size[PG_RGEN_SHADE_SURFACE_RAYTRACE].cssRG + + link_options.maxTraceDepth * trace_css; + const unsigned int dss = std::max(stack_size[PG_CALL_SVM_AO].dssDC, + stack_size[PG_CALL_SVM_BEVEL].dssDC); + + /* Set stack size depending on pipeline options. 
*/ + optix_assert(optixPipelineSetStackSize( + pipelines[PIP_SHADE_RAYTRACE], 0, dss, css, motion_blur ? 3 : 2)); + } + + { /* Create intersection-only pipeline. */ + vector<OptixProgramGroup> pipeline_groups; + pipeline_groups.reserve(NUM_PROGRAM_GROUPS); + pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_CLOSEST]); + pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SHADOW]); + pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SUBSURFACE]); + pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_VOLUME_STACK]); + pipeline_groups.push_back(groups[PG_MISS]); + pipeline_groups.push_back(groups[PG_HITD]); + pipeline_groups.push_back(groups[PG_HITS]); + pipeline_groups.push_back(groups[PG_HITL]); + if (motion_blur) { + pipeline_groups.push_back(groups[PG_HITD_MOTION]); + pipeline_groups.push_back(groups[PG_HITS_MOTION]); + } + + optix_assert(optixPipelineCreate(context, + &pipeline_options, + &link_options, + pipeline_groups.data(), + pipeline_groups.size(), + nullptr, + 0, + &pipelines[PIP_INTERSECT])); + + /* Calculate continuation stack size based on the maximum of all ray generation stack sizes. */ + const unsigned int css = + std::max(stack_size[PG_RGEN_INTERSECT_CLOSEST].cssRG, + std::max(stack_size[PG_RGEN_INTERSECT_SHADOW].cssRG, + std::max(stack_size[PG_RGEN_INTERSECT_SUBSURFACE].cssRG, + stack_size[PG_RGEN_INTERSECT_VOLUME_STACK].cssRG))) + + link_options.maxTraceDepth * trace_css; + + optix_assert( + optixPipelineSetStackSize(pipelines[PIP_INTERSECT], 0, 0, css, motion_blur ? 3 : 2)); + } + + /* Clean up program group objects. */ + for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) { + optixProgramGroupDestroy(groups[i]); + } + + return true; +} + +/* -------------------------------------------------------------------- + * Buffer denoising. 
+ */ + +class OptiXDevice::DenoiseContext { + public: + explicit DenoiseContext(OptiXDevice *device, const DeviceDenoiseTask &task) + : denoise_params(task.params), + render_buffers(task.render_buffers), + buffer_params(task.buffer_params), + guiding_buffer(device, "denoiser guiding passes buffer"), + num_samples(task.num_samples) + { + num_input_passes = 1; + if (denoise_params.use_pass_albedo) { + num_input_passes += 1; + use_pass_albedo = true; + pass_denoising_albedo = buffer_params.get_pass_offset(PASS_DENOISING_ALBEDO); + if (denoise_params.use_pass_normal) { + num_input_passes += 1; + use_pass_normal = true; + pass_denoising_normal = buffer_params.get_pass_offset(PASS_DENOISING_NORMAL); + } + } + + const int num_guiding_passes = num_input_passes - 1; + + if (num_guiding_passes) { + if (task.allow_inplace_modification) { + guiding_params.device_pointer = render_buffers->buffer.device_pointer; + + guiding_params.pass_albedo = pass_denoising_albedo; + guiding_params.pass_normal = pass_denoising_normal; + + guiding_params.stride = buffer_params.stride; + guiding_params.pass_stride = buffer_params.pass_stride; + } + else { + guiding_params.pass_stride = 0; + if (use_pass_albedo) { + guiding_params.pass_albedo = guiding_params.pass_stride; + guiding_params.pass_stride += 3; + } + if (use_pass_normal) { + guiding_params.pass_normal = guiding_params.pass_stride; + guiding_params.pass_stride += 3; + } + + guiding_params.stride = buffer_params.width; + + guiding_buffer.alloc_to_device(buffer_params.width * buffer_params.height * + guiding_params.pass_stride); + guiding_params.device_pointer = guiding_buffer.device_pointer; + } + } + + pass_sample_count = buffer_params.get_pass_offset(PASS_SAMPLE_COUNT); + } + + const DenoiseParams &denoise_params; + + RenderBuffers *render_buffers = nullptr; + const BufferParams &buffer_params; + + /* Device-side storage of the guiding passes. 
*/ + device_only_memory<float> guiding_buffer; + + struct { + device_ptr device_pointer = 0; + + /* NOTE: Are only initialized when the corresponding guiding pass is enabled. */ + int pass_albedo = PASS_UNUSED; + int pass_normal = PASS_UNUSED; + + int stride = -1; + int pass_stride = -1; + } guiding_params; + + /* Number of input passes. Including the color and extra auxillary passes. */ + int num_input_passes = 0; + bool use_pass_albedo = false; + bool use_pass_normal = false; + + int num_samples = 0; + + int pass_sample_count = PASS_UNUSED; + + /* NOTE: Are only initialized when the corresponding guiding pass is enabled. */ + int pass_denoising_albedo = PASS_UNUSED; + int pass_denoising_normal = PASS_UNUSED; + + /* For passes which don't need albedo channel for denoising we replace the actual albedo with + * the (0.5, 0.5, 0.5). This flag indicates that the real albedo pass has been replaced with + * the fake values and denoising of passes which do need albedo can no longer happen. */ + bool albedo_replaced_with_fake = false; +}; + +class OptiXDevice::DenoisePass { + public: + DenoisePass(const PassType type, const BufferParams &buffer_params) : type(type) + { + noisy_offset = buffer_params.get_pass_offset(type, PassMode::NOISY); + denoised_offset = buffer_params.get_pass_offset(type, PassMode::DENOISED); + + const PassInfo pass_info = Pass::get_info(type); + num_components = pass_info.num_components; + use_compositing = pass_info.use_compositing; + use_denoising_albedo = pass_info.use_denoising_albedo; + } + + PassType type; + + int noisy_offset; + int denoised_offset; + + int num_components; + bool use_compositing; + bool use_denoising_albedo; +}; + +bool OptiXDevice::denoise_buffer(const DeviceDenoiseTask &task) +{ + const CUDAContextScope scope(this); + + DenoiseContext context(this, task); + + if (!denoise_ensure(context)) { + return false; + } + + if (!denoise_filter_guiding_preprocess(context)) { + LOG(ERROR) << "Error preprocessing guiding passes."; + 
return false; + } + + /* Passes which will use real albedo when it is available. */ + denoise_pass(context, PASS_COMBINED); + denoise_pass(context, PASS_SHADOW_CATCHER_MATTE); + + /* Passes which do not need albedo and hence if real is present it needs to become fake. */ + denoise_pass(context, PASS_SHADOW_CATCHER); + + return true; +} + +DeviceQueue *OptiXDevice::get_denoise_queue() +{ + return &denoiser_.queue; +} + +bool OptiXDevice::denoise_filter_guiding_preprocess(DenoiseContext &context) +{ + const BufferParams &buffer_params = context.buffer_params; + + const int work_size = buffer_params.width * buffer_params.height; + + void *args[] = {const_cast<device_ptr *>(&context.guiding_params.device_pointer), + const_cast<int *>(&context.guiding_params.pass_stride), + const_cast<int *>(&context.guiding_params.pass_albedo), + const_cast<int *>(&context.guiding_params.pass_normal), + &context.render_buffers->buffer.device_pointer, + const_cast<int *>(&buffer_params.offset), + const_cast<int *>(&buffer_params.stride), + const_cast<int *>(&buffer_params.pass_stride), + const_cast<int *>(&context.pass_sample_count), + const_cast<int *>(&context.pass_denoising_albedo), + const_cast<int *>(&context.pass_denoising_normal), + const_cast<int *>(&buffer_params.full_x), + const_cast<int *>(&buffer_params.full_y), + const_cast<int *>(&buffer_params.width), + const_cast<int *>(&buffer_params.height), + const_cast<int *>(&context.num_samples)}; + + return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS, work_size, args); +} + +bool OptiXDevice::denoise_filter_guiding_set_fake_albedo(DenoiseContext &context) +{ + const BufferParams &buffer_params = context.buffer_params; + + const int work_size = buffer_params.width * buffer_params.height; + + void *args[] = {const_cast<device_ptr *>(&context.guiding_params.device_pointer), + const_cast<int *>(&context.guiding_params.pass_stride), + const_cast<int *>(&context.guiding_params.pass_albedo), + const_cast<int 
*>(&buffer_params.width), + const_cast<int *>(&buffer_params.height)}; + + return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO, work_size, args); +} + +void OptiXDevice::denoise_pass(DenoiseContext &context, PassType pass_type) +{ + const BufferParams &buffer_params = context.buffer_params; + + const DenoisePass pass(pass_type, buffer_params); + + if (pass.noisy_offset == PASS_UNUSED) { + return; + } + if (pass.denoised_offset == PASS_UNUSED) { + LOG(DFATAL) << "Missing denoised pass " << pass_type_as_string(pass_type); + return; + } + + if (pass.use_denoising_albedo) { + if (context.albedo_replaced_with_fake) { + LOG(ERROR) << "Pass which requires albedo is denoised after fake albedo has been set."; + return; + } + } + else if (!context.albedo_replaced_with_fake) { + context.albedo_replaced_with_fake = true; + if (!denoise_filter_guiding_set_fake_albedo(context)) { + LOG(ERROR) << "Error replacing real albedo with the fake one."; + return; + } + } + + /* Read and preprocess noisy color input pass. */ + denoise_color_read(context, pass); + if (!denoise_filter_color_preprocess(context, pass)) { + LOG(ERROR) << "Error connverting denoising passes to RGB buffer."; + return; + } + + if (!denoise_run(context, pass)) { + LOG(ERROR) << "Error running OptiX denoiser."; + return; + } + + /* Store result in the combined pass of the render buffer. + * + * This will scale the denoiser result up to match the number of, possibly per-pixel, samples. 
*/ + if (!denoise_filter_color_postprocess(context, pass)) { + LOG(ERROR) << "Error copying denoiser result to the denoised pass."; + return; + } + + denoiser_.queue.synchronize(); +} + +void OptiXDevice::denoise_color_read(DenoiseContext &context, const DenoisePass &pass) +{ + PassAccessor::PassAccessInfo pass_access_info; + pass_access_info.type = pass.type; + pass_access_info.mode = PassMode::NOISY; + pass_access_info.offset = pass.noisy_offset; + + /* Denoiser operates on passes which are used to calculate the approximation, and is never used + * on the approximation. The latter is not even possible because OptiX does not support + * denoising of semi-transparent pixels. */ + pass_access_info.use_approximate_shadow_catcher = false; + pass_access_info.use_approximate_shadow_catcher_background = false; + pass_access_info.show_active_pixels = false; + + /* TODO(sergey): Consider adding support of actual exposure, to avoid clamping in extreme cases. + */ + const PassAccessorGPU pass_accessor( + &denoiser_.queue, pass_access_info, 1.0f, context.num_samples); + + PassAccessor::Destination destination(pass_access_info.type); + destination.d_pixels = context.render_buffers->buffer.device_pointer + + pass.denoised_offset * sizeof(float); + destination.num_components = 3; + destination.pixel_stride = context.buffer_params.pass_stride; + + pass_accessor.get_render_tile_pixels(context.render_buffers, context.buffer_params, destination); +} + +bool OptiXDevice::denoise_filter_color_preprocess(DenoiseContext &context, const DenoisePass &pass) +{ + const BufferParams &buffer_params = context.buffer_params; + + const int work_size = buffer_params.width * buffer_params.height; + + void *args[] = {&context.render_buffers->buffer.device_pointer, + const_cast<int *>(&buffer_params.full_x), + const_cast<int *>(&buffer_params.full_y), + const_cast<int *>(&buffer_params.width), + const_cast<int *>(&buffer_params.height), + const_cast<int *>(&buffer_params.offset), + const_cast<int 
*>(&buffer_params.stride), + const_cast<int *>(&buffer_params.pass_stride), + const_cast<int *>(&pass.denoised_offset)}; + + return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_COLOR_PREPROCESS, work_size, args); +} + +bool OptiXDevice::denoise_filter_color_postprocess(DenoiseContext &context, + const DenoisePass &pass) +{ + const BufferParams &buffer_params = context.buffer_params; + + const int work_size = buffer_params.width * buffer_params.height; + + void *args[] = {&context.render_buffers->buffer.device_pointer, + const_cast<int *>(&buffer_params.full_x), + const_cast<int *>(&buffer_params.full_y), + const_cast<int *>(&buffer_params.width), + const_cast<int *>(&buffer_params.height), + const_cast<int *>(&buffer_params.offset), + const_cast<int *>(&buffer_params.stride), + const_cast<int *>(&buffer_params.pass_stride), + const_cast<int *>(&context.num_samples), + const_cast<int *>(&pass.noisy_offset), + const_cast<int *>(&pass.denoised_offset), + const_cast<int *>(&context.pass_sample_count), + const_cast<int *>(&pass.num_components), + const_cast<bool *>(&pass.use_compositing)}; + + return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS, work_size, args); +} + +bool OptiXDevice::denoise_ensure(DenoiseContext &context) +{ + if (!denoise_create_if_needed(context)) { + LOG(ERROR) << "OptiX denoiser creation has failed."; + return false; + } + + if (!denoise_configure_if_needed(context)) { + LOG(ERROR) << "OptiX denoiser configuration has failed."; + return false; + } + + return true; +} + +bool OptiXDevice::denoise_create_if_needed(DenoiseContext &context) +{ + const bool recreate_denoiser = (denoiser_.optix_denoiser == nullptr) || + (denoiser_.use_pass_albedo != context.use_pass_albedo) || + (denoiser_.use_pass_normal != context.use_pass_normal); + if (!recreate_denoiser) { + return true; + } + + /* Destroy existing handle before creating new one. 
*/ + if (denoiser_.optix_denoiser) { + optixDenoiserDestroy(denoiser_.optix_denoiser); + } + + /* Create OptiX denoiser handle on demand when it is first used. */ + OptixDenoiserOptions denoiser_options = {}; + denoiser_options.guideAlbedo = context.use_pass_albedo; + denoiser_options.guideNormal = context.use_pass_normal; + const OptixResult result = optixDenoiserCreate( + this->context, OPTIX_DENOISER_MODEL_KIND_HDR, &denoiser_options, &denoiser_.optix_denoiser); + + if (result != OPTIX_SUCCESS) { + set_error("Failed to create OptiX denoiser"); + return false; + } + + /* OptiX denoiser handle was created with the requested number of input passes. */ + denoiser_.use_pass_albedo = context.use_pass_albedo; + denoiser_.use_pass_normal = context.use_pass_normal; + + /* OptiX denoiser has been created, but it needs configuration. */ + denoiser_.is_configured = false; + + return true; +} + +bool OptiXDevice::denoise_configure_if_needed(DenoiseContext &context) +{ + if (denoiser_.is_configured && (denoiser_.configured_size.x == context.buffer_params.width && + denoiser_.configured_size.y == context.buffer_params.height)) { + return true; + } + + const BufferParams &buffer_params = context.buffer_params; + + OptixDenoiserSizes sizes = {}; + optix_assert(optixDenoiserComputeMemoryResources( + denoiser_.optix_denoiser, buffer_params.width, buffer_params.height, &sizes)); + + denoiser_.scratch_size = sizes.withOverlapScratchSizeInBytes; + denoiser_.scratch_offset = sizes.stateSizeInBytes; + + /* Allocate denoiser state if tile size has changed since last setup. */ + denoiser_.state.alloc_to_device(denoiser_.scratch_offset + denoiser_.scratch_size); + + /* Initialize denoiser state for the current tile size. 
*/ + const OptixResult result = optixDenoiserSetup(denoiser_.optix_denoiser, + denoiser_.queue.stream(), + buffer_params.width, + buffer_params.height, + denoiser_.state.device_pointer, + denoiser_.scratch_offset, + denoiser_.state.device_pointer + + denoiser_.scratch_offset, + denoiser_.scratch_size); + if (result != OPTIX_SUCCESS) { + set_error("Failed to set up OptiX denoiser"); + return false; + } + + denoiser_.is_configured = true; + denoiser_.configured_size.x = buffer_params.width; + denoiser_.configured_size.y = buffer_params.height; + + return true; +} + +bool OptiXDevice::denoise_run(DenoiseContext &context, const DenoisePass &pass) +{ + const BufferParams &buffer_params = context.buffer_params; + const int width = buffer_params.width; + const int height = buffer_params.height; + + /* Set up input and output layer information. */ + OptixImage2D color_layer = {0}; + OptixImage2D albedo_layer = {0}; + OptixImage2D normal_layer = {0}; + + OptixImage2D output_layer = {0}; + + /* Color pass. */ + { + const int pass_denoised = pass.denoised_offset; + const int64_t pass_stride_in_bytes = context.buffer_params.pass_stride * sizeof(float); + + color_layer.data = context.render_buffers->buffer.device_pointer + + pass_denoised * sizeof(float); + color_layer.width = width; + color_layer.height = height; + color_layer.rowStrideInBytes = pass_stride_in_bytes * context.buffer_params.stride; + color_layer.pixelStrideInBytes = pass_stride_in_bytes; + color_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3; + } + + device_vector<float> fake_albedo(this, "fake_albedo", MEM_READ_WRITE); + + /* Optional albedo and color passes. 
*/ + if (context.num_input_passes > 1) { + const device_ptr d_guiding_buffer = context.guiding_params.device_pointer; + const int64_t pixel_stride_in_bytes = context.guiding_params.pass_stride * sizeof(float); + const int64_t row_stride_in_bytes = context.guiding_params.stride * pixel_stride_in_bytes; + + if (context.use_pass_albedo) { + albedo_layer.data = d_guiding_buffer + context.guiding_params.pass_albedo * sizeof(float); + albedo_layer.width = width; + albedo_layer.height = height; + albedo_layer.rowStrideInBytes = row_stride_in_bytes; + albedo_layer.pixelStrideInBytes = pixel_stride_in_bytes; + albedo_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3; + } + + if (context.use_pass_normal) { + normal_layer.data = d_guiding_buffer + context.guiding_params.pass_normal * sizeof(float); + normal_layer.width = width; + normal_layer.height = height; + normal_layer.rowStrideInBytes = row_stride_in_bytes; + normal_layer.pixelStrideInBytes = pixel_stride_in_bytes; + normal_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3; + } + } + + /* Denoise in-place of the noisy input in the render buffers. */ + output_layer = color_layer; + + /* Finally run denonising. */ + OptixDenoiserParams params = {}; /* All parameters are disabled/zero. 
*/ + OptixDenoiserLayer image_layers = {}; + image_layers.input = color_layer; + image_layers.output = output_layer; + + OptixDenoiserGuideLayer guide_layers = {}; + guide_layers.albedo = albedo_layer; + guide_layers.normal = normal_layer; + + optix_assert(optixDenoiserInvoke(denoiser_.optix_denoiser, + denoiser_.queue.stream(), + ¶ms, + denoiser_.state.device_pointer, + denoiser_.scratch_offset, + &guide_layers, + &image_layers, + 1, + 0, + 0, + denoiser_.state.device_pointer + denoiser_.scratch_offset, + denoiser_.scratch_size)); + + return true; +} + +bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh, + OptixBuildOperation operation, + const OptixBuildInput &build_input, + uint16_t num_motion_steps) +{ + const CUDAContextScope scope(this); + + const bool use_fast_trace_bvh = (bvh->params.bvh_type == BVH_TYPE_STATIC); + + /* Compute memory usage. */ + OptixAccelBufferSizes sizes = {}; + OptixAccelBuildOptions options = {}; + options.operation = operation; + if (use_fast_trace_bvh) { + VLOG(2) << "Using fast to trace OptiX BVH"; + options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION; + } + else { + VLOG(2) << "Using fast to update OptiX BVH"; + options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_BUILD | OPTIX_BUILD_FLAG_ALLOW_UPDATE; + } + + options.motionOptions.numKeys = num_motion_steps; + options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH; + options.motionOptions.timeBegin = 0.0f; + options.motionOptions.timeEnd = 1.0f; + + optix_assert(optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes)); + + /* Allocate required output buffers. */ + device_only_memory<char> temp_mem(this, "optix temp as build mem"); + temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8); + if (!temp_mem.device_pointer) { + /* Make sure temporary memory allocation succeeded. 
*/ + return false; + } + + device_only_memory<char> &out_data = bvh->as_data; + if (operation == OPTIX_BUILD_OPERATION_BUILD) { + assert(out_data.device == this); + out_data.alloc_to_device(sizes.outputSizeInBytes); + if (!out_data.device_pointer) { + return false; + } + } + else { + assert(out_data.device_pointer && out_data.device_size >= sizes.outputSizeInBytes); + } + + /* Finally build the acceleration structure. */ + OptixAccelEmitDesc compacted_size_prop = {}; + compacted_size_prop.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE; + /* A tiny space was allocated for this property at the end of the temporary buffer above. + * Make sure this pointer is 8-byte aligned. */ + compacted_size_prop.result = align_up(temp_mem.device_pointer + sizes.tempSizeInBytes, 8); + + OptixTraversableHandle out_handle = 0; + optix_assert(optixAccelBuild(context, + NULL, + &options, + &build_input, + 1, + temp_mem.device_pointer, + sizes.tempSizeInBytes, + out_data.device_pointer, + sizes.outputSizeInBytes, + &out_handle, + use_fast_trace_bvh ? &compacted_size_prop : NULL, + use_fast_trace_bvh ? 1 : 0)); + bvh->traversable_handle = static_cast<uint64_t>(out_handle); + + /* Wait for all operations to finish. */ + cuda_assert(cuStreamSynchronize(NULL)); + + /* Compact acceleration structure to save memory (do not do this in viewport for faster builds). + */ + if (use_fast_trace_bvh) { + uint64_t compacted_size = sizes.outputSizeInBytes; + cuda_assert(cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size))); + + /* Temporary memory is no longer needed, so free it now to make space. */ + temp_mem.free(); + + /* There is no point compacting if the size does not change. */ + if (compacted_size < sizes.outputSizeInBytes) { + device_only_memory<char> compacted_data(this, "optix compacted as"); + compacted_data.alloc_to_device(compacted_size); + if (!compacted_data.device_pointer) + /* Do not compact if memory allocation for compacted acceleration structure fails. 
+ * Can just use the uncompacted one then, so succeed here regardless. */ + return !have_error(); + + optix_assert(optixAccelCompact( + context, NULL, out_handle, compacted_data.device_pointer, compacted_size, &out_handle)); + bvh->traversable_handle = static_cast<uint64_t>(out_handle); + + /* Wait for compaction to finish. */ + cuda_assert(cuStreamSynchronize(NULL)); + + std::swap(out_data.device_size, compacted_data.device_size); + std::swap(out_data.device_pointer, compacted_data.device_pointer); + } + } + + return !have_error(); +} + +void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) +{ + const bool use_fast_trace_bvh = (bvh->params.bvh_type == BVH_TYPE_STATIC); + + free_bvh_memory_delayed(); + + BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh); + + progress.set_substatus("Building OptiX acceleration structure"); + + if (!bvh->params.top_level) { + assert(bvh->objects.size() == 1 && bvh->geometry.size() == 1); + + /* Refit is only possible in viewport for now (because AS is built with + * OPTIX_BUILD_FLAG_ALLOW_UPDATE only there, see above). */ + OptixBuildOperation operation = OPTIX_BUILD_OPERATION_BUILD; + if (refit && !use_fast_trace_bvh) { + assert(bvh_optix->traversable_handle != 0); + operation = OPTIX_BUILD_OPERATION_UPDATE; + } + else { + bvh_optix->as_data.free(); + bvh_optix->traversable_handle = 0; + } + + /* Build bottom level acceleration structures (BLAS). */ + Geometry *const geom = bvh->geometry[0]; + if (geom->geometry_type == Geometry::HAIR) { + /* Build BLAS for curve primitives. 
*/ + Hair *const hair = static_cast<Hair *const>(geom); + if (hair->num_curves() == 0) { + return; + } + + const size_t num_segments = hair->num_segments(); + + size_t num_motion_steps = 1; + Attribute *motion_keys = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + if (motion_blur && hair->get_use_motion_blur() && motion_keys) { + num_motion_steps = hair->get_motion_steps(); + } + + device_vector<OptixAabb> aabb_data(this, "optix temp aabb data", MEM_READ_ONLY); + device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY); + device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY); + /* Four control points for each curve segment. */ + const size_t num_vertices = num_segments * 4; + if (hair->curve_shape == CURVE_THICK) { + index_data.alloc(num_segments); + vertex_data.alloc(num_vertices * num_motion_steps); + } + else + aabb_data.alloc(num_segments * num_motion_steps); + + /* Get AABBs for each motion step. */ + for (size_t step = 0; step < num_motion_steps; ++step) { + /* The center step for motion vertices is not stored in the attribute. */ + const float3 *keys = hair->get_curve_keys().data(); + size_t center_step = (num_motion_steps - 1) / 2; + if (step != center_step) { + size_t attr_offset = (step > center_step) ? step - 1 : step; + /* Technically this is a float4 array, but sizeof(float3) == sizeof(float4). 
*/ + keys = motion_keys->data_float3() + attr_offset * hair->get_curve_keys().size(); + } + + for (size_t j = 0, i = 0; j < hair->num_curves(); ++j) { + const Hair::Curve curve = hair->get_curve(j); + const array<float> &curve_radius = hair->get_curve_radius(); + + for (int segment = 0; segment < curve.num_segments(); ++segment, ++i) { + if (hair->curve_shape == CURVE_THICK) { + int k0 = curve.first_key + segment; + int k1 = k0 + 1; + int ka = max(k0 - 1, curve.first_key); + int kb = min(k1 + 1, curve.first_key + curve.num_keys - 1); + + const float4 px = make_float4(keys[ka].x, keys[k0].x, keys[k1].x, keys[kb].x); + const float4 py = make_float4(keys[ka].y, keys[k0].y, keys[k1].y, keys[kb].y); + const float4 pz = make_float4(keys[ka].z, keys[k0].z, keys[k1].z, keys[kb].z); + const float4 pw = make_float4( + curve_radius[ka], curve_radius[k0], curve_radius[k1], curve_radius[kb]); + + /* Convert Catmull-Rom data to Bezier spline. */ + static const float4 cr2bsp0 = make_float4(+7, -4, +5, -2) / 6.f; + static const float4 cr2bsp1 = make_float4(-2, 11, -4, +1) / 6.f; + static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f; + static const float4 cr2bsp3 = make_float4(-2, +5, -4, +7) / 6.f; + + index_data[i] = i * 4; + float4 *const v = vertex_data.data() + step * num_vertices + index_data[i]; + v[0] = make_float4( + dot(cr2bsp0, px), dot(cr2bsp0, py), dot(cr2bsp0, pz), dot(cr2bsp0, pw)); + v[1] = make_float4( + dot(cr2bsp1, px), dot(cr2bsp1, py), dot(cr2bsp1, pz), dot(cr2bsp1, pw)); + v[2] = make_float4( + dot(cr2bsp2, px), dot(cr2bsp2, py), dot(cr2bsp2, pz), dot(cr2bsp2, pw)); + v[3] = make_float4( + dot(cr2bsp3, px), dot(cr2bsp3, py), dot(cr2bsp3, pz), dot(cr2bsp3, pw)); + } + else { + BoundBox bounds = BoundBox::empty; + curve.bounds_grow(segment, keys, hair->get_curve_radius().data(), bounds); + + const size_t index = step * num_segments + i; + aabb_data[index].minX = bounds.min.x; + aabb_data[index].minY = bounds.min.y; + aabb_data[index].minZ = 
bounds.min.z; + aabb_data[index].maxX = bounds.max.x; + aabb_data[index].maxY = bounds.max.y; + aabb_data[index].maxZ = bounds.max.z; + } + } + } + } + + /* Upload AABB data to GPU. */ + aabb_data.copy_to_device(); + index_data.copy_to_device(); + vertex_data.copy_to_device(); + + vector<device_ptr> aabb_ptrs; + aabb_ptrs.reserve(num_motion_steps); + vector<device_ptr> width_ptrs; + vector<device_ptr> vertex_ptrs; + width_ptrs.reserve(num_motion_steps); + vertex_ptrs.reserve(num_motion_steps); + for (size_t step = 0; step < num_motion_steps; ++step) { + aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb)); + const device_ptr base_ptr = vertex_data.device_pointer + + step * num_vertices * sizeof(float4); + width_ptrs.push_back(base_ptr + 3 * sizeof(float)); /* Offset by vertex size. */ + vertex_ptrs.push_back(base_ptr); + } + + /* Force a single any-hit call, so shadow record-all behavior works correctly. */ + unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL; + OptixBuildInput build_input = {}; + if (hair->curve_shape == CURVE_THICK) { + build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES; + build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE; + build_input.curveArray.numPrimitives = num_segments; + build_input.curveArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data(); + build_input.curveArray.numVertices = num_vertices; + build_input.curveArray.vertexStrideInBytes = sizeof(float4); + build_input.curveArray.widthBuffers = (CUdeviceptr *)width_ptrs.data(); + build_input.curveArray.widthStrideInBytes = sizeof(float4); + build_input.curveArray.indexBuffer = (CUdeviceptr)index_data.device_pointer; + build_input.curveArray.indexStrideInBytes = sizeof(int); + build_input.curveArray.flag = build_flags; + build_input.curveArray.primitiveIndexOffset = hair->optix_prim_offset; + } + else { + /* Disable visibility test any-hit program, since it is already checked during + * intersection. 
Those trace calls that require anyhit can force it with a ray flag. */ + build_flags |= OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT; + + build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES; + build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data(); + build_input.customPrimitiveArray.numPrimitives = num_segments; + build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb); + build_input.customPrimitiveArray.flags = &build_flags; + build_input.customPrimitiveArray.numSbtRecords = 1; + build_input.customPrimitiveArray.primitiveIndexOffset = hair->optix_prim_offset; + } + + if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) { + progress.set_error("Failed to build OptiX acceleration structure"); + } + } + else if (geom->geometry_type == Geometry::MESH || geom->geometry_type == Geometry::VOLUME) { + /* Build BLAS for triangle primitives. */ + Mesh *const mesh = static_cast<Mesh *const>(geom); + if (mesh->num_triangles() == 0) { + return; + } + + const size_t num_verts = mesh->get_verts().size(); + + size_t num_motion_steps = 1; + Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + if (motion_blur && mesh->get_use_motion_blur() && motion_keys) { + num_motion_steps = mesh->get_motion_steps(); + } + + device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY); + index_data.alloc(mesh->get_triangles().size()); + memcpy(index_data.data(), + mesh->get_triangles().data(), + mesh->get_triangles().size() * sizeof(int)); + device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY); + vertex_data.alloc(num_verts * num_motion_steps); + + for (size_t step = 0; step < num_motion_steps; ++step) { + const float3 *verts = mesh->get_verts().data(); + + size_t center_step = (num_motion_steps - 1) / 2; + /* The center step for motion vertices is not stored in the attribute. */ + if (step != center_step) { + verts = motion_keys->data_float3() + (step > center_step ? 
step - 1 : step) * num_verts; + } + + memcpy(vertex_data.data() + num_verts * step, verts, num_verts * sizeof(float3)); + } + + /* Upload triangle data to GPU. */ + index_data.copy_to_device(); + vertex_data.copy_to_device(); + + vector<device_ptr> vertex_ptrs; + vertex_ptrs.reserve(num_motion_steps); + for (size_t step = 0; step < num_motion_steps; ++step) { + vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3)); + } + + /* Force a single any-hit call, so shadow record-all behavior works correctly. */ + unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL; + OptixBuildInput build_input = {}; + build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES; + build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data(); + build_input.triangleArray.numVertices = num_verts; + build_input.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3; + build_input.triangleArray.vertexStrideInBytes = sizeof(float4); + build_input.triangleArray.indexBuffer = index_data.device_pointer; + build_input.triangleArray.numIndexTriplets = mesh->num_triangles(); + build_input.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3; + build_input.triangleArray.indexStrideInBytes = 3 * sizeof(int); + build_input.triangleArray.flags = &build_flags; + /* The SBT does not store per primitive data since Cycles already allocates separate + * buffers for that purpose. OptiX does not allow this to be zero though, so just pass in + * one and rely on that having the same meaning in this case. 
*/ + build_input.triangleArray.numSbtRecords = 1; + build_input.triangleArray.primitiveIndexOffset = mesh->optix_prim_offset; + + if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) { + progress.set_error("Failed to build OptiX acceleration structure"); + } + } + } + else { + unsigned int num_instances = 0; + unsigned int max_num_instances = 0xFFFFFFFF; + + bvh_optix->as_data.free(); + bvh_optix->traversable_handle = 0; + bvh_optix->motion_transform_data.free(); + + optixDeviceContextGetProperty(context, + OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID, + &max_num_instances, + sizeof(max_num_instances)); + /* Do not count first bit, which is used to distinguish instanced and non-instanced objects. */ + max_num_instances >>= 1; + if (bvh->objects.size() > max_num_instances) { + progress.set_error( + "Failed to build OptiX acceleration structure because there are too many instances"); + return; + } + + /* Fill instance descriptions. */ + device_vector<OptixInstance> instances(this, "optix tlas instances", MEM_READ_ONLY); + instances.alloc(bvh->objects.size()); + + /* Calculate total motion transform size and allocate memory for them. */ + size_t motion_transform_offset = 0; + if (motion_blur) { + size_t total_motion_transform_size = 0; + for (Object *const ob : bvh->objects) { + if (ob->is_traceable() && ob->use_motion()) { + total_motion_transform_size = align_up(total_motion_transform_size, + OPTIX_TRANSFORM_BYTE_ALIGNMENT); + const size_t motion_keys = max(ob->get_motion().size(), 2) - 2; + total_motion_transform_size = total_motion_transform_size + + sizeof(OptixSRTMotionTransform) + + motion_keys * sizeof(OptixSRTData); + } + } + + assert(bvh_optix->motion_transform_data.device == this); + bvh_optix->motion_transform_data.alloc_to_device(total_motion_transform_size); + } + + for (Object *ob : bvh->objects) { + /* Skip non-traceable objects. 
*/ + if (!ob->is_traceable()) { + continue; + } + + BVHOptiX *const blas = static_cast<BVHOptiX *>(ob->get_geometry()->bvh); + OptixTraversableHandle handle = blas->traversable_handle; + + OptixInstance &instance = instances[num_instances++]; + memset(&instance, 0, sizeof(instance)); + + /* Clear transform to identity matrix. */ + instance.transform[0] = 1.0f; + instance.transform[5] = 1.0f; + instance.transform[10] = 1.0f; + + /* Set user instance ID to object index (but leave low bit blank). */ + instance.instanceId = ob->get_device_index() << 1; + + /* Have to have at least one bit in the mask, or else instance would always be culled. */ + instance.visibilityMask = 1; + + if (ob->get_geometry()->has_volume) { + /* Volumes have a special bit set in the visibility mask so a trace can mask only volumes. + */ + instance.visibilityMask |= 2; + } + + if (ob->get_geometry()->geometry_type == Geometry::HAIR) { + /* Same applies to curves (so they can be skipped in local trace calls). */ + instance.visibilityMask |= 4; + + if (motion_blur && ob->get_geometry()->has_motion_blur() && + static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) { + /* Select between motion blur and non-motion blur built-in intersection module. */ + instance.sbtOffset = PG_HITD_MOTION - PG_HITD; + } + } + + /* Insert motion traversable if object has motion. */ + if (motion_blur && ob->use_motion()) { + size_t motion_keys = max(ob->get_motion().size(), 2) - 2; + size_t motion_transform_size = sizeof(OptixSRTMotionTransform) + + motion_keys * sizeof(OptixSRTData); + + const CUDAContextScope scope(this); + + motion_transform_offset = align_up(motion_transform_offset, + OPTIX_TRANSFORM_BYTE_ALIGNMENT); + CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data.device_pointer + + motion_transform_offset; + motion_transform_offset += motion_transform_size; + + /* Allocate host side memory for motion transform and fill it with transform data. 
*/ + OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>( + new uint8_t[motion_transform_size]); + motion_transform.child = handle; + motion_transform.motionOptions.numKeys = ob->get_motion().size(); + motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE; + motion_transform.motionOptions.timeBegin = 0.0f; + motion_transform.motionOptions.timeEnd = 1.0f; + + OptixSRTData *const srt_data = motion_transform.srtData; + array<DecomposedTransform> decomp(ob->get_motion().size()); + transform_motion_decompose( + decomp.data(), ob->get_motion().data(), ob->get_motion().size()); + + for (size_t i = 0; i < ob->get_motion().size(); ++i) { + /* Scale. */ + srt_data[i].sx = decomp[i].y.w; /* scale.x.x */ + srt_data[i].sy = decomp[i].z.w; /* scale.y.y */ + srt_data[i].sz = decomp[i].w.w; /* scale.z.z */ + + /* Shear. */ + srt_data[i].a = decomp[i].z.x; /* scale.x.y */ + srt_data[i].b = decomp[i].z.y; /* scale.x.z */ + srt_data[i].c = decomp[i].w.x; /* scale.y.z */ + assert(decomp[i].z.z == 0.0f); /* scale.y.x */ + assert(decomp[i].w.y == 0.0f); /* scale.z.x */ + assert(decomp[i].w.z == 0.0f); /* scale.z.y */ + + /* Pivot point. */ + srt_data[i].pvx = 0.0f; + srt_data[i].pvy = 0.0f; + srt_data[i].pvz = 0.0f; + + /* Rotation. */ + srt_data[i].qx = decomp[i].x.x; + srt_data[i].qy = decomp[i].x.y; + srt_data[i].qz = decomp[i].x.z; + srt_data[i].qw = decomp[i].x.w; + + /* Translation. */ + srt_data[i].tx = decomp[i].y.x; + srt_data[i].ty = decomp[i].y.y; + srt_data[i].tz = decomp[i].y.z; + } + + /* Upload motion transform to GPU. */ + cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size); + delete[] reinterpret_cast<uint8_t *>(&motion_transform); + + /* Disable instance transform if object uses motion transform already. */ + instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM; + + /* Get traversable handle to motion transform. 
*/ + optixConvertPointerToTraversableHandle(context, + motion_transform_gpu, + OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM, + &instance.traversableHandle); + } + else { + instance.traversableHandle = handle; + + if (ob->get_geometry()->is_instanced()) { + /* Set transform matrix. */ + memcpy(instance.transform, &ob->get_tfm(), sizeof(instance.transform)); + } + else { + /* Disable instance transform if geometry already has it applied to vertex data. */ + instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM; + /* Non-instanced objects read ID from 'prim_object', so distinguish + * them from instanced objects with the low bit set. */ + instance.instanceId |= 1; + } + } + } + + /* Upload instance descriptions. */ + instances.resize(num_instances); + instances.copy_to_device(); + + /* Build top-level acceleration structure (TLAS) */ + OptixBuildInput build_input = {}; + build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES; + build_input.instanceArray.instances = instances.device_pointer; + build_input.instanceArray.numInstances = num_instances; + + if (!build_optix_bvh(bvh_optix, OPTIX_BUILD_OPERATION_BUILD, build_input, 0)) { + progress.set_error("Failed to build OptiX acceleration structure"); + } + tlas_handle = bvh_optix->traversable_handle; + } +} + +void OptiXDevice::release_optix_bvh(BVH *bvh) +{ + thread_scoped_lock lock(delayed_free_bvh_mutex); + /* Do delayed free of BVH memory, since geometry holding BVH might be deleted + * while GPU is still rendering. 
*/ + BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh); + + delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->as_data)); + delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->motion_transform_data)); + bvh_optix->traversable_handle = 0; +} + +void OptiXDevice::free_bvh_memory_delayed() +{ + thread_scoped_lock lock(delayed_free_bvh_mutex); + delayed_free_bvh_memory.free_memory(); +} + +void OptiXDevice::const_copy_to(const char *name, void *host, size_t size) +{ + /* Set constant memory for CUDA module. */ + CUDADevice::const_copy_to(name, host, size); + + if (strcmp(name, "__data") == 0) { + assert(size <= sizeof(KernelData)); + + /* Update traversable handle (since it is different for each device on multi devices). */ + KernelData *const data = (KernelData *)host; + *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle; + + update_launch_params(offsetof(KernelParamsOptiX, data), host, size); + return; + } + + /* Update data storage pointers in launch parameters. */ +# define KERNEL_TEX(data_type, tex_name) \ + if (strcmp(name, #tex_name) == 0) { \ + update_launch_params(offsetof(KernelParamsOptiX, tex_name), host, size); \ + return; \ + } + KERNEL_TEX(IntegratorStateGPU, __integrator_state) +# include "kernel/kernel_textures.h" +# undef KERNEL_TEX +} + +void OptiXDevice::update_launch_params(size_t offset, void *data, size_t data_size) +{ + const CUDAContextScope scope(this); + + cuda_assert(cuMemcpyHtoD(launch_params.device_pointer + offset, data, data_size)); +} + +CCL_NAMESPACE_END + +#endif /* WITH_OPTIX */ diff --git a/intern/cycles/device/optix/device_impl.h b/intern/cycles/device/optix/device_impl.h new file mode 100644 index 00000000000..742ae0f1bab --- /dev/null +++ b/intern/cycles/device/optix/device_impl.h @@ -0,0 +1,186 @@ +/* + * Copyright 2019, NVIDIA Corporation. + * Copyright 2019, Blender Foundation. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef WITH_OPTIX + +# include "device/cuda/device_impl.h" +# include "device/optix/queue.h" +# include "device/optix/util.h" +# include "kernel/kernel_types.h" + +CCL_NAMESPACE_BEGIN + +class BVHOptiX; +struct KernelParamsOptiX; + +/* List of OptiX program groups. */ +enum { + PG_RGEN_INTERSECT_CLOSEST, + PG_RGEN_INTERSECT_SHADOW, + PG_RGEN_INTERSECT_SUBSURFACE, + PG_RGEN_INTERSECT_VOLUME_STACK, + PG_RGEN_SHADE_SURFACE_RAYTRACE, + PG_MISS, + PG_HITD, /* Default hit group. */ + PG_HITS, /* __SHADOW_RECORD_ALL__ hit group. */ + PG_HITL, /* __BVH_LOCAL__ hit group (only used for triangles). */ + PG_HITD_MOTION, + PG_HITS_MOTION, + PG_CALL_SVM_AO, + PG_CALL_SVM_BEVEL, + PG_CALL_AO_PASS, + NUM_PROGRAM_GROUPS +}; + +static const int MISS_PROGRAM_GROUP_OFFSET = PG_MISS; +static const int NUM_MIS_PROGRAM_GROUPS = 1; +static const int HIT_PROGAM_GROUP_OFFSET = PG_HITD; +static const int NUM_HIT_PROGRAM_GROUPS = 5; +static const int CALLABLE_PROGRAM_GROUPS_BASE = PG_CALL_SVM_AO; +static const int NUM_CALLABLE_PROGRAM_GROUPS = 3; + +/* List of OptiX pipelines. */ +enum { PIP_SHADE_RAYTRACE, PIP_INTERSECT, NUM_PIPELINES }; + +/* A single shader binding table entry. 
*/ +struct SbtRecord { + char header[OPTIX_SBT_RECORD_HEADER_SIZE]; +}; + +class OptiXDevice : public CUDADevice { + public: + OptixDeviceContext context = NULL; + + OptixModule optix_module = NULL; /* All necessary OptiX kernels are in one module. */ + OptixModule builtin_modules[2] = {}; + OptixPipeline pipelines[NUM_PIPELINES] = {}; + + bool motion_blur = false; + device_vector<SbtRecord> sbt_data; + device_only_memory<KernelParamsOptiX> launch_params; + OptixTraversableHandle tlas_handle = 0; + + vector<device_only_memory<char>> delayed_free_bvh_memory; + thread_mutex delayed_free_bvh_mutex; + + class Denoiser { + public: + explicit Denoiser(OptiXDevice *device); + ~Denoiser(); + + OptiXDevice *device; + OptiXDeviceQueue queue; + + OptixDenoiser optix_denoiser = nullptr; + + /* Configuration size, as provided to `optixDenoiserSetup`. + * If the `optixDenoiserSetup()` was never used on the current `optix_denoiser` the + * `is_configured` will be false. */ + bool is_configured = false; + int2 configured_size = make_int2(0, 0); + + /* OptiX denoiser state and scratch buffers, stored in a single memory buffer. + * The memory layout goes as following: [denoiser state][scratch buffer]. 
*/ + device_only_memory<unsigned char> state; + size_t scratch_offset = 0; + size_t scratch_size = 0; + + bool use_pass_albedo = false; + bool use_pass_normal = false; + }; + Denoiser denoiser_; + + public: + OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler); + ~OptiXDevice(); + + private: + BVHLayoutMask get_bvh_layout_mask() const override; + + string compile_kernel_get_common_cflags(const uint kernel_features) override; + + bool load_kernels(const uint kernel_features) override; + + bool build_optix_bvh(BVHOptiX *bvh, + OptixBuildOperation operation, + const OptixBuildInput &build_input, + uint16_t num_motion_steps); + + void build_bvh(BVH *bvh, Progress &progress, bool refit) override; + + void release_optix_bvh(BVH *bvh) override; + void free_bvh_memory_delayed(); + + void const_copy_to(const char *name, void *host, size_t size) override; + + void update_launch_params(size_t offset, void *data, size_t data_size); + + virtual unique_ptr<DeviceQueue> gpu_queue_create() override; + + /* -------------------------------------------------------------------- + * Denoising. + */ + + class DenoiseContext; + class DenoisePass; + + virtual bool denoise_buffer(const DeviceDenoiseTask &task) override; + virtual DeviceQueue *get_denoise_queue() override; + + /* Read guiding passes from the render buffers, preprocess them in a way which is expected by + * OptiX and store in the guiding passes memory within the given context. + * + * Pre=-processing of the guiding passes is to only hapopen once per context lifetime. DO not + * preprocess them for every pass which is being denoised. */ + bool denoise_filter_guiding_preprocess(DenoiseContext &context); + + /* Set fake albedo pixels in the albedo guiding pass storage. + * After this point only passes which do not need albedo for denoising can be processed. 
*/ + bool denoise_filter_guiding_set_fake_albedo(DenoiseContext &context); + + void denoise_pass(DenoiseContext &context, PassType pass_type); + + /* Read input color pass from the render buffer into the memory which corresponds to the noisy + * input within the given context. Pixels are scaled to the number of samples, but are not + * preprocessed yet. */ + void denoise_color_read(DenoiseContext &context, const DenoisePass &pass); + + /* Run corresponding filter kernels, preparing data for the denoiser or copying data from the + * denoiser result to the render buffer. */ + bool denoise_filter_color_preprocess(DenoiseContext &context, const DenoisePass &pass); + bool denoise_filter_color_postprocess(DenoiseContext &context, const DenoisePass &pass); + + /* Make sure the OptiX denoiser is created and configured. */ + bool denoise_ensure(DenoiseContext &context); + + /* Create OptiX denoiser descriptor if needed. + * Will do nothing if the current OptiX descriptor is usable for the given parameters. + * If the OptiX denoiser descriptor did re-allocate here it is left unconfigured. */ + bool denoise_create_if_needed(DenoiseContext &context); + + /* Configure existing OptiX denoiser descriptor for the use for the given task. */ + bool denoise_configure_if_needed(DenoiseContext &context); + + /* Run configured denoiser. */ + bool denoise_run(DenoiseContext &context, const DenoisePass &pass); +}; + +CCL_NAMESPACE_END + +#endif /* WITH_OPTIX */ diff --git a/intern/cycles/device/optix/queue.cpp b/intern/cycles/device/optix/queue.cpp new file mode 100644 index 00000000000..458ed70baa8 --- /dev/null +++ b/intern/cycles/device/optix/queue.cpp @@ -0,0 +1,144 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_OPTIX + +# include "device/optix/queue.h" +# include "device/optix/device_impl.h" + +# include "util/util_time.h" + +# undef __KERNEL_CPU__ +# define __KERNEL_OPTIX__ +# include "kernel/device/optix/globals.h" + +CCL_NAMESPACE_BEGIN + +/* CUDADeviceQueue */ + +OptiXDeviceQueue::OptiXDeviceQueue(OptiXDevice *device) : CUDADeviceQueue(device) +{ +} + +void OptiXDeviceQueue::init_execution() +{ + CUDADeviceQueue::init_execution(); +} + +static bool is_optix_specific_kernel(DeviceKernel kernel) +{ + return (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE || + kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST || + kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW || + kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE || + kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK); +} + +bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[]) +{ + if (!is_optix_specific_kernel(kernel)) { + return CUDADeviceQueue::enqueue(kernel, work_size, args); + } + + if (cuda_device_->have_error()) { + return false; + } + + debug_enqueue(kernel, work_size); + + const CUDAContextScope scope(cuda_device_); + + OptiXDevice *const optix_device = static_cast<OptiXDevice *>(cuda_device_); + + const device_ptr sbt_data_ptr = optix_device->sbt_data.device_pointer; + const device_ptr launch_params_ptr = optix_device->launch_params.device_pointer; + + cuda_device_assert( + cuda_device_, + cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, path_index_array), + args[0], // &d_path_index + 
sizeof(device_ptr), + cuda_stream_)); + + if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) { + cuda_device_assert( + cuda_device_, + cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, render_buffer), + args[1], // &d_render_buffer + sizeof(device_ptr), + cuda_stream_)); + } + + cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_)); + + OptixPipeline pipeline = nullptr; + OptixShaderBindingTable sbt_params = {}; + + switch (kernel) { + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE: + pipeline = optix_device->pipelines[PIP_SHADE_RAYTRACE]; + sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE_RAYTRACE * sizeof(SbtRecord); + break; + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST: + pipeline = optix_device->pipelines[PIP_INTERSECT]; + sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_CLOSEST * sizeof(SbtRecord); + break; + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW: + pipeline = optix_device->pipelines[PIP_INTERSECT]; + sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_SHADOW * sizeof(SbtRecord); + break; + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE: + pipeline = optix_device->pipelines[PIP_INTERSECT]; + sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_SUBSURFACE * sizeof(SbtRecord); + break; + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: + pipeline = optix_device->pipelines[PIP_INTERSECT]; + sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_VOLUME_STACK * sizeof(SbtRecord); + break; + + default: + LOG(ERROR) << "Invalid kernel " << device_kernel_as_string(kernel) + << " is attempted to be enqueued."; + return false; + } + + sbt_params.missRecordBase = sbt_data_ptr + MISS_PROGRAM_GROUP_OFFSET * sizeof(SbtRecord); + sbt_params.missRecordStrideInBytes = sizeof(SbtRecord); + sbt_params.missRecordCount = NUM_MIS_PROGRAM_GROUPS; + sbt_params.hitgroupRecordBase = sbt_data_ptr + HIT_PROGAM_GROUP_OFFSET * sizeof(SbtRecord); + 
sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord); + sbt_params.hitgroupRecordCount = NUM_HIT_PROGRAM_GROUPS; + sbt_params.callablesRecordBase = sbt_data_ptr + CALLABLE_PROGRAM_GROUPS_BASE * sizeof(SbtRecord); + sbt_params.callablesRecordCount = NUM_CALLABLE_PROGRAM_GROUPS; + sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord); + + /* Launch the ray generation program. */ + optix_device_assert(optix_device, + optixLaunch(pipeline, + cuda_stream_, + launch_params_ptr, + optix_device->launch_params.data_elements, + &sbt_params, + work_size, + 1, + 1)); + + return !(optix_device->have_error()); +} + +CCL_NAMESPACE_END + +#endif /* WITH_OPTIX */ diff --git a/intern/cycles/device/optix/queue.h b/intern/cycles/device/optix/queue.h new file mode 100644 index 00000000000..0de422ccc71 --- /dev/null +++ b/intern/cycles/device/optix/queue.h @@ -0,0 +1,39 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef WITH_OPTIX + +# include "device/cuda/queue.h" + +CCL_NAMESPACE_BEGIN + +class OptiXDevice; + +/* Base class for CUDA queues. 
*/ +class OptiXDeviceQueue : public CUDADeviceQueue { + public: + OptiXDeviceQueue(OptiXDevice *device); + + virtual void init_execution() override; + + virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) override; +}; + +CCL_NAMESPACE_END + +#endif /* WITH_OPTIX */ diff --git a/intern/cycles/device/optix/util.h b/intern/cycles/device/optix/util.h new file mode 100644 index 00000000000..34ae5bb5609 --- /dev/null +++ b/intern/cycles/device/optix/util.h @@ -0,0 +1,45 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef WITH_OPTIX + +# include "device/cuda/util.h" + +# ifdef WITH_CUDA_DYNLOAD +# include <cuew.h> +// Do not use CUDA SDK headers when using CUEW +# define OPTIX_DONT_INCLUDE_CUDA +# endif + +# include <optix_stubs.h> + +/* Utility for checking return values of OptiX function calls. 
*/ +# define optix_device_assert(optix_device, stmt) \ + { \ + OptixResult result = stmt; \ + if (result != OPTIX_SUCCESS) { \ + const char *name = optixGetErrorName(result); \ + optix_device->set_error( \ + string_printf("%s in %s (%s:%d)", name, #stmt, __FILE__, __LINE__)); \ + } \ + } \ + (void)0 + +# define optix_assert(stmt) optix_device_assert(this, stmt) + +#endif /* WITH_OPTIX */ diff --git a/intern/cycles/graph/node.cpp b/intern/cycles/graph/node.cpp index 57f25283f85..8294e716ebe 100644 --- a/intern/cycles/graph/node.cpp +++ b/intern/cycles/graph/node.cpp @@ -814,7 +814,7 @@ bool Node::socket_is_modified(const SocketType &input) const return (socket_modified & input.modified_flag_bit) != 0; } -bool Node::is_modified() +bool Node::is_modified() const { return socket_modified != 0; } diff --git a/intern/cycles/graph/node.h b/intern/cycles/graph/node.h index aa365baeccd..8f27a82d37b 100644 --- a/intern/cycles/graph/node.h +++ b/intern/cycles/graph/node.h @@ -16,6 +16,8 @@ #pragma once +#include <type_traits> + #include "graph/node_type.h" #include "util/util_array.h" @@ -34,7 +36,10 @@ struct Transform; #define NODE_SOCKET_API_BASE_METHODS(type_, name, string_name) \ const SocketType *get_##name##_socket() const \ { \ - static const SocketType *socket = type->find_input(ustring(string_name)); \ + /* Explicitly cast to base class to use `Node::type` even if the derived class defines \ + * `type`. */ \ + const Node *self_node = this; \ + static const SocketType *socket = self_node->type->find_input(ustring(string_name)); \ return socket; \ } \ bool name##_is_modified() const \ @@ -111,6 +116,15 @@ struct Node { void set(const SocketType &input, const Transform &value); void set(const SocketType &input, Node *value); + /* Implicitly cast enums and enum classes to integer, which matches an internal way of how + * enumerator values are stored and accessed in a generic API. 
*/ + template<class ValueType, typename std::enable_if_t<std::is_enum_v<ValueType>> * = nullptr> + void set(const SocketType &input, const ValueType &value) + { + static_assert(sizeof(ValueType) <= sizeof(int), "Enumerator type should fit int"); + set(input, static_cast<int>(value)); + } + /* set array values. the memory from the input array will taken over * by the node and the input array will be empty after return */ void set(const SocketType &input, array<bool> &value); @@ -164,7 +178,7 @@ struct Node { bool socket_is_modified(const SocketType &input) const; - bool is_modified(); + bool is_modified() const; void tag_modified(); void clear_modified(); diff --git a/intern/cycles/integrator/CMakeLists.txt b/intern/cycles/integrator/CMakeLists.txt new file mode 100644 index 00000000000..bfabd35d7c3 --- /dev/null +++ b/intern/cycles/integrator/CMakeLists.txt @@ -0,0 +1,76 @@ +# Copyright 2011-2021 Blender Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set(INC + .. 
+) + +set(SRC + adaptive_sampling.cpp + denoiser.cpp + denoiser_device.cpp + denoiser_oidn.cpp + denoiser_optix.cpp + path_trace.cpp + tile.cpp + pass_accessor.cpp + pass_accessor_cpu.cpp + pass_accessor_gpu.cpp + path_trace_work.cpp + path_trace_work_cpu.cpp + path_trace_work_gpu.cpp + render_scheduler.cpp + shader_eval.cpp + work_balancer.cpp + work_tile_scheduler.cpp +) + +set(SRC_HEADERS + adaptive_sampling.h + denoiser.h + denoiser_device.h + denoiser_oidn.h + denoiser_optix.h + path_trace.h + tile.h + pass_accessor.h + pass_accessor_cpu.h + pass_accessor_gpu.h + path_trace_work.h + path_trace_work_cpu.h + path_trace_work_gpu.h + render_scheduler.h + shader_eval.h + work_balancer.h + work_tile_scheduler.h +) + +set(LIB + # NOTE: Is required for RenderBuffers access. Might consider moving files around a bit to + # avoid such cyclic dependency. + cycles_render + + cycles_util +) + +if(WITH_OPENIMAGEDENOISE) + list(APPEND LIB + ${OPENIMAGEDENOISE_LIBRARIES} + ) +endif() + +include_directories(${INC}) +include_directories(SYSTEM ${INC_SYS}) + +cycles_add_library(cycles_integrator "${LIB}" ${SRC} ${SRC_HEADERS}) diff --git a/intern/cycles/integrator/adaptive_sampling.cpp b/intern/cycles/integrator/adaptive_sampling.cpp new file mode 100644 index 00000000000..23fbcfea5c2 --- /dev/null +++ b/intern/cycles/integrator/adaptive_sampling.cpp @@ -0,0 +1,71 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "integrator/adaptive_sampling.h" + +#include "util/util_math.h" + +CCL_NAMESPACE_BEGIN + +AdaptiveSampling::AdaptiveSampling() +{ +} + +int AdaptiveSampling::align_samples(int start_sample, int num_samples) const +{ + if (!use) { + return num_samples; + } + + /* + * The naive implementation goes as following: + * + * int count = 1; + * while (!need_filter(start_sample + count - 1) && count < num_samples) { + * ++count; + * } + * return count; + */ + + /* 0-based sample index at which first filtering will happen. */ + const int first_filter_sample = (min_samples + 1) | (adaptive_step - 1); + + /* Allow as many samples as possible until the first filter sample. */ + if (start_sample + num_samples <= first_filter_sample) { + return num_samples; + } + + const int next_filter_sample = max(first_filter_sample, start_sample | (adaptive_step - 1)); + + const int num_samples_until_filter = next_filter_sample - start_sample + 1; + + return min(num_samples_until_filter, num_samples); +} + +bool AdaptiveSampling::need_filter(int sample) const +{ + if (!use) { + return false; + } + + if (sample <= min_samples) { + return false; + } + + return (sample & (adaptive_step - 1)) == (adaptive_step - 1); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/adaptive_sampling.h b/intern/cycles/integrator/adaptive_sampling.h new file mode 100644 index 00000000000..d98edd9894c --- /dev/null +++ b/intern/cycles/integrator/adaptive_sampling.h @@ -0,0 +1,55 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +CCL_NAMESPACE_BEGIN + +class AdaptiveSampling { + public: + AdaptiveSampling(); + + /* Align number of samples so that they align with the adaptive filtering. + * + * Returns the new value for the `num_samples` so that after rendering so many samples on top + * of `start_sample` filtering is required. + * + * The alignment happens in a way that allows to render as many samples as possible without + * missing any filtering point. This means that the result is "clamped" by the nearest sample + * at which filtering is needed. This is part of mechanism which ensures that all devices will + * perform same exact filtering and adaptive sampling, regardless of their performance. + * + * `start_sample` is the 0-based index of sample. + * + * NOTE: The start sample is included into the number of samples to render. This means that + * if the number of samples is 1, then the path tracer will render samples [align_samples], + * if the number of samples is 2, then the path tracer will render samples [align_samples, + * align_samples + 1] and so on. */ + int align_samples(int start_sample, int num_samples) const; + + /* Check whether adaptive sampling filter should happen at this sample. + * Returns false if the adaptive sampling is not use. + * + * `sample` is the 0-based index of sample. */ + bool need_filter(int sample) const; + + bool use = false; + int adaptive_step = 0; + int min_samples = 0; + float threshold = 0.0f; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/denoiser.cpp b/intern/cycles/integrator/denoiser.cpp new file mode 100644 index 00000000000..598bbd497a5 --- /dev/null +++ b/intern/cycles/integrator/denoiser.cpp @@ -0,0 +1,204 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "integrator/denoiser.h" + +#include "device/device.h" +#include "integrator/denoiser_oidn.h" +#include "integrator/denoiser_optix.h" +#include "render/buffers.h" +#include "util/util_logging.h" +#include "util/util_progress.h" + +CCL_NAMESPACE_BEGIN + +unique_ptr<Denoiser> Denoiser::create(Device *path_trace_device, const DenoiseParams ¶ms) +{ + DCHECK(params.use); + + switch (params.type) { + case DENOISER_OPTIX: + return make_unique<OptiXDenoiser>(path_trace_device, params); + + case DENOISER_OPENIMAGEDENOISE: + return make_unique<OIDNDenoiser>(path_trace_device, params); + + case DENOISER_NUM: + case DENOISER_NONE: + case DENOISER_ALL: + /* pass */ + break; + } + + LOG(FATAL) << "Unhandled denoiser type " << params.type << ", should never happen."; + + return nullptr; +} + +Denoiser::Denoiser(Device *path_trace_device, const DenoiseParams ¶ms) + : path_trace_device_(path_trace_device), params_(params) +{ + DCHECK(params.use); +} + +void Denoiser::set_params(const DenoiseParams ¶ms) +{ + DCHECK_EQ(params.type, params_.type); + + if (params.type == params_.type) { + params_ = params; + } + else { + LOG(ERROR) << "Attempt to change denoiser type."; + } +} + +const DenoiseParams &Denoiser::get_params() const +{ + return params_; +} + +bool Denoiser::load_kernels(Progress *progress) +{ + const Device *denoiser_device = ensure_denoiser_device(progress); + + if (!denoiser_device) { + path_trace_device_->set_error("No device available to denoise on"); + return false; + } + + VLOG(3) << "Will denoise on " << 
denoiser_device->info.description << " (" + << denoiser_device->info.id << ")"; + + return true; +} + +Device *Denoiser::get_denoiser_device() const +{ + return denoiser_device_; +} + +/* Check whether given device is single (not a MultiDevice) and supports requested denoiser. */ +static bool is_single_supported_device(Device *device, DenoiserType type) +{ + if (device->info.type == DEVICE_MULTI) { + /* Assume multi-device is never created with a single sub-device. + * If one requests such configuration it should be checked on the session level. */ + return false; + } + + if (!device->info.multi_devices.empty()) { + /* Some configurations will use multi_devices, but keep the type of an individual device. + * This does simplify checks for homogenous setups, but here we really need a single device. */ + return false; + } + + /* Check the denoiser type is supported. */ + return (device->info.denoisers & type); +} + +/* Find best suitable device to perform denoiser on. Will iterate over possible sub-devices of + * multi-device. + * + * If there is no device available which supports given denoiser type nullptr is returned. */ +static Device *find_best_device(Device *device, DenoiserType type) +{ + Device *best_device = nullptr; + + device->foreach_device([&](Device *sub_device) { + if ((sub_device->info.denoisers & type) == 0) { + return; + } + if (!best_device) { + best_device = sub_device; + } + else { + /* TODO(sergey): Choose fastest device from available ones. Taking into account performance + * of the device and data transfer cost. 
*/ + } + }); + + return best_device; +} + +static unique_ptr<Device> create_denoiser_device(Device *path_trace_device, + const uint device_type_mask) +{ + const vector<DeviceInfo> device_infos = Device::available_devices(device_type_mask); + if (device_infos.empty()) { + return nullptr; + } + + /* TODO(sergey): Use one of the already configured devices, so that OptiX denoising can happen on + * a physical CUDA device which is already used for rendering. */ + + /* TODO(sergey): Choose fastest device for denoising. */ + + const DeviceInfo denoiser_device_info = device_infos.front(); + + unique_ptr<Device> denoiser_device( + Device::create(denoiser_device_info, path_trace_device->stats, path_trace_device->profiler)); + + if (!denoiser_device) { + return nullptr; + } + + if (denoiser_device->have_error()) { + return nullptr; + } + + /* Only need denoising feature, everything else is unused. */ + if (!denoiser_device->load_kernels(KERNEL_FEATURE_DENOISING)) { + return nullptr; + } + + return denoiser_device; +} + +Device *Denoiser::ensure_denoiser_device(Progress *progress) +{ + /* The best device has been found already, avoid sequential lookups. + * Additionally, avoid device re-creation if it has failed once. */ + if (denoiser_device_ || device_creation_attempted_) { + return denoiser_device_; + } + + /* Simple case: rendering happens on a single device which also supports denoiser. */ + if (is_single_supported_device(path_trace_device_, params_.type)) { + denoiser_device_ = path_trace_device_; + return denoiser_device_; + } + + /* Find best device from the ones which are already used for rendering. 
*/ + denoiser_device_ = find_best_device(path_trace_device_, params_.type); + if (denoiser_device_) { + return denoiser_device_; + } + + if (progress) { + progress->set_status("Loading denoising kernels (may take a few minutes the first time)"); + } + + device_creation_attempted_ = true; + + const uint device_type_mask = get_device_type_mask(); + local_denoiser_device_ = create_denoiser_device(path_trace_device_, device_type_mask); + denoiser_device_ = local_denoiser_device_.get(); + + return denoiser_device_; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/denoiser.h b/intern/cycles/integrator/denoiser.h new file mode 100644 index 00000000000..3101b45e31b --- /dev/null +++ b/intern/cycles/integrator/denoiser.h @@ -0,0 +1,135 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +/* TODO(sergey): The integrator folder might not be the best. Is easy to move files around if the + * better place is figured out. */ + +#include "device/device.h" +#include "device/device_denoise.h" +#include "util/util_function.h" +#include "util/util_unique_ptr.h" + +CCL_NAMESPACE_BEGIN + +class BufferParams; +class Device; +class RenderBuffers; +class Progress; + +/* Implementation of a specific denoising algorithm. + * + * This class takes care of breaking down denosiing algorithm into a series of device calls or to + * calls of an external API to denoise given input. 
+ * + * TODO(sergey): Are we better with device or a queue here? */ +class Denoiser { + public: + /* Create denoiser for the given path trace device. + * + * Notes: + * - The denoiser must be configured. This means that `params.use` must be true. + * This is checked in debug builds. + * - The device might be MultiDevice. */ + static unique_ptr<Denoiser> create(Device *path_trace_device, const DenoiseParams &params); + + virtual ~Denoiser() = default; + + void set_params(const DenoiseParams &params); + const DenoiseParams &get_params() const; + + /* Create devices and load kernels needed for denoising. + * The progress is used to communicate state when kernels actually need to be loaded. + * + * NOTE: The `progress` is an optional argument, can be nullptr. */ + virtual bool load_kernels(Progress *progress); + + /* Denoise the entire buffer. + * + * Buffer parameters denote the effective parameters used during rendering. It could be + * a lower resolution render into a bigger allocated buffer, which is used in viewport during + * navigation and non-unit pixel size. Use that instead of render_buffers->params. + * + * The buffer might be coming from a "foreign" device from what this denoiser is created for. + * This means that in the general case the denoiser will make sure the input data is available on + * the denoiser device, perform denoising, and put data back to the device where the buffer + * came from. + * + * The `num_samples` corresponds to the number of samples in the render buffers. It is used + * to scale buffers down to the "final" value in algorithms which don't do automatic exposure, + * or which need the "final" value for data passes. + * + * The `allow_inplace_modification` means that the denoiser is allowed to do in-place + * modification of the input passes (e.g. scaling them down). This will lower the memory + * footprint of the denoiser but will make input passes "invalid" from the path tracer's point of + * view. + * + * Returns true when all passes are denoised.
Will return false if there is a denoiser error (for + * example, caused by misconfigured denoiser) or when the user requested to cancel rendering. */ + virtual bool denoise_buffer(const BufferParams &buffer_params, + RenderBuffers *render_buffers, + const int num_samples, + bool allow_inplace_modification) = 0; + + /* Get a device which is used to perform actual denoising. + * + * Notes: + * + * - The device is lazily initialized via `load_kernels()`, so it will be nullptr until then, + * + * - The device can be different from the path tracing device. This happens, for example, when + * using OptiX denoiser and rendering on CPU. + * + * - No threading safety is ensured in this call. This means that it is up to the caller to ensure + * that there is no threading conflict between the denoising task lazily initializing the device and + * access to this device happens. */ + Device *get_denoiser_device() const; + + function<bool(void)> is_cancelled_cb; + + bool is_cancelled() const + { + if (!is_cancelled_cb) { + return false; + } + return is_cancelled_cb(); + } + + protected: + Denoiser(Device *path_trace_device, const DenoiseParams &params); + + /* Make sure the denoising device is initialized. */ + virtual Device *ensure_denoiser_device(Progress *progress); + + /* Get device type mask which is used to filter available devices when a new device needs to be + * created. */ + virtual uint get_device_type_mask() const = 0; + + Device *path_trace_device_; + DenoiseParams params_; + + /* Cached pointer to the device on which denoising will happen. + * Used to avoid lookup of a device for every denoising request. */ + Device *denoiser_device_ = nullptr; + + /* Denoiser device which was created to perform denoising in the case none of the rendering + * devices are capable of denoising.
*/ + unique_ptr<Device> local_denoiser_device_; + bool device_creation_attempted_ = false; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/denoiser_device.cpp b/intern/cycles/integrator/denoiser_device.cpp new file mode 100644 index 00000000000..8088cfd7800 --- /dev/null +++ b/intern/cycles/integrator/denoiser_device.cpp @@ -0,0 +1,106 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "integrator/denoiser_device.h" + +#include "device/device.h" +#include "device/device_denoise.h" +#include "device/device_memory.h" +#include "device/device_queue.h" +#include "render/buffers.h" +#include "util/util_logging.h" +#include "util/util_progress.h" + +CCL_NAMESPACE_BEGIN + +DeviceDenoiser::DeviceDenoiser(Device *path_trace_device, const DenoiseParams ¶ms) + : Denoiser(path_trace_device, params) +{ +} + +DeviceDenoiser::~DeviceDenoiser() +{ + /* Explicit implementation, to allow forward declaration of Device in the header. 
*/ +} + +bool DeviceDenoiser::denoise_buffer(const BufferParams &buffer_params, + RenderBuffers *render_buffers, + const int num_samples, + bool allow_inplace_modification) +{ + Device *denoiser_device = get_denoiser_device(); + if (!denoiser_device) { + return false; + } + + DeviceDenoiseTask task; + task.params = params_; + task.num_samples = num_samples; + task.buffer_params = buffer_params; + task.allow_inplace_modification = allow_inplace_modification; + + RenderBuffers local_render_buffers(denoiser_device); + bool local_buffer_used = false; + + if (denoiser_device == render_buffers->buffer.device) { + /* The device can access an existing buffer pointer. */ + local_buffer_used = false; + task.render_buffers = render_buffers; + } + else { + VLOG(3) << "Creating temporary buffer on denoiser device."; + + DeviceQueue *queue = denoiser_device->get_denoise_queue(); + + /* Create a buffer which is accessible by the device used by the denoiser. */ + + /* TODO(sergey): Optimize data transfers. For example, only copy denoising related passes, + * ignoring other light and data passes. */ + + local_buffer_used = true; + + render_buffers->copy_from_device(); + + local_render_buffers.reset(buffer_params); + + /* NOTE: The local buffer is allocated for an exact size of the effective render size, while + * the input render buffer is allocated for the lowest resolution divider possible. So it is + * important to only copy the actually needed part of the input buffer.
*/ + memcpy(local_render_buffers.buffer.data(), + render_buffers->buffer.data(), + sizeof(float) * local_render_buffers.buffer.size()); + + queue->copy_to_device(local_render_buffers.buffer); + + task.render_buffers = &local_render_buffers; + task.allow_inplace_modification = true; + } + + const bool denoise_result = denoiser_device->denoise_buffer(task); + + if (local_buffer_used) { + local_render_buffers.copy_from_device(); + + render_buffers_host_copy_denoised( + render_buffers, buffer_params, &local_render_buffers, local_render_buffers.params); + + render_buffers->copy_to_device(); + } + + return denoise_result; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/denoiser_device.h b/intern/cycles/integrator/denoiser_device.h new file mode 100644 index 00000000000..0fd934dba79 --- /dev/null +++ b/intern/cycles/integrator/denoiser_device.h @@ -0,0 +1,40 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "integrator/denoiser.h" +#include "util/util_unique_ptr.h" + +CCL_NAMESPACE_BEGIN + +/* Denoiser which uses device-specific denoising implementation, such as OptiX denoiser which are + * implemented as a part of a driver of specific device. + * + * This implementation makes sure the to-be-denoised buffer is available on the denoising device + * and invoke denoising kernel via device API. 
*/ +class DeviceDenoiser : public Denoiser { + public: + DeviceDenoiser(Device *path_trace_device, const DenoiseParams ¶ms); + ~DeviceDenoiser(); + + virtual bool denoise_buffer(const BufferParams &buffer_params, + RenderBuffers *render_buffers, + const int num_samples, + bool allow_inplace_modification) override; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/denoiser_oidn.cpp b/intern/cycles/integrator/denoiser_oidn.cpp new file mode 100644 index 00000000000..1b5a012ec87 --- /dev/null +++ b/intern/cycles/integrator/denoiser_oidn.cpp @@ -0,0 +1,628 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "integrator/denoiser_oidn.h" + +#include <array> + +#include "device/device.h" +#include "device/device_queue.h" +#include "integrator/pass_accessor_cpu.h" +#include "render/buffers.h" +#include "util/util_array.h" +#include "util/util_logging.h" +#include "util/util_openimagedenoise.h" + +#include "kernel/device/cpu/compat.h" +#include "kernel/device/cpu/kernel.h" + +CCL_NAMESPACE_BEGIN + +thread_mutex OIDNDenoiser::mutex_; + +OIDNDenoiser::OIDNDenoiser(Device *path_trace_device, const DenoiseParams ¶ms) + : Denoiser(path_trace_device, params) +{ + DCHECK_EQ(params.type, DENOISER_OPENIMAGEDENOISE); + + DCHECK(openimagedenoise_supported()) << "OpenImageDenoiser is not supported on this platform."; +} + +#ifdef WITH_OPENIMAGEDENOISE +static bool oidn_progress_monitor_function(void *user_ptr, double /*n*/) +{ + OIDNDenoiser *oidn_denoiser = reinterpret_cast<OIDNDenoiser *>(user_ptr); + return !oidn_denoiser->is_cancelled(); +} +#endif + +#ifdef WITH_OPENIMAGEDENOISE + +class OIDNPass { + public: + OIDNPass() = default; + + OIDNPass(const BufferParams &buffer_params, + const char *name, + PassType type, + PassMode mode = PassMode::NOISY) + : name(name), type(type), mode(mode) + { + offset = buffer_params.get_pass_offset(type, mode); + need_scale = (type == PASS_DENOISING_ALBEDO || type == PASS_DENOISING_NORMAL); + + const PassInfo pass_info = Pass::get_info(type); + num_components = pass_info.num_components; + use_compositing = pass_info.use_compositing; + use_denoising_albedo = pass_info.use_denoising_albedo; + } + + inline operator bool() const + { + return name[0] != '\0'; + } + + /* Name of an image which will be passed to the OIDN library. + * Should be one of the following: color, albedo, normal, output. + * The albedo and normal images are optional. 
*/ + const char *name = ""; + + PassType type = PASS_NONE; + PassMode mode = PassMode::NOISY; + int num_components = -1; + bool use_compositing = false; + bool use_denoising_albedo = true; + + /* Offset of beginning of this pass in the render buffers. */ + int offset = -1; + + /* Denotes whether the data is to be scaled down with the number of passes. + * Is required for albedo and normal passes. The color pass OIDN will perform auto-exposure, so + * scaling is not needed for the color pass unless adaptive sampling is used. + * + * NOTE: Do not scale the outout pass, as that requires to be a pointer in the original buffer. + * All the scaling on the output needed for integration with adaptive sampling will happen + * outside of generic pass handling. */ + bool need_scale = false; + + /* The content of the pass has been pre-filtered. */ + bool is_filtered = false; + + /* For the scaled passes, the data which holds values of scaled pixels. */ + array<float> scaled_buffer; +}; + +class OIDNDenoiseContext { + public: + OIDNDenoiseContext(OIDNDenoiser *denoiser, + const DenoiseParams &denoise_params, + const BufferParams &buffer_params, + RenderBuffers *render_buffers, + const int num_samples, + const bool allow_inplace_modification) + : denoiser_(denoiser), + denoise_params_(denoise_params), + buffer_params_(buffer_params), + render_buffers_(render_buffers), + num_samples_(num_samples), + allow_inplace_modification_(allow_inplace_modification), + pass_sample_count_(buffer_params_.get_pass_offset(PASS_SAMPLE_COUNT)) + { + if (denoise_params_.use_pass_albedo) { + oidn_albedo_pass_ = OIDNPass(buffer_params_, "albedo", PASS_DENOISING_ALBEDO); + } + + if (denoise_params_.use_pass_normal) { + oidn_normal_pass_ = OIDNPass(buffer_params_, "normal", PASS_DENOISING_NORMAL); + } + } + + bool need_denoising() const + { + if (buffer_params_.width == 0 && buffer_params_.height == 0) { + return false; + } + + return true; + } + + /* Make the guiding passes available by a sequential 
denoising of various passes. */ + void read_guiding_passes() + { + read_guiding_pass(oidn_albedo_pass_); + read_guiding_pass(oidn_normal_pass_); + } + + void denoise_pass(const PassType pass_type) + { + OIDNPass oidn_color_pass(buffer_params_, "color", pass_type); + if (oidn_color_pass.offset == PASS_UNUSED) { + return; + } + + if (oidn_color_pass.use_denoising_albedo) { + if (albedo_replaced_with_fake_) { + LOG(ERROR) << "Pass which requires albedo is denoised after fake albedo has been set."; + return; + } + } + + OIDNPass oidn_output_pass(buffer_params_, "output", pass_type, PassMode::DENOISED); + if (oidn_output_pass.offset == PASS_UNUSED) { + LOG(DFATAL) << "Missing denoised pass " << pass_type_as_string(pass_type); + return; + } + + OIDNPass oidn_color_access_pass = read_input_pass(oidn_color_pass, oidn_output_pass); + + oidn::DeviceRef oidn_device = oidn::newDevice(); + oidn_device.commit(); + + /* Create a filter for denoising a beauty (color) image using prefiltered auxiliary images too. + */ + oidn::FilterRef oidn_filter = oidn_device.newFilter("RT"); + set_input_pass(oidn_filter, oidn_color_access_pass); + set_guiding_passes(oidn_filter, oidn_color_pass); + set_output_pass(oidn_filter, oidn_output_pass); + oidn_filter.setProgressMonitorFunction(oidn_progress_monitor_function, denoiser_); + oidn_filter.set("hdr", true); + oidn_filter.set("srgb", false); + if (denoise_params_.prefilter == DENOISER_PREFILTER_NONE || + denoise_params_.prefilter == DENOISER_PREFILTER_ACCURATE) { + oidn_filter.set("cleanAux", true); + } + oidn_filter.commit(); + + filter_guiding_pass_if_needed(oidn_device, oidn_albedo_pass_); + filter_guiding_pass_if_needed(oidn_device, oidn_normal_pass_); + + /* Filter the beauty image. */ + oidn_filter.execute(); + + /* Check for errors. 
*/ + const char *error_message; + const oidn::Error error = oidn_device.getError(error_message); + if (error != oidn::Error::None && error != oidn::Error::Cancelled) { + LOG(ERROR) << "OpenImageDenoise error: " << error_message; + } + + postprocess_output(oidn_color_pass, oidn_output_pass); + } + + protected: + void filter_guiding_pass_if_needed(oidn::DeviceRef &oidn_device, OIDNPass &oidn_pass) + { + if (denoise_params_.prefilter != DENOISER_PREFILTER_ACCURATE || !oidn_pass || + oidn_pass.is_filtered) { + return; + } + + oidn::FilterRef oidn_filter = oidn_device.newFilter("RT"); + set_pass(oidn_filter, oidn_pass); + set_output_pass(oidn_filter, oidn_pass); + oidn_filter.commit(); + oidn_filter.execute(); + + oidn_pass.is_filtered = true; + } + + /* Make pixels of a guiding pass available by the denoiser. */ + void read_guiding_pass(OIDNPass &oidn_pass) + { + if (!oidn_pass) { + return; + } + + DCHECK(!oidn_pass.use_compositing); + + if (denoise_params_.prefilter != DENOISER_PREFILTER_ACCURATE && + !is_pass_scale_needed(oidn_pass)) { + /* Pass data is available as-is from the render buffers. */ + return; + } + + if (allow_inplace_modification_) { + scale_pass_in_render_buffers(oidn_pass); + return; + } + + read_pass_pixels_into_buffer(oidn_pass); + } + + /* Special reader of the input pass. + * To save memory it will read pixels into the output, and let the denoiser to perform an + * in-place operation. */ + OIDNPass read_input_pass(OIDNPass &oidn_input_pass, const OIDNPass &oidn_output_pass) + { + const bool use_compositing = oidn_input_pass.use_compositing; + + /* Simple case: no compositing is involved, no scaling is needed. + * The pass pixels will be referenced as-is, without extra processing. 
*/ + if (!use_compositing && !is_pass_scale_needed(oidn_input_pass)) { + return oidn_input_pass; + } + + float *buffer_data = render_buffers_->buffer.data(); + float *pass_data = buffer_data + oidn_output_pass.offset; + + PassAccessor::Destination destination(pass_data, 3); + destination.pixel_stride = buffer_params_.pass_stride; + + read_pass_pixels(oidn_input_pass, destination); + + OIDNPass oidn_input_pass_at_output = oidn_input_pass; + oidn_input_pass_at_output.offset = oidn_output_pass.offset; + + return oidn_input_pass_at_output; + } + + /* Read pass pixels using PassAccessor into the given destination. */ + void read_pass_pixels(const OIDNPass &oidn_pass, const PassAccessor::Destination &destination) + { + PassAccessor::PassAccessInfo pass_access_info; + pass_access_info.type = oidn_pass.type; + pass_access_info.mode = oidn_pass.mode; + pass_access_info.offset = oidn_pass.offset; + + /* Denoiser operates on passes which are used to calculate the approximation, and is never used + * on the approximation. The latter is not even possible because OIDN does not support + * denoising of semi-transparent pixels. */ + pass_access_info.use_approximate_shadow_catcher = false; + pass_access_info.use_approximate_shadow_catcher_background = false; + pass_access_info.show_active_pixels = false; + + /* OIDN will perform an auto-exposure, so it is not required to know the exact exposure configured + * by users. What is important is to use the same exposure for read and write access of the pass + * pixels. */ + const PassAccessorCPU pass_accessor(pass_access_info, 1.0f, num_samples_); + + pass_accessor.get_render_tile_pixels(render_buffers_, buffer_params_, destination); + } + + /* Read pass pixels using PassAccessor into a temporary buffer which is owned by the pass.
*/ + void read_pass_pixels_into_buffer(OIDNPass &oidn_pass) + { + VLOG(3) << "Allocating temporary buffer for pass " << oidn_pass.name << " (" + << pass_type_as_string(oidn_pass.type) << ")"; + + const int64_t width = buffer_params_.width; + const int64_t height = buffer_params_.height; + + array<float> &scaled_buffer = oidn_pass.scaled_buffer; + scaled_buffer.resize(width * height * 3); + + const PassAccessor::Destination destination(scaled_buffer.data(), 3); + + read_pass_pixels(oidn_pass, destination); + } + + /* Set OIDN image to reference pixels from the given render buffer pass. + * No transform to the pixels is done, no additional memory is used. */ + void set_pass_referenced(oidn::FilterRef &oidn_filter, + const char *name, + const OIDNPass &oidn_pass) + { + const int64_t x = buffer_params_.full_x; + const int64_t y = buffer_params_.full_y; + const int64_t width = buffer_params_.width; + const int64_t height = buffer_params_.height; + const int64_t offset = buffer_params_.offset; + const int64_t stride = buffer_params_.stride; + const int64_t pass_stride = buffer_params_.pass_stride; + + const int64_t pixel_index = offset + x + y * stride; + const int64_t buffer_offset = pixel_index * pass_stride; + + float *buffer_data = render_buffers_->buffer.data(); + + oidn_filter.setImage(name, + buffer_data + buffer_offset + oidn_pass.offset, + oidn::Format::Float3, + width, + height, + 0, + pass_stride * sizeof(float), + stride * pass_stride * sizeof(float)); + } + + void set_pass_from_buffer(oidn::FilterRef &oidn_filter, const char *name, OIDNPass &oidn_pass) + { + const int64_t width = buffer_params_.width; + const int64_t height = buffer_params_.height; + + oidn_filter.setImage( + name, oidn_pass.scaled_buffer.data(), oidn::Format::Float3, width, height, 0, 0, 0); + } + + void set_pass(oidn::FilterRef &oidn_filter, OIDNPass &oidn_pass) + { + set_pass(oidn_filter, oidn_pass.name, oidn_pass); + } + void set_pass(oidn::FilterRef &oidn_filter, const char *name, 
OIDNPass &oidn_pass) + { + if (oidn_pass.scaled_buffer.empty()) { + set_pass_referenced(oidn_filter, name, oidn_pass); + } + else { + set_pass_from_buffer(oidn_filter, name, oidn_pass); + } + } + + void set_input_pass(oidn::FilterRef &oidn_filter, OIDNPass &oidn_pass) + { + set_pass_referenced(oidn_filter, oidn_pass.name, oidn_pass); + } + + void set_guiding_passes(oidn::FilterRef &oidn_filter, OIDNPass &oidn_pass) + { + if (oidn_albedo_pass_) { + if (oidn_pass.use_denoising_albedo) { + set_pass(oidn_filter, oidn_albedo_pass_); + } + else { + /* NOTE: OpenImageDenoise library implicitly expects albedo pass when normal pass has been + * provided. */ + set_fake_albedo_pass(oidn_filter); + } + } + + if (oidn_normal_pass_) { + set_pass(oidn_filter, oidn_normal_pass_); + } + } + + void set_fake_albedo_pass(oidn::FilterRef &oidn_filter) + { + const int64_t width = buffer_params_.width; + const int64_t height = buffer_params_.height; + + if (!albedo_replaced_with_fake_) { + const int64_t num_pixel_components = width * height * 3; + oidn_albedo_pass_.scaled_buffer.resize(num_pixel_components); + + for (int i = 0; i < num_pixel_components; ++i) { + oidn_albedo_pass_.scaled_buffer[i] = 0.5f; + } + + albedo_replaced_with_fake_ = true; + } + + set_pass(oidn_filter, oidn_albedo_pass_); + } + + void set_output_pass(oidn::FilterRef &oidn_filter, OIDNPass &oidn_pass) + { + set_pass(oidn_filter, "output", oidn_pass); + } + + /* Scale output pass to match adaptive sampling per-pixel scale, as well as bring alpha channel + * back. 
*/ + void postprocess_output(const OIDNPass &oidn_input_pass, const OIDNPass &oidn_output_pass) + { + kernel_assert(oidn_input_pass.num_components == oidn_output_pass.num_components); + + const int64_t x = buffer_params_.full_x; + const int64_t y = buffer_params_.full_y; + const int64_t width = buffer_params_.width; + const int64_t height = buffer_params_.height; + const int64_t offset = buffer_params_.offset; + const int64_t stride = buffer_params_.stride; + const int64_t pass_stride = buffer_params_.pass_stride; + const int64_t row_stride = stride * pass_stride; + + const int64_t pixel_offset = offset + x + y * stride; + const int64_t buffer_offset = (pixel_offset * pass_stride); + + float *buffer_data = render_buffers_->buffer.data(); + + const bool has_pass_sample_count = (pass_sample_count_ != PASS_UNUSED); + const bool need_scale = has_pass_sample_count || oidn_input_pass.use_compositing; + + for (int y = 0; y < height; ++y) { + float *buffer_row = buffer_data + buffer_offset + y * row_stride; + for (int x = 0; x < width; ++x) { + float *buffer_pixel = buffer_row + x * pass_stride; + float *denoised_pixel = buffer_pixel + oidn_output_pass.offset; + + if (need_scale) { + const float pixel_scale = has_pass_sample_count ? + __float_as_uint(buffer_pixel[pass_sample_count_]) : + num_samples_; + + denoised_pixel[0] = denoised_pixel[0] * pixel_scale; + denoised_pixel[1] = denoised_pixel[1] * pixel_scale; + denoised_pixel[2] = denoised_pixel[2] * pixel_scale; + } + + if (oidn_output_pass.num_components == 3) { + /* Pass without alpha channel. */ + } + else if (!oidn_input_pass.use_compositing) { + /* Currently compositing passes are either 3-component (derived by dividing light passes) + * or do not have transparency (shadow catcher). Implicitly rely on this logic, as it + * simplifies logic and avoids extra memory allocation. 
*/ + const float *noisy_pixel = buffer_pixel + oidn_input_pass.offset; + denoised_pixel[3] = noisy_pixel[3]; + } + else { + /* Assigning to zero since this is a default alpha value for 3-component passes, and it + * is an opaque pixel for 4 component passes. */ + denoised_pixel[3] = 0; + } + } + } + } + + bool is_pass_scale_needed(OIDNPass &oidn_pass) const + { + if (pass_sample_count_ != PASS_UNUSED) { + /* With adaptive sampling pixels will have different number of samples in them, so need to + * always scale the pass to make pixels uniformly sampled. */ + return true; + } + + if (!oidn_pass.need_scale) { + return false; + } + + if (num_samples_ == 1) { + /* If the avoid scaling if there is only one sample, to save up time (so we dont divide + * buffer by 1). */ + return false; + } + + return true; + } + + void scale_pass_in_render_buffers(OIDNPass &oidn_pass) + { + const int64_t x = buffer_params_.full_x; + const int64_t y = buffer_params_.full_y; + const int64_t width = buffer_params_.width; + const int64_t height = buffer_params_.height; + const int64_t offset = buffer_params_.offset; + const int64_t stride = buffer_params_.stride; + const int64_t pass_stride = buffer_params_.pass_stride; + const int64_t row_stride = stride * pass_stride; + + const int64_t pixel_offset = offset + x + y * stride; + const int64_t buffer_offset = (pixel_offset * pass_stride); + + float *buffer_data = render_buffers_->buffer.data(); + + const bool has_pass_sample_count = (pass_sample_count_ != PASS_UNUSED); + + for (int y = 0; y < height; ++y) { + float *buffer_row = buffer_data + buffer_offset + y * row_stride; + for (int x = 0; x < width; ++x) { + float *buffer_pixel = buffer_row + x * pass_stride; + float *pass_pixel = buffer_pixel + oidn_pass.offset; + + const float pixel_scale = 1.0f / (has_pass_sample_count ? 
+ __float_as_uint(buffer_pixel[pass_sample_count_]) : + num_samples_); + + pass_pixel[0] = pass_pixel[0] * pixel_scale; + pass_pixel[1] = pass_pixel[1] * pixel_scale; + pass_pixel[2] = pass_pixel[2] * pixel_scale; + } + } + } + + OIDNDenoiser *denoiser_ = nullptr; + + const DenoiseParams &denoise_params_; + const BufferParams &buffer_params_; + RenderBuffers *render_buffers_ = nullptr; + int num_samples_ = 0; + bool allow_inplace_modification_ = false; + int pass_sample_count_ = PASS_UNUSED; + + /* Optional albedo and normal passes, reused by denoising of different pass types. */ + OIDNPass oidn_albedo_pass_; + OIDNPass oidn_normal_pass_; + + /* For passes which don't need albedo channel for denoising we replace the actual albedo with + * the (0.5, 0.5, 0.5). This flag indicates that the real albedo pass has been replaced with + * the fake values and denoising of passes which do need albedo can no longer happen. */ + bool albedo_replaced_with_fake_ = false; +}; +#endif + +static unique_ptr<DeviceQueue> create_device_queue(const RenderBuffers *render_buffers) +{ + Device *device = render_buffers->buffer.device; + if (device->info.has_gpu_queue) { + return device->gpu_queue_create(); + } + return nullptr; +} + +static void copy_render_buffers_from_device(unique_ptr<DeviceQueue> &queue, + RenderBuffers *render_buffers) +{ + if (queue) { + queue->copy_from_device(render_buffers->buffer); + queue->synchronize(); + } + else { + render_buffers->copy_from_device(); + } +} + +static void copy_render_buffers_to_device(unique_ptr<DeviceQueue> &queue, + RenderBuffers *render_buffers) +{ + if (queue) { + queue->copy_to_device(render_buffers->buffer); + queue->synchronize(); + } + else { + render_buffers->copy_to_device(); + } +} + +bool OIDNDenoiser::denoise_buffer(const BufferParams &buffer_params, + RenderBuffers *render_buffers, + const int num_samples, + bool allow_inplace_modification) +{ + thread_scoped_lock lock(mutex_); + + /* Make sure the host-side data is available 
for denoising. */ + unique_ptr<DeviceQueue> queue = create_device_queue(render_buffers); + copy_render_buffers_from_device(queue, render_buffers); + +#ifdef WITH_OPENIMAGEDENOISE + OIDNDenoiseContext context( + this, params_, buffer_params, render_buffers, num_samples, allow_inplace_modification); + + if (context.need_denoising()) { + context.read_guiding_passes(); + + const std::array<PassType, 3> passes = { + {/* Passes which will use real albedo when it is available. */ + PASS_COMBINED, + PASS_SHADOW_CATCHER_MATTE, + + /* Passes which do not need albedo and hence if real is present it needs to become fake. + */ + PASS_SHADOW_CATCHER}}; + + for (const PassType pass_type : passes) { + context.denoise_pass(pass_type); + if (is_cancelled()) { + return false; + } + } + + /* TODO: It may be possible to avoid this copy, but we have to ensure that when other code + * copies data from the device it doesn't overwrite the denoiser buffers. */ + copy_render_buffers_to_device(queue, render_buffers); + } +#endif + + /* This code is not supposed to run when compiled without OIDN support, so can assume if we made + * it up here all passes are properly denoised. */ + return true; +} + +uint OIDNDenoiser::get_device_type_mask() const +{ + return DEVICE_MASK_CPU; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/denoiser_oidn.h b/intern/cycles/integrator/denoiser_oidn.h new file mode 100644 index 00000000000..566e761ae79 --- /dev/null +++ b/intern/cycles/integrator/denoiser_oidn.h @@ -0,0 +1,47 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "integrator/denoiser.h" +#include "util/util_thread.h" +#include "util/util_unique_ptr.h" + +CCL_NAMESPACE_BEGIN + +/* Implementation of denoising API which uses OpenImageDenoise library. */ +class OIDNDenoiser : public Denoiser { + public: + /* Forwardly declared state which might be using compile-flag specific fields, such as + * OpenImageDenoise device and filter handles. */ + class State; + + OIDNDenoiser(Device *path_trace_device, const DenoiseParams ¶ms); + + virtual bool denoise_buffer(const BufferParams &buffer_params, + RenderBuffers *render_buffers, + const int num_samples, + bool allow_inplace_modification) override; + + protected: + virtual uint get_device_type_mask() const override; + + /* We only perform one denoising at a time, since OpenImageDenoise itself is multithreaded. + * Use this mutex whenever images are passed to the OIDN and needs to be denoised. */ + static thread_mutex mutex_; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl b/intern/cycles/integrator/denoiser_optix.cpp index ed64ae01aae..5f9de23bfe6 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl +++ b/intern/cycles/integrator/denoiser_optix.cpp @@ -1,5 +1,5 @@ /* - * Copyright 2011-2015 Blender Foundation + * Copyright 2011-2021 Blender Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,13 +14,21 @@ * limitations under the License. 
*/ -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_direct_lighting.h" +#include "integrator/denoiser_optix.h" -#define KERNEL_NAME direct_lighting -#define LOCALS_TYPE unsigned int -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME -#undef LOCALS_TYPE +#include "device/device.h" +#include "device/device_denoise.h" +CCL_NAMESPACE_BEGIN + +OptiXDenoiser::OptiXDenoiser(Device *path_trace_device, const DenoiseParams ¶ms) + : DeviceDenoiser(path_trace_device, params) +{ +} + +uint OptiXDenoiser::get_device_type_mask() const +{ + return DEVICE_MASK_OPTIX; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl b/intern/cycles/integrator/denoiser_optix.h index c314dc96c33..a8df770ecf7 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl +++ b/intern/cycles/integrator/denoiser_optix.h @@ -1,5 +1,5 @@ /* - * Copyright 2011-2015 Blender Foundation + * Copyright 2011-2021 Blender Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,11 +14,18 @@ * limitations under the License. 
*/ -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_lamp_emission.h" +#pragma once -#define KERNEL_NAME lamp_emission -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME +#include "integrator/denoiser_device.h" +CCL_NAMESPACE_BEGIN + +class OptiXDenoiser : public DeviceDenoiser { + public: + OptiXDenoiser(Device *path_trace_device, const DenoiseParams ¶ms); + + protected: + virtual uint get_device_type_mask() const override; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/pass_accessor.cpp b/intern/cycles/integrator/pass_accessor.cpp new file mode 100644 index 00000000000..87c048b1fa5 --- /dev/null +++ b/intern/cycles/integrator/pass_accessor.cpp @@ -0,0 +1,318 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "integrator/pass_accessor.h" + +#include "render/buffers.h" +#include "util/util_logging.h" + +// clang-format off +#include "kernel/device/cpu/compat.h" +#include "kernel/kernel_types.h" +// clang-format on + +CCL_NAMESPACE_BEGIN + +/* -------------------------------------------------------------------- + * Pass input information. 
+ */ + +PassAccessor::PassAccessInfo::PassAccessInfo(const BufferPass &pass) + : type(pass.type), mode(pass.mode), include_albedo(pass.include_albedo), offset(pass.offset) +{ +} + +/* -------------------------------------------------------------------- + * Pass destination. + */ + +PassAccessor::Destination::Destination(float *pixels, int num_components) + : pixels(pixels), num_components(num_components) +{ +} + +PassAccessor::Destination::Destination(const PassType pass_type, half4 *pixels) + : Destination(pass_type) +{ + pixels_half_rgba = pixels; +} + +PassAccessor::Destination::Destination(const PassType pass_type) +{ + const PassInfo pass_info = Pass::get_info(pass_type); + num_components = pass_info.num_components; +} + +/* -------------------------------------------------------------------- + * Pass source. + */ + +PassAccessor::Source::Source(const float *pixels, int num_components) + : pixels(pixels), num_components(num_components) +{ +} + +/* -------------------------------------------------------------------- + * Pass accessor. + */ + +PassAccessor::PassAccessor(const PassAccessInfo &pass_access_info, float exposure, int num_samples) + : pass_access_info_(pass_access_info), exposure_(exposure), num_samples_(num_samples) +{ +} + +bool PassAccessor::get_render_tile_pixels(const RenderBuffers *render_buffers, + const Destination &destination) const +{ + if (render_buffers == nullptr || render_buffers->buffer.data() == nullptr) { + return false; + } + + return get_render_tile_pixels(render_buffers, render_buffers->params, destination); +} + +static void pad_pixels(const BufferParams &buffer_params, + const PassAccessor::Destination &destination, + const int src_num_components) +{ + /* When requesting a single channel pass as RGBA, or RGB pass as RGBA, + * fill in the additional components for convenience. 
*/ + const int dest_num_components = destination.num_components; + + if (src_num_components >= dest_num_components) { + return; + } + + const size_t size = buffer_params.width * buffer_params.height; + if (destination.pixels) { + float *pixel = destination.pixels; + + for (size_t i = 0; i < size; i++, pixel += dest_num_components) { + if (dest_num_components >= 3 && src_num_components == 1) { + pixel[1] = pixel[0]; + pixel[2] = pixel[0]; + } + if (dest_num_components >= 4) { + pixel[3] = 1.0f; + } + } + } + + if (destination.pixels_half_rgba) { + const half one = float_to_half(1.0f); + half4 *pixel = destination.pixels_half_rgba; + + for (size_t i = 0; i < size; i++, pixel++) { + if (dest_num_components >= 3 && src_num_components == 1) { + pixel[0].y = pixel[0].x; + pixel[0].z = pixel[0].x; + } + if (dest_num_components >= 4) { + pixel[0].w = one; + } + } + } +} + +bool PassAccessor::get_render_tile_pixels(const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination) const +{ + if (render_buffers == nullptr || render_buffers->buffer.data() == nullptr) { + return false; + } + + if (pass_access_info_.offset == PASS_UNUSED) { + return false; + } + + const PassType type = pass_access_info_.type; + const PassMode mode = pass_access_info_.mode; + const PassInfo pass_info = Pass::get_info(type, pass_access_info_.include_albedo); + + if (pass_info.num_components == 1) { + /* Single channel passes. */ + if (mode == PassMode::DENOISED) { + /* Denoised passes store their final pixels, no need in special calculation. */ + get_pass_float(render_buffers, buffer_params, destination); + } + else if (type == PASS_RENDER_TIME) { + /* TODO(sergey): Needs implementation. 
*/ + } + else if (type == PASS_DEPTH) { + get_pass_depth(render_buffers, buffer_params, destination); + } + else if (type == PASS_MIST) { + get_pass_mist(render_buffers, buffer_params, destination); + } + else if (type == PASS_SAMPLE_COUNT) { + get_pass_sample_count(render_buffers, buffer_params, destination); + } + else { + get_pass_float(render_buffers, buffer_params, destination); + } + } + else if (type == PASS_MOTION) { + /* Motion pass. */ + DCHECK_EQ(destination.num_components, 4) << "Motion pass must have 4 components"; + get_pass_motion(render_buffers, buffer_params, destination); + } + else if (type == PASS_CRYPTOMATTE) { + /* Cryptomatte pass. */ + DCHECK_EQ(destination.num_components, 4) << "Cryptomatte pass must have 4 components"; + get_pass_cryptomatte(render_buffers, buffer_params, destination); + } + else { + /* RGB, RGBA and vector passes. */ + DCHECK(destination.num_components == 3 || destination.num_components == 4) + << pass_type_as_string(type) << " pass must have 3 or 4 components"; + + if (type == PASS_SHADOW_CATCHER_MATTE && pass_access_info_.use_approximate_shadow_catcher) { + /* Denoised matte with shadow needs to do calculation (will use denoised shadow catcher pass + * to approximate shadow with). */ + get_pass_shadow_catcher_matte_with_shadow(render_buffers, buffer_params, destination); + } + else if (type == PASS_SHADOW_CATCHER && mode != PassMode::DENOISED) { + /* Shadow catcher pass. */ + get_pass_shadow_catcher(render_buffers, buffer_params, destination); + } + else if ((pass_info.divide_type != PASS_NONE || pass_info.direct_type != PASS_NONE || + pass_info.indirect_type != PASS_NONE) && + mode != PassMode::DENOISED) { + /* RGB lighting passes that need to divide out color and/or sum direct and indirect. */ + get_pass_light_path(render_buffers, buffer_params, destination); + } + else { + /* Passes that need no special computation, or denoised passes that already + * had the computation done. 
*/ + if (pass_info.num_components == 3) { + get_pass_float3(render_buffers, buffer_params, destination); + } + else if (pass_info.num_components == 4) { + if (destination.num_components == 3) { + /* Special case for denoiser access of RGBA passes ignoring alpha channel. */ + get_pass_float3(render_buffers, buffer_params, destination); + } + else if (type == PASS_COMBINED || type == PASS_SHADOW_CATCHER || + type == PASS_SHADOW_CATCHER_MATTE) { + /* Passes with transparency as 4th component. */ + get_pass_combined(render_buffers, buffer_params, destination); + } + else { + /* Passes with alpha as 4th component. */ + get_pass_float4(render_buffers, buffer_params, destination); + } + } + } + } + + pad_pixels(buffer_params, destination, pass_info.num_components); + + return true; +} + +void PassAccessor::init_kernel_film_convert(KernelFilmConvert *kfilm_convert, + const BufferParams &buffer_params, + const Destination &destination) const +{ + const PassMode mode = pass_access_info_.mode; + const PassInfo &pass_info = Pass::get_info(pass_access_info_.type, + pass_access_info_.include_albedo); + + kfilm_convert->pass_offset = pass_access_info_.offset; + kfilm_convert->pass_stride = buffer_params.pass_stride; + + kfilm_convert->pass_use_exposure = pass_info.use_exposure; + kfilm_convert->pass_use_filter = pass_info.use_filter; + + /* TODO(sergey): Some of the passes needs to become denoised when denoised pass is accessed. 
*/ + if (pass_info.direct_type != PASS_NONE) { + kfilm_convert->pass_offset = buffer_params.get_pass_offset(pass_info.direct_type); + } + kfilm_convert->pass_indirect = buffer_params.get_pass_offset(pass_info.indirect_type); + kfilm_convert->pass_divide = buffer_params.get_pass_offset(pass_info.divide_type); + + kfilm_convert->pass_combined = buffer_params.get_pass_offset(PASS_COMBINED); + kfilm_convert->pass_sample_count = buffer_params.get_pass_offset(PASS_SAMPLE_COUNT); + kfilm_convert->pass_adaptive_aux_buffer = buffer_params.get_pass_offset( + PASS_ADAPTIVE_AUX_BUFFER); + kfilm_convert->pass_motion_weight = buffer_params.get_pass_offset(PASS_MOTION_WEIGHT); + kfilm_convert->pass_shadow_catcher = buffer_params.get_pass_offset(PASS_SHADOW_CATCHER, mode); + kfilm_convert->pass_shadow_catcher_sample_count = buffer_params.get_pass_offset( + PASS_SHADOW_CATCHER_SAMPLE_COUNT); + kfilm_convert->pass_shadow_catcher_matte = buffer_params.get_pass_offset( + PASS_SHADOW_CATCHER_MATTE, mode); + + /* Background is not denoised, so always use noisy pass. */ + kfilm_convert->pass_background = buffer_params.get_pass_offset(PASS_BACKGROUND); + + if (pass_info.use_filter) { + kfilm_convert->scale = num_samples_ != 0 ? 1.0f / num_samples_ : 0.0f; + } + else { + kfilm_convert->scale = 1.0f; + } + + if (pass_info.use_exposure) { + kfilm_convert->exposure = exposure_; + } + else { + kfilm_convert->exposure = 1.0f; + } + + kfilm_convert->scale_exposure = kfilm_convert->scale * kfilm_convert->exposure; + + kfilm_convert->use_approximate_shadow_catcher = pass_access_info_.use_approximate_shadow_catcher; + kfilm_convert->use_approximate_shadow_catcher_background = + pass_access_info_.use_approximate_shadow_catcher_background; + kfilm_convert->show_active_pixels = pass_access_info_.show_active_pixels; + + kfilm_convert->num_components = destination.num_components; + kfilm_convert->pixel_stride = destination.pixel_stride ? 
destination.pixel_stride : + destination.num_components; + + kfilm_convert->is_denoised = (mode == PassMode::DENOISED); +} + +bool PassAccessor::set_render_tile_pixels(RenderBuffers *render_buffers, const Source &source) +{ + if (render_buffers == nullptr || render_buffers->buffer.data() == nullptr) { + return false; + } + + const PassInfo pass_info = Pass::get_info(pass_access_info_.type, + pass_access_info_.include_albedo); + + const BufferParams &buffer_params = render_buffers->params; + + float *buffer_data = render_buffers->buffer.data(); + const int size = buffer_params.width * buffer_params.height; + + const int out_stride = buffer_params.pass_stride; + const int in_stride = source.num_components; + const int num_components_to_copy = min(source.num_components, pass_info.num_components); + + float *out = buffer_data + pass_access_info_.offset; + const float *in = source.pixels + source.offset * in_stride; + + for (int i = 0; i < size; i++, out += out_stride, in += in_stride) { + memcpy(out, in, sizeof(float) * num_components_to_copy); + } + + return true; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/pass_accessor.h b/intern/cycles/integrator/pass_accessor.h new file mode 100644 index 00000000000..624bf7d0b2c --- /dev/null +++ b/intern/cycles/integrator/pass_accessor.h @@ -0,0 +1,160 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "render/pass.h" +#include "util/util_half.h" +#include "util/util_string.h" +#include "util/util_types.h" + +CCL_NAMESPACE_BEGIN + +class RenderBuffers; +class BufferPass; +class BufferParams; +struct KernelFilmConvert; + +/* Helper class which allows to access pass data. + * Is designed in a way that it is created once when the pass data is known, and then pixels gets + * progressively update from various render buffers. */ +class PassAccessor { + public: + class PassAccessInfo { + public: + PassAccessInfo() = default; + explicit PassAccessInfo(const BufferPass &pass); + + PassType type = PASS_NONE; + PassMode mode = PassMode::NOISY; + bool include_albedo = false; + int offset = -1; + + /* For the shadow catcher matte pass: whether to approximate shadow catcher pass into its + * matte pass, so that both artificial objects and shadows can be alpha-overed onto a backdrop. + */ + bool use_approximate_shadow_catcher = false; + + /* When approximate shadow catcher matte is used alpha-over the result on top of background. */ + bool use_approximate_shadow_catcher_background = false; + + bool show_active_pixels = false; + }; + + class Destination { + public: + Destination() = default; + Destination(float *pixels, int num_components); + Destination(const PassType pass_type, half4 *pixels); + + /* Destination will be initialized with the number of components which is native for the given + * pass type. */ + explicit Destination(const PassType pass_type); + + /* CPU-side pointers. only usable by the `PassAccessorCPU`. */ + float *pixels = nullptr; + half4 *pixels_half_rgba = nullptr; + + /* Device-side pointers. */ + device_ptr d_pixels = 0; + device_ptr d_pixels_half_rgba = 0; + + /* Number of components per pixel in the floating-point destination. + * Is ignored for half4 destination (where number of components is implied to be 4). */ + int num_components = 0; + + /* Offset in pixels from the beginning of pixels storage. 
+ * Allows to get pixels of render buffer into a partial slice of the destination. */ + int offset = 0; + + /* Number of floats per pixel. When zero is the same as `num_components`. + * + * NOTE: Is ignored for half4 destination, as the half4 pixels are always 4-component + * half-floats. */ + int pixel_stride = 0; + + /* Row stride in pixel elements: + * - For the float destination stride is a number of floats per row. + * - For the half4 destination stride is a number of half4 per row. */ + int stride = 0; + }; + + class Source { + public: + Source() = default; + Source(const float *pixels, int num_components); + + /* CPU-side pointers. only usable by the `PassAccessorCPU`. */ + const float *pixels = nullptr; + int num_components = 0; + + /* Offset in pixels from the beginning of pixels storage. + * Allows to get pixels of render buffer into a partial slice of the destination. */ + int offset = 0; + }; + + PassAccessor(const PassAccessInfo &pass_access_info, float exposure, int num_samples); + + virtual ~PassAccessor() = default; + + /* Get pass data from the given render buffers, perform needed filtering, and store result into + * the pixels. + * The result is stored sequentially starting from the very beginning of the pixels memory. */ + bool get_render_tile_pixels(const RenderBuffers *render_buffers, + const Destination &destination) const; + bool get_render_tile_pixels(const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination) const; + /* Set pass data for the given render buffers. Used for baking to read from passes. 
*/ + bool set_render_tile_pixels(RenderBuffers *render_buffers, const Source &source); + + protected: + virtual void init_kernel_film_convert(KernelFilmConvert *kfilm_convert, + const BufferParams &buffer_params, + const Destination &destination) const; + +#define DECLARE_PASS_ACCESSOR(pass) \ + virtual void get_pass_##pass(const RenderBuffers *render_buffers, \ + const BufferParams &buffer_params, \ + const Destination &destination) const = 0; + + /* Float (scalar) passes. */ + DECLARE_PASS_ACCESSOR(depth) + DECLARE_PASS_ACCESSOR(mist) + DECLARE_PASS_ACCESSOR(sample_count) + DECLARE_PASS_ACCESSOR(float) + + /* Float3 passes. */ + DECLARE_PASS_ACCESSOR(light_path) + DECLARE_PASS_ACCESSOR(shadow_catcher) + DECLARE_PASS_ACCESSOR(float3) + + /* Float4 passes. */ + DECLARE_PASS_ACCESSOR(motion) + DECLARE_PASS_ACCESSOR(cryptomatte) + DECLARE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow) + DECLARE_PASS_ACCESSOR(combined) + DECLARE_PASS_ACCESSOR(float4) + +#undef DECLARE_PASS_ACCESSOR + + PassAccessInfo pass_access_info_; + + float exposure_ = 0.0f; + int num_samples_ = 0; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/pass_accessor_cpu.cpp b/intern/cycles/integrator/pass_accessor_cpu.cpp new file mode 100644 index 00000000000..3c6691f6d43 --- /dev/null +++ b/intern/cycles/integrator/pass_accessor_cpu.cpp @@ -0,0 +1,183 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "integrator/pass_accessor_cpu.h" + +#include "render/buffers.h" +#include "util/util_logging.h" +#include "util/util_tbb.h" + +// clang-format off +#include "kernel/device/cpu/compat.h" +#include "kernel/device/cpu/globals.h" +#include "kernel/kernel_types.h" +#include "kernel/kernel_film.h" +// clang-format on + +CCL_NAMESPACE_BEGIN + +/* -------------------------------------------------------------------- + * Kernel processing. + */ + +template<typename Processor> +inline void PassAccessorCPU::run_get_pass_kernel_processor(const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination, + const Processor &processor) const +{ + KernelFilmConvert kfilm_convert; + init_kernel_film_convert(&kfilm_convert, buffer_params, destination); + + if (destination.pixels) { + /* NOTE: No overlays are applied since they are not used for final renders. + * Can be supported via some sort of specialization to avoid code duplication. */ + + run_get_pass_kernel_processor_float( + &kfilm_convert, render_buffers, buffer_params, destination, processor); + } + + if (destination.pixels_half_rgba) { + /* TODO(sergey): Consider adding specialization to avoid per-pixel overlay check. 
*/ + + if (destination.num_components == 1) { + run_get_pass_kernel_processor_half_rgba(&kfilm_convert, + render_buffers, + buffer_params, + destination, + [&processor](const KernelFilmConvert *kfilm_convert, + ccl_global const float *buffer, + float *pixel_rgba) { + float pixel; + processor(kfilm_convert, buffer, &pixel); + + pixel_rgba[0] = pixel; + pixel_rgba[1] = pixel; + pixel_rgba[2] = pixel; + pixel_rgba[3] = 1.0f; + }); + } + else if (destination.num_components == 3) { + run_get_pass_kernel_processor_half_rgba(&kfilm_convert, + render_buffers, + buffer_params, + destination, + [&processor](const KernelFilmConvert *kfilm_convert, + ccl_global const float *buffer, + float *pixel_rgba) { + processor(kfilm_convert, buffer, pixel_rgba); + pixel_rgba[3] = 1.0f; + }); + } + else if (destination.num_components == 4) { + run_get_pass_kernel_processor_half_rgba( + &kfilm_convert, render_buffers, buffer_params, destination, processor); + } + } +} + +template<typename Processor> +inline void PassAccessorCPU::run_get_pass_kernel_processor_float( + const KernelFilmConvert *kfilm_convert, + const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination, + const Processor &processor) const +{ + DCHECK_EQ(destination.stride, 0) << "Custom stride for float destination is not implemented."; + + const float *buffer_data = render_buffers->buffer.data(); + const int pixel_stride = destination.pixel_stride ? 
destination.pixel_stride : + destination.num_components; + + tbb::parallel_for(0, buffer_params.height, [&](int64_t y) { + int64_t pixel_index = y * buffer_params.width; + for (int64_t x = 0; x < buffer_params.width; ++x, ++pixel_index) { + const int64_t input_pixel_offset = pixel_index * buffer_params.pass_stride; + const float *buffer = buffer_data + input_pixel_offset; + float *pixel = destination.pixels + (pixel_index + destination.offset) * pixel_stride; + + processor(kfilm_convert, buffer, pixel); + } + }); +} + +template<typename Processor> +inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba( + const KernelFilmConvert *kfilm_convert, + const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination, + const Processor &processor) const +{ + const float *buffer_data = render_buffers->buffer.data(); + + half4 *dst_start = destination.pixels_half_rgba + destination.offset; + const int destination_stride = destination.stride != 0 ? destination.stride : + buffer_params.width; + + tbb::parallel_for(0, buffer_params.height, [&](int64_t y) { + int64_t pixel_index = y * buffer_params.width; + half4 *dst_row_start = dst_start + y * destination_stride; + for (int64_t x = 0; x < buffer_params.width; ++x, ++pixel_index) { + const int64_t input_pixel_offset = pixel_index * buffer_params.pass_stride; + const float *buffer = buffer_data + input_pixel_offset; + + float pixel[4]; + processor(kfilm_convert, buffer, pixel); + + film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel); + + half4 *pixel_half_rgba = dst_row_start + x; + float4_store_half(&pixel_half_rgba->x, make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); + } + }); +} + +/* -------------------------------------------------------------------- + * Pass accessors. 
+ */ + +#define DEFINE_PASS_ACCESSOR(pass) \ + void PassAccessorCPU::get_pass_##pass(const RenderBuffers *render_buffers, \ + const BufferParams &buffer_params, \ + const Destination &destination) const \ + { \ + run_get_pass_kernel_processor( \ + render_buffers, buffer_params, destination, film_get_pass_pixel_##pass); \ + } + +/* Float (scalar) passes. */ +DEFINE_PASS_ACCESSOR(depth) +DEFINE_PASS_ACCESSOR(mist) +DEFINE_PASS_ACCESSOR(sample_count) +DEFINE_PASS_ACCESSOR(float) + +/* Float3 passes. */ +DEFINE_PASS_ACCESSOR(light_path) +DEFINE_PASS_ACCESSOR(shadow_catcher) +DEFINE_PASS_ACCESSOR(float3) + +/* Float4 passes. */ +DEFINE_PASS_ACCESSOR(motion) +DEFINE_PASS_ACCESSOR(cryptomatte) +DEFINE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow) +DEFINE_PASS_ACCESSOR(combined) +DEFINE_PASS_ACCESSOR(float4) + +#undef DEFINE_PASS_ACCESSOR + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/pass_accessor_cpu.h b/intern/cycles/integrator/pass_accessor_cpu.h new file mode 100644 index 00000000000..0313dc5bb0d --- /dev/null +++ b/intern/cycles/integrator/pass_accessor_cpu.h @@ -0,0 +1,77 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "integrator/pass_accessor.h" + +CCL_NAMESPACE_BEGIN + +struct KernelFilmConvert; + +/* Pass accessor implementation for CPU side. 
*/ +class PassAccessorCPU : public PassAccessor { + public: + using PassAccessor::PassAccessor; + + protected: + template<typename Processor> + inline void run_get_pass_kernel_processor(const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination, + const Processor &processor) const; + + template<typename Processor> + inline void run_get_pass_kernel_processor_float(const KernelFilmConvert *kfilm_convert, + const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination, + const Processor &processor) const; + + template<typename Processor> + inline void run_get_pass_kernel_processor_half_rgba(const KernelFilmConvert *kfilm_convert, + const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination, + const Processor &processor) const; + +#define DECLARE_PASS_ACCESSOR(pass) \ + virtual void get_pass_##pass(const RenderBuffers *render_buffers, \ + const BufferParams &buffer_params, \ + const Destination &destination) const override; + + /* Float (scalar) passes. */ + DECLARE_PASS_ACCESSOR(depth) + DECLARE_PASS_ACCESSOR(mist) + DECLARE_PASS_ACCESSOR(sample_count) + DECLARE_PASS_ACCESSOR(float) + + /* Float3 passes. */ + DECLARE_PASS_ACCESSOR(light_path) + DECLARE_PASS_ACCESSOR(shadow_catcher) + DECLARE_PASS_ACCESSOR(float3) + + /* Float4 passes. 
*/ + DECLARE_PASS_ACCESSOR(motion) + DECLARE_PASS_ACCESSOR(cryptomatte) + DECLARE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow) + DECLARE_PASS_ACCESSOR(combined) + DECLARE_PASS_ACCESSOR(float4) + +#undef DECLARE_PASS_ACCESSOR +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/pass_accessor_gpu.cpp b/intern/cycles/integrator/pass_accessor_gpu.cpp new file mode 100644 index 00000000000..eb80ba99655 --- /dev/null +++ b/intern/cycles/integrator/pass_accessor_gpu.cpp @@ -0,0 +1,118 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "integrator/pass_accessor_gpu.h" + +#include "device/device_queue.h" +#include "render/buffers.h" +#include "util/util_logging.h" + +CCL_NAMESPACE_BEGIN + +PassAccessorGPU::PassAccessorGPU(DeviceQueue *queue, + const PassAccessInfo &pass_access_info, + float exposure, + int num_samples) + : PassAccessor(pass_access_info, exposure, num_samples), queue_(queue) + +{ +} + +/* -------------------------------------------------------------------- + * Kernel execution. 
+ */ + +void PassAccessorGPU::run_film_convert_kernels(DeviceKernel kernel, + const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination) const +{ + KernelFilmConvert kfilm_convert; + init_kernel_film_convert(&kfilm_convert, buffer_params, destination); + + const int work_size = buffer_params.width * buffer_params.height; + + const int destination_stride = destination.stride != 0 ? destination.stride : + buffer_params.width; + + if (destination.d_pixels) { + DCHECK_EQ(destination.stride, 0) << "Custom stride for float destination is not implemented."; + + void *args[] = {const_cast<KernelFilmConvert *>(&kfilm_convert), + const_cast<device_ptr *>(&destination.d_pixels), + const_cast<device_ptr *>(&render_buffers->buffer.device_pointer), + const_cast<int *>(&work_size), + const_cast<int *>(&buffer_params.width), + const_cast<int *>(&buffer_params.offset), + const_cast<int *>(&buffer_params.stride), + const_cast<int *>(&destination.offset), + const_cast<int *>(&destination_stride)}; + + queue_->enqueue(kernel, work_size, args); + } + if (destination.d_pixels_half_rgba) { + const DeviceKernel kernel_half_float = static_cast<DeviceKernel>(kernel + 1); + + void *args[] = {const_cast<KernelFilmConvert *>(&kfilm_convert), + const_cast<device_ptr *>(&destination.d_pixels_half_rgba), + const_cast<device_ptr *>(&render_buffers->buffer.device_pointer), + const_cast<int *>(&work_size), + const_cast<int *>(&buffer_params.width), + const_cast<int *>(&buffer_params.offset), + const_cast<int *>(&buffer_params.stride), + const_cast<int *>(&destination.offset), + const_cast<int *>(&destination_stride)}; + + queue_->enqueue(kernel_half_float, work_size, args); + } + + queue_->synchronize(); +} + +/* -------------------------------------------------------------------- + * Pass accessors. 
+ */ + +#define DEFINE_PASS_ACCESSOR(pass, kernel_pass) \ + void PassAccessorGPU::get_pass_##pass(const RenderBuffers *render_buffers, \ + const BufferParams &buffer_params, \ + const Destination &destination) const \ + { \ + run_film_convert_kernels( \ + DEVICE_KERNEL_FILM_CONVERT_##kernel_pass, render_buffers, buffer_params, destination); \ + } + +/* Float (scalar) passes. */ +DEFINE_PASS_ACCESSOR(depth, DEPTH); +DEFINE_PASS_ACCESSOR(mist, MIST); +DEFINE_PASS_ACCESSOR(sample_count, SAMPLE_COUNT); +DEFINE_PASS_ACCESSOR(float, FLOAT); + +/* Float3 passes. */ +DEFINE_PASS_ACCESSOR(light_path, LIGHT_PATH); +DEFINE_PASS_ACCESSOR(float3, FLOAT3); + +/* Float4 passes. */ +DEFINE_PASS_ACCESSOR(motion, MOTION); +DEFINE_PASS_ACCESSOR(cryptomatte, CRYPTOMATTE); +DEFINE_PASS_ACCESSOR(shadow_catcher, SHADOW_CATCHER); +DEFINE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow, SHADOW_CATCHER_MATTE_WITH_SHADOW); +DEFINE_PASS_ACCESSOR(combined, COMBINED); +DEFINE_PASS_ACCESSOR(float4, FLOAT4); + +#undef DEFINE_PASS_ACCESSOR + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/pass_accessor_gpu.h b/intern/cycles/integrator/pass_accessor_gpu.h new file mode 100644 index 00000000000..bc37e4387f3 --- /dev/null +++ b/intern/cycles/integrator/pass_accessor_gpu.h @@ -0,0 +1,68 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "integrator/pass_accessor.h" +#include "kernel/kernel_types.h" + +CCL_NAMESPACE_BEGIN + +class DeviceQueue; + +/* Pass accessor implementation for GPU side. */ +class PassAccessorGPU : public PassAccessor { + public: + PassAccessorGPU(DeviceQueue *queue, + const PassAccessInfo &pass_access_info, + float exposure, + int num_samples); + + protected: + void run_film_convert_kernels(DeviceKernel kernel, + const RenderBuffers *render_buffers, + const BufferParams &buffer_params, + const Destination &destination) const; + +#define DECLARE_PASS_ACCESSOR(pass) \ + virtual void get_pass_##pass(const RenderBuffers *render_buffers, \ + const BufferParams &buffer_params, \ + const Destination &destination) const override; + + /* Float (scalar) passes. */ + DECLARE_PASS_ACCESSOR(depth); + DECLARE_PASS_ACCESSOR(mist); + DECLARE_PASS_ACCESSOR(sample_count); + DECLARE_PASS_ACCESSOR(float); + + /* Float3 passes. */ + DECLARE_PASS_ACCESSOR(light_path); + DECLARE_PASS_ACCESSOR(float3); + + /* Float4 passes. */ + DECLARE_PASS_ACCESSOR(motion); + DECLARE_PASS_ACCESSOR(cryptomatte); + DECLARE_PASS_ACCESSOR(shadow_catcher); + DECLARE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow); + DECLARE_PASS_ACCESSOR(combined); + DECLARE_PASS_ACCESSOR(float4); + +#undef DECLARE_PASS_ACCESSOR + + DeviceQueue *queue_; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp new file mode 100644 index 00000000000..6c02316ac2b --- /dev/null +++ b/intern/cycles/integrator/path_trace.cpp @@ -0,0 +1,1147 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "integrator/path_trace.h" + +#include "device/cpu/device.h" +#include "device/device.h" +#include "integrator/pass_accessor.h" +#include "integrator/render_scheduler.h" +#include "render/gpu_display.h" +#include "render/pass.h" +#include "render/scene.h" +#include "render/tile.h" +#include "util/util_algorithm.h" +#include "util/util_logging.h" +#include "util/util_progress.h" +#include "util/util_tbb.h" +#include "util/util_time.h" + +CCL_NAMESPACE_BEGIN + +PathTrace::PathTrace(Device *device, + Film *film, + DeviceScene *device_scene, + RenderScheduler &render_scheduler, + TileManager &tile_manager) + : device_(device), + device_scene_(device_scene), + render_scheduler_(render_scheduler), + tile_manager_(tile_manager) +{ + DCHECK_NE(device_, nullptr); + + { + vector<DeviceInfo> cpu_devices; + device_cpu_info(cpu_devices); + + cpu_device_.reset(device_cpu_create(cpu_devices[0], device->stats, device->profiler)); + } + + /* Create path tracing work in advance, so that it can be reused by incremental sampling as much + * as possible. */ + device_->foreach_device([&](Device *path_trace_device) { + path_trace_works_.emplace_back(PathTraceWork::create( + path_trace_device, film, device_scene, &render_cancel_.is_requested)); + }); + + work_balance_infos_.resize(path_trace_works_.size()); + work_balance_do_initial(work_balance_infos_); + + render_scheduler.set_need_schedule_rebalance(path_trace_works_.size() > 1); +} + +PathTrace::~PathTrace() +{ + /* Destroy any GPU resource which was used for graphics interop. 
+ * Need to have access to the GPUDisplay as it is the only source of drawing context which is + * used for interop. */ + if (gpu_display_) { + for (auto &&path_trace_work : path_trace_works_) { + path_trace_work->destroy_gpu_resources(gpu_display_.get()); + } + } +} + +void PathTrace::load_kernels() +{ + if (denoiser_) { + denoiser_->load_kernels(progress_); + } +} + +void PathTrace::alloc_work_memory() +{ + for (auto &&path_trace_work : path_trace_works_) { + path_trace_work->alloc_work_memory(); + } +} + +bool PathTrace::ready_to_reset() +{ + /* The logic here is optimized for the best feedback in the viewport, which implies having a GPU + * display. Of there is no such display, the logic here will break. */ + DCHECK(gpu_display_); + + /* The logic here tries to provide behavior which feels the most interactive feel to artists. + * General idea is to be able to reset as quickly as possible, while still providing interactive + * feel. + * + * If the render result was ever drawn after previous reset, consider that reset is now possible. + * This way camera navigation gives the quickest feedback of rendered pixels, regardless of + * whether CPU or GPU drawing pipeline is used. + * + * Consider reset happening after redraw "slow" enough to not clog anything. This is a bit + * arbitrary, but seems to work very well with viewport navigation in Blender. */ + + if (did_draw_after_reset_) { + return true; + } + + return false; +} + +void PathTrace::reset(const BufferParams &full_params, const BufferParams &big_tile_params) +{ + if (big_tile_params_.modified(big_tile_params)) { + big_tile_params_ = big_tile_params; + render_state_.need_reset_params = true; + } + + full_params_ = full_params; + + /* NOTE: GPU display checks for buffer modification and avoids unnecessary re-allocation. + * It is requires to inform about reset whenever it happens, so that the redraw state tracking is + * properly updated. 
*/ + if (gpu_display_) { + gpu_display_->reset(full_params); + } + + render_state_.has_denoised_result = false; + render_state_.tile_written = false; + + did_draw_after_reset_ = false; +} + +void PathTrace::device_free() +{ + /* Free render buffers used by the path trace work to reduce memory peak. */ + BufferParams empty_params; + empty_params.pass_stride = 0; + empty_params.update_offset_stride(); + for (auto &&path_trace_work : path_trace_works_) { + path_trace_work->get_render_buffers()->reset(empty_params); + } + render_state_.need_reset_params = true; +} + +void PathTrace::set_progress(Progress *progress) +{ + progress_ = progress; +} + +void PathTrace::render(const RenderWork &render_work) +{ + /* Indicate that rendering has started and that it can be requested to cancel. */ + { + thread_scoped_lock lock(render_cancel_.mutex); + if (render_cancel_.is_requested) { + return; + } + render_cancel_.is_rendering = true; + } + + render_pipeline(render_work); + + /* Indicate that rendering has finished, making it so thread which requested `cancel()` can carry + * on. */ + { + thread_scoped_lock lock(render_cancel_.mutex); + render_cancel_.is_rendering = false; + render_cancel_.condition.notify_one(); + } +} + +void PathTrace::render_pipeline(RenderWork render_work) +{ + /* NOTE: Only check for "instant" cancel here. Ther user-requested cancel via progress is + * checked in Session and the work in the event of cancel is to be finished here. 
*/ + + render_scheduler_.set_need_schedule_cryptomatte(device_scene_->data.film.cryptomatte_passes != + 0); + + render_init_kernel_execution(); + + render_scheduler_.report_work_begin(render_work); + + init_render_buffers(render_work); + + rebalance(render_work); + + path_trace(render_work); + if (render_cancel_.is_requested) { + return; + } + + adaptive_sample(render_work); + if (render_cancel_.is_requested) { + return; + } + + cryptomatte_postprocess(render_work); + if (render_cancel_.is_requested) { + return; + } + + denoise(render_work); + if (render_cancel_.is_requested) { + return; + } + + write_tile_buffer(render_work); + update_display(render_work); + + progress_update_if_needed(render_work); + + finalize_full_buffer_on_disk(render_work); +} + +void PathTrace::render_init_kernel_execution() +{ + for (auto &&path_trace_work : path_trace_works_) { + path_trace_work->init_execution(); + } +} + +/* TODO(sergey): Look into `std::function` rather than using a template. Should not be a + * measurable performance impact at runtime, but will make compilation faster and binary somewhat + * smaller. */ +template<typename Callback> +static void foreach_sliced_buffer_params(const vector<unique_ptr<PathTraceWork>> &path_trace_works, + const vector<WorkBalanceInfo> &work_balance_infos, + const BufferParams &buffer_params, + const Callback &callback) +{ + const int num_works = path_trace_works.size(); + const int height = buffer_params.height; + + int current_y = 0; + for (int i = 0; i < num_works; ++i) { + const double weight = work_balance_infos[i].weight; + const int slice_height = max(lround(height * weight), 1); + + /* Disallow negative values to deal with situations when there are more compute devices than + * scanlines. 
*/ + const int remaining_height = max(0, height - current_y); + + BufferParams slide_params = buffer_params; + slide_params.full_y = buffer_params.full_y + current_y; + if (i < num_works - 1) { + slide_params.height = min(slice_height, remaining_height); + } + else { + slide_params.height = remaining_height; + } + + slide_params.update_offset_stride(); + + callback(path_trace_works[i].get(), slide_params); + + current_y += slide_params.height; + } +} + +void PathTrace::update_allocated_work_buffer_params() +{ + foreach_sliced_buffer_params(path_trace_works_, + work_balance_infos_, + big_tile_params_, + [](PathTraceWork *path_trace_work, const BufferParams ¶ms) { + RenderBuffers *buffers = path_trace_work->get_render_buffers(); + buffers->reset(params); + }); +} + +static BufferParams scale_buffer_params(const BufferParams ¶ms, int resolution_divider) +{ + BufferParams scaled_params = params; + + scaled_params.width = max(1, params.width / resolution_divider); + scaled_params.height = max(1, params.height / resolution_divider); + scaled_params.full_x = params.full_x / resolution_divider; + scaled_params.full_y = params.full_y / resolution_divider; + scaled_params.full_width = params.full_width / resolution_divider; + scaled_params.full_height = params.full_height / resolution_divider; + + scaled_params.update_offset_stride(); + + return scaled_params; +} + +void PathTrace::update_effective_work_buffer_params(const RenderWork &render_work) +{ + const int resolution_divider = render_work.resolution_divider; + + const BufferParams scaled_full_params = scale_buffer_params(full_params_, resolution_divider); + const BufferParams scaled_big_tile_params = scale_buffer_params(big_tile_params_, + resolution_divider); + + foreach_sliced_buffer_params(path_trace_works_, + work_balance_infos_, + scaled_big_tile_params, + [&](PathTraceWork *path_trace_work, const BufferParams params) { + path_trace_work->set_effective_buffer_params( + scaled_full_params, scaled_big_tile_params, 
params); + }); + + render_state_.effective_big_tile_params = scaled_big_tile_params; +} + +void PathTrace::update_work_buffer_params_if_needed(const RenderWork &render_work) +{ + if (render_state_.need_reset_params) { + update_allocated_work_buffer_params(); + } + + if (render_state_.need_reset_params || + render_state_.resolution_divider != render_work.resolution_divider) { + update_effective_work_buffer_params(render_work); + } + + render_state_.resolution_divider = render_work.resolution_divider; + render_state_.need_reset_params = false; +} + +void PathTrace::init_render_buffers(const RenderWork &render_work) +{ + update_work_buffer_params_if_needed(render_work); + + /* Handle initialization scheduled by the render scheduler. */ + if (render_work.init_render_buffers) { + tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) { + path_trace_work->zero_render_buffers(); + }); + + tile_buffer_read(); + } +} + +void PathTrace::path_trace(RenderWork &render_work) +{ + if (!render_work.path_trace.num_samples) { + return; + } + + VLOG(3) << "Will path trace " << render_work.path_trace.num_samples + << " samples at the resolution divider " << render_work.resolution_divider; + + const double start_time = time_dt(); + + const int num_works = path_trace_works_.size(); + + tbb::parallel_for(0, num_works, [&](int i) { + const double work_start_time = time_dt(); + const int num_samples = render_work.path_trace.num_samples; + + PathTraceWork *path_trace_work = path_trace_works_[i].get(); + + PathTraceWork::RenderStatistics statistics; + path_trace_work->render_samples(statistics, render_work.path_trace.start_sample, num_samples); + + const double work_time = time_dt() - work_start_time; + work_balance_infos_[i].time_spent += work_time; + work_balance_infos_[i].occupancy = statistics.occupancy; + + VLOG(3) << "Rendered " << num_samples << " samples in " << work_time << " seconds (" + << work_time / num_samples + << " seconds per sample), 
occupancy: " << statistics.occupancy; + }); + + float occupancy_accum = 0.0f; + for (const WorkBalanceInfo &balance_info : work_balance_infos_) { + occupancy_accum += balance_info.occupancy; + } + const float occupancy = occupancy_accum / num_works; + render_scheduler_.report_path_trace_occupancy(render_work, occupancy); + + render_scheduler_.report_path_trace_time( + render_work, time_dt() - start_time, is_cancel_requested()); +} + +void PathTrace::adaptive_sample(RenderWork &render_work) +{ + if (!render_work.adaptive_sampling.filter) { + return; + } + + bool did_reschedule_on_idle = false; + + while (true) { + VLOG(3) << "Will filter adaptive stopping buffer, threshold " + << render_work.adaptive_sampling.threshold; + if (render_work.adaptive_sampling.reset) { + VLOG(3) << "Will re-calculate convergency flag for currently converged pixels."; + } + + const double start_time = time_dt(); + + uint num_active_pixels = 0; + tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) { + const uint num_active_pixels_in_work = + path_trace_work->adaptive_sampling_converge_filter_count_active( + render_work.adaptive_sampling.threshold, render_work.adaptive_sampling.reset); + if (num_active_pixels_in_work) { + atomic_add_and_fetch_u(&num_active_pixels, num_active_pixels_in_work); + } + }); + + render_scheduler_.report_adaptive_filter_time( + render_work, time_dt() - start_time, is_cancel_requested()); + + if (num_active_pixels == 0) { + VLOG(3) << "All pixels converged."; + if (!render_scheduler_.render_work_reschedule_on_converge(render_work)) { + break; + } + VLOG(3) << "Continuing with lower threshold."; + } + else if (did_reschedule_on_idle) { + break; + } + else if (num_active_pixels < 128 * 128) { + /* NOTE: The hardcoded value of 128^2 is more of an empirical value to keep GPU busy so that + * there is no performance loss from the progressive noise floor feature. 
+ * + * A better heuristic is possible here: for example, use maximum of 128^2 and percentage of + * the final resolution. */ + if (!render_scheduler_.render_work_reschedule_on_idle(render_work)) { + VLOG(3) << "Rescheduling is not possible: final threshold is reached."; + break; + } + VLOG(3) << "Rescheduling lower threshold."; + did_reschedule_on_idle = true; + } + else { + break; + } + } +} + +void PathTrace::set_denoiser_params(const DenoiseParams ¶ms) +{ + render_scheduler_.set_denoiser_params(params); + + if (!params.use) { + denoiser_.reset(); + return; + } + + if (denoiser_) { + const DenoiseParams old_denoiser_params = denoiser_->get_params(); + if (old_denoiser_params.type == params.type) { + denoiser_->set_params(params); + return; + } + } + + denoiser_ = Denoiser::create(device_, params); + denoiser_->is_cancelled_cb = [this]() { return is_cancel_requested(); }; +} + +void PathTrace::set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling) +{ + render_scheduler_.set_adaptive_sampling(adaptive_sampling); +} + +void PathTrace::cryptomatte_postprocess(const RenderWork &render_work) +{ + if (!render_work.cryptomatte.postprocess) { + return; + } + VLOG(3) << "Perform cryptomatte work."; + + tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) { + path_trace_work->cryptomatte_postproces(); + }); +} + +void PathTrace::denoise(const RenderWork &render_work) +{ + if (!render_work.tile.denoise) { + return; + } + + if (!denoiser_) { + /* Denoiser was not configured, so nothing to do here. 
*/ + return; + } + + VLOG(3) << "Perform denoising work."; + + const double start_time = time_dt(); + + RenderBuffers *buffer_to_denoise = nullptr; + + unique_ptr<RenderBuffers> multi_device_buffers; + bool allow_inplace_modification = false; + + if (path_trace_works_.size() == 1) { + buffer_to_denoise = path_trace_works_.front()->get_render_buffers(); + } + else { + Device *denoiser_device = denoiser_->get_denoiser_device(); + if (!denoiser_device) { + return; + } + + multi_device_buffers = make_unique<RenderBuffers>(denoiser_device); + multi_device_buffers->reset(render_state_.effective_big_tile_params); + + buffer_to_denoise = multi_device_buffers.get(); + + copy_to_render_buffers(multi_device_buffers.get()); + + allow_inplace_modification = true; + } + + if (denoiser_->denoise_buffer(render_state_.effective_big_tile_params, + buffer_to_denoise, + get_num_samples_in_buffer(), + allow_inplace_modification)) { + render_state_.has_denoised_result = true; + } + + if (multi_device_buffers) { + multi_device_buffers->copy_from_device(); + tbb::parallel_for_each( + path_trace_works_, [&multi_device_buffers](unique_ptr<PathTraceWork> &path_trace_work) { + path_trace_work->copy_from_denoised_render_buffers(multi_device_buffers.get()); + }); + } + + render_scheduler_.report_denoise_time(render_work, time_dt() - start_time); +} + +void PathTrace::set_gpu_display(unique_ptr<GPUDisplay> gpu_display) +{ + gpu_display_ = move(gpu_display); +} + +void PathTrace::clear_gpu_display() +{ + if (gpu_display_) { + gpu_display_->clear(); + } +} + +void PathTrace::draw() +{ + if (!gpu_display_) { + return; + } + + did_draw_after_reset_ |= gpu_display_->draw(); +} + +void PathTrace::update_display(const RenderWork &render_work) +{ + if (!render_work.display.update) { + return; + } + + if (!gpu_display_ && !tile_buffer_update_cb) { + VLOG(3) << "Ignore display update."; + return; + } + + if (full_params_.width == 0 || full_params_.height == 0) { + VLOG(3) << "Skipping GPUDisplay update 
due to 0 size of the render buffer."; + return; + } + + const double start_time = time_dt(); + + if (tile_buffer_update_cb) { + VLOG(3) << "Invoke buffer update callback."; + + tile_buffer_update_cb(); + } + + if (gpu_display_) { + VLOG(3) << "Perform copy to GPUDisplay work."; + + const int resolution_divider = render_work.resolution_divider; + const int texture_width = max(1, full_params_.width / resolution_divider); + const int texture_height = max(1, full_params_.height / resolution_divider); + if (!gpu_display_->update_begin(texture_width, texture_height)) { + LOG(ERROR) << "Error beginning GPUDisplay update."; + return; + } + + const PassMode pass_mode = render_work.display.use_denoised_result && + render_state_.has_denoised_result ? + PassMode::DENOISED : + PassMode::NOISY; + + /* TODO(sergey): When using multi-device rendering map the GPUDisplay once and copy data from + * all works in parallel. */ + const int num_samples = get_num_samples_in_buffer(); + for (auto &&path_trace_work : path_trace_works_) { + path_trace_work->copy_to_gpu_display(gpu_display_.get(), pass_mode, num_samples); + } + + gpu_display_->update_end(); + } + + render_scheduler_.report_display_update_time(render_work, time_dt() - start_time); +} + +void PathTrace::rebalance(const RenderWork &render_work) +{ + static const int kLogLevel = 3; + + if (!render_work.rebalance) { + return; + } + + const int num_works = path_trace_works_.size(); + + if (num_works == 1) { + VLOG(kLogLevel) << "Ignoring rebalance work due to single device render."; + return; + } + + const double start_time = time_dt(); + + if (VLOG_IS_ON(kLogLevel)) { + VLOG(kLogLevel) << "Perform rebalance work."; + VLOG(kLogLevel) << "Per-device path tracing time (seconds):"; + for (int i = 0; i < num_works; ++i) { + VLOG(kLogLevel) << path_trace_works_[i]->get_device()->info.description << ": " + << work_balance_infos_[i].time_spent; + } + } + + const bool did_rebalance = work_balance_do_rebalance(work_balance_infos_); + + if 
(VLOG_IS_ON(kLogLevel)) { + VLOG(kLogLevel) << "Calculated per-device weights for works:"; + for (int i = 0; i < num_works; ++i) { + VLOG(kLogLevel) << path_trace_works_[i]->get_device()->info.description << ": " + << work_balance_infos_[i].weight; + } + } + + if (!did_rebalance) { + VLOG(kLogLevel) << "Balance in path trace works did not change."; + render_scheduler_.report_rebalance_time(render_work, time_dt() - start_time, false); + return; + } + + RenderBuffers big_tile_cpu_buffers(cpu_device_.get()); + big_tile_cpu_buffers.reset(render_state_.effective_big_tile_params); + + copy_to_render_buffers(&big_tile_cpu_buffers); + + render_state_.need_reset_params = true; + update_work_buffer_params_if_needed(render_work); + + copy_from_render_buffers(&big_tile_cpu_buffers); + + render_scheduler_.report_rebalance_time(render_work, time_dt() - start_time, true); +} + +void PathTrace::write_tile_buffer(const RenderWork &render_work) +{ + if (!render_work.tile.write) { + return; + } + + VLOG(3) << "Write tile result."; + + render_state_.tile_written = true; + + const bool has_multiple_tiles = tile_manager_.has_multiple_tiles(); + + /* Write render tile result, but only if not using tiled rendering. + * + * Tiles are written to a file during rendering, and written to the software at the end + * of rendering (wither when all tiles are finished, or when rendering was requested to be + * cancelled). + * + * Important thing is: tile should be written to the software via callback only once. */ + if (!has_multiple_tiles) { + VLOG(3) << "Write tile result via buffer write callback."; + tile_buffer_write(); + } + + /* Write tile to disk, so that the render work's render buffer can be re-used for the next tile. 
+ */ + if (has_multiple_tiles) { + VLOG(3) << "Write tile result into ."; + tile_buffer_write_to_disk(); + } +} + +void PathTrace::finalize_full_buffer_on_disk(const RenderWork &render_work) +{ + if (!render_work.full.write) { + return; + } + + VLOG(3) << "Handle full-frame render buffer work."; + + if (!tile_manager_.has_written_tiles()) { + VLOG(3) << "No tiles on disk."; + return; + } + + /* Make sure writing to the file is fully finished. + * This will include writing all possible missing tiles, ensuring validness of the file. */ + tile_manager_.finish_write_tiles(); + + /* NOTE: The rest of full-frame post-processing (such as full-frame denoising) will be done after + * all scenes and layers are rendered by the Session (which happens after freeing Session memory, + * so that we never hold scene and full-frame buffer in memory at the same time). */ +} + +void PathTrace::cancel() +{ + thread_scoped_lock lock(render_cancel_.mutex); + + render_cancel_.is_requested = true; + + while (render_cancel_.is_rendering) { + render_cancel_.condition.wait(lock); + } + + render_cancel_.is_requested = false; +} + +int PathTrace::get_num_samples_in_buffer() +{ + return render_scheduler_.get_num_rendered_samples(); +} + +bool PathTrace::is_cancel_requested() +{ + if (render_cancel_.is_requested) { + return true; + } + + if (progress_ != nullptr) { + if (progress_->get_cancel()) { + return true; + } + } + + return false; +} + +void PathTrace::tile_buffer_write() +{ + if (!tile_buffer_write_cb) { + return; + } + + tile_buffer_write_cb(); +} + +void PathTrace::tile_buffer_read() +{ + if (!tile_buffer_read_cb) { + return; + } + + if (tile_buffer_read_cb()) { + tbb::parallel_for_each(path_trace_works_, [](unique_ptr<PathTraceWork> &path_trace_work) { + path_trace_work->copy_render_buffers_to_device(); + }); + } +} + +void PathTrace::tile_buffer_write_to_disk() +{ + /* Sample count pass is required to support per-tile partial results stored in the file. 
*/ + DCHECK_NE(big_tile_params_.get_pass_offset(PASS_SAMPLE_COUNT), PASS_UNUSED); + + const int num_rendered_samples = render_scheduler_.get_num_rendered_samples(); + + if (num_rendered_samples == 0) { + /* The tile has zero samples, no need to write it. */ + return; + } + + /* Get access to the CPU-side render buffers of the current big tile. */ + RenderBuffers *buffers; + RenderBuffers big_tile_cpu_buffers(cpu_device_.get()); + + if (path_trace_works_.size() == 1) { + path_trace_works_[0]->copy_render_buffers_from_device(); + buffers = path_trace_works_[0]->get_render_buffers(); + } + else { + big_tile_cpu_buffers.reset(render_state_.effective_big_tile_params); + copy_to_render_buffers(&big_tile_cpu_buffers); + + buffers = &big_tile_cpu_buffers; + } + + if (!tile_manager_.write_tile(*buffers)) { + LOG(ERROR) << "Error writing tile to file."; + } +} + +void PathTrace::progress_update_if_needed(const RenderWork &render_work) +{ + if (progress_ != nullptr) { + const int2 tile_size = get_render_tile_size(); + const int num_samples_added = tile_size.x * tile_size.y * render_work.path_trace.num_samples; + const int current_sample = render_work.path_trace.start_sample + + render_work.path_trace.num_samples; + progress_->add_samples(num_samples_added, current_sample); + } + + if (progress_update_cb) { + progress_update_cb(); + } +} + +void PathTrace::progress_set_status(const string &status, const string &substatus) +{ + if (progress_ != nullptr) { + progress_->set_status(status, substatus); + } +} + +void PathTrace::copy_to_render_buffers(RenderBuffers *render_buffers) +{ + tbb::parallel_for_each(path_trace_works_, + [&render_buffers](unique_ptr<PathTraceWork> &path_trace_work) { + path_trace_work->copy_to_render_buffers(render_buffers); + }); + render_buffers->copy_to_device(); +} + +void PathTrace::copy_from_render_buffers(RenderBuffers *render_buffers) +{ + render_buffers->copy_from_device(); + tbb::parallel_for_each(path_trace_works_, + 
[&render_buffers](unique_ptr<PathTraceWork> &path_trace_work) { + path_trace_work->copy_from_render_buffers(render_buffers); + }); +} + +bool PathTrace::copy_render_tile_from_device() +{ + if (full_frame_state_.render_buffers) { + /* Full-frame buffer is always allocated on CPU. */ + return true; + } + + bool success = true; + + tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) { + if (!success) { + return; + } + if (!path_trace_work->copy_render_buffers_from_device()) { + success = false; + } + }); + + return success; +} + +static string get_layer_view_name(const RenderBuffers &buffers) +{ + string result; + + if (buffers.params.layer.size()) { + result += string(buffers.params.layer); + } + + if (buffers.params.view.size()) { + if (!result.empty()) { + result += ", "; + } + result += string(buffers.params.view); + } + + return result; +} + +void PathTrace::process_full_buffer_from_disk(string_view filename) +{ + VLOG(3) << "Processing full frame buffer file " << filename; + + progress_set_status("Reading full buffer from disk"); + + RenderBuffers full_frame_buffers(cpu_device_.get()); + + DenoiseParams denoise_params; + if (!tile_manager_.read_full_buffer_from_disk(filename, &full_frame_buffers, &denoise_params)) { + LOG(ERROR) << "Error reading tiles from file."; + return; + } + + const string layer_view_name = get_layer_view_name(full_frame_buffers); + + render_state_.has_denoised_result = false; + + if (denoise_params.use) { + progress_set_status(layer_view_name, "Denoising"); + + /* Re-use the denoiser as much as possible, avoiding possible device re-initialization. + * + * It will not conflict with the regular rendering as: + * - Rendering is supposed to be finished here. + * - The next rendering will go via Session's `run_update_for_next_iteration` which will + * ensure proper denoiser is used. 
*/ + set_denoiser_params(denoise_params); + + /* Number of samples doesn't matter too much, since the sampels count pass will be used. */ + denoiser_->denoise_buffer(full_frame_buffers.params, &full_frame_buffers, 0, false); + + render_state_.has_denoised_result = true; + } + + full_frame_state_.render_buffers = &full_frame_buffers; + + progress_set_status(layer_view_name, "Finishing"); + + /* Write the full result pretending that there is a single tile. + * Requires some state change, but allows to use same communication API with the software. */ + tile_buffer_write(); + + full_frame_state_.render_buffers = nullptr; +} + +int PathTrace::get_num_render_tile_samples() const +{ + if (full_frame_state_.render_buffers) { + /* If the full-frame buffer is read from disk the number of samples is not used as there is a + * sample count pass for that in the buffer. Just avoid access to badly defined state of the + * path state. */ + return 0; + } + + return render_scheduler_.get_num_rendered_samples(); +} + +bool PathTrace::get_render_tile_pixels(const PassAccessor &pass_accessor, + const PassAccessor::Destination &destination) +{ + if (full_frame_state_.render_buffers) { + return pass_accessor.get_render_tile_pixels(full_frame_state_.render_buffers, destination); + } + + bool success = true; + + tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) { + if (!success) { + return; + } + if (!path_trace_work->get_render_tile_pixels(pass_accessor, destination)) { + success = false; + } + }); + + return success; +} + +bool PathTrace::set_render_tile_pixels(PassAccessor &pass_accessor, + const PassAccessor::Source &source) +{ + bool success = true; + + tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) { + if (!success) { + return; + } + if (!path_trace_work->set_render_tile_pixels(pass_accessor, source)) { + success = false; + } + }); + + return success; +} + +int2 PathTrace::get_render_tile_size() const 
+{ + if (full_frame_state_.render_buffers) { + return make_int2(full_frame_state_.render_buffers->params.width, + full_frame_state_.render_buffers->params.height); + } + + const Tile &tile = tile_manager_.get_current_tile(); + return make_int2(tile.width, tile.height); +} + +int2 PathTrace::get_render_tile_offset() const +{ + if (full_frame_state_.render_buffers) { + return make_int2(0, 0); + } + + const Tile &tile = tile_manager_.get_current_tile(); + return make_int2(tile.x, tile.y); +} + +const BufferParams &PathTrace::get_render_tile_params() const +{ + if (full_frame_state_.render_buffers) { + return full_frame_state_.render_buffers->params; + } + + return big_tile_params_; +} + +bool PathTrace::has_denoised_result() const +{ + return render_state_.has_denoised_result; +} + +/* -------------------------------------------------------------------- + * Report generation. + */ + +static const char *device_type_for_description(const DeviceType type) +{ + switch (type) { + case DEVICE_NONE: + return "None"; + + case DEVICE_CPU: + return "CPU"; + case DEVICE_CUDA: + return "CUDA"; + case DEVICE_OPTIX: + return "OptiX"; + case DEVICE_DUMMY: + return "Dummy"; + case DEVICE_MULTI: + return "Multi"; + } + + return "UNKNOWN"; +} + +/* Construct description of the device which will appear in the full report. */ +/* TODO(sergey): Consider making it more reusable utility. 
*/ +static string full_device_info_description(const DeviceInfo &device_info) +{ + string full_description = device_info.description; + + full_description += " (" + string(device_type_for_description(device_info.type)) + ")"; + + if (device_info.display_device) { + full_description += " (display)"; + } + + if (device_info.type == DEVICE_CPU) { + full_description += " (" + to_string(device_info.cpu_threads) + " threads)"; + } + + full_description += " [" + device_info.id + "]"; + + return full_description; +} + +/* Construct string which will contain information about devices, possibly multiple of the devices. + * + * In the simple case the result looks like: + * + * Message: Full Device Description + * + * If there are multiple devices then the result looks like: + * + * Message: Full First Device Description + * Full Second Device Description + * + * Note that the newlines are placed in a way so that the result can be easily concatenated to the + * full report. */ +static string device_info_list_report(const string &message, const DeviceInfo &device_info) +{ + string result = "\n" + message + ": "; + const string pad(message.length() + 2, ' '); + + if (device_info.multi_devices.empty()) { + result += full_device_info_description(device_info) + "\n"; + return result; + } + + bool is_first = true; + for (const DeviceInfo &sub_device_info : device_info.multi_devices) { + if (!is_first) { + result += pad; + } + + result += full_device_info_description(sub_device_info) + "\n"; + + is_first = false; + } + + return result; +} + +static string path_trace_devices_report(const vector<unique_ptr<PathTraceWork>> &path_trace_works) +{ + DeviceInfo device_info; + device_info.type = DEVICE_MULTI; + + for (auto &&path_trace_work : path_trace_works) { + device_info.multi_devices.push_back(path_trace_work->get_device()->info); + } + + return device_info_list_report("Path tracing on", device_info); +} + +static string denoiser_device_report(const Denoiser *denoiser) +{ + if 
(!denoiser) { + return ""; + } + + if (!denoiser->get_params().use) { + return ""; + } + + const Device *denoiser_device = denoiser->get_denoiser_device(); + if (!denoiser_device) { + return ""; + } + + return device_info_list_report("Denoising on", denoiser_device->info); +} + +string PathTrace::full_report() const +{ + string result = "\nFull path tracing report\n"; + + result += path_trace_devices_report(path_trace_works_); + result += denoiser_device_report(denoiser_.get()); + + /* Report from the render scheduler, which includes: + * - Render mode (interactive, offline, headless) + * - Adaptive sampling and denoiser parameters + * - Breakdown of timing. */ + result += render_scheduler_.full_report(); + + return result; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/path_trace.h b/intern/cycles/integrator/path_trace.h new file mode 100644 index 00000000000..78ca68c1198 --- /dev/null +++ b/intern/cycles/integrator/path_trace.h @@ -0,0 +1,324 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "integrator/denoiser.h" +#include "integrator/pass_accessor.h" +#include "integrator/path_trace_work.h" +#include "integrator/work_balancer.h" +#include "render/buffers.h" +#include "util/util_function.h" +#include "util/util_thread.h" +#include "util/util_unique_ptr.h" +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +class AdaptiveSampling; +class Device; +class DeviceScene; +class Film; +class RenderBuffers; +class RenderScheduler; +class RenderWork; +class Progress; +class GPUDisplay; +class TileManager; + +/* PathTrace class takes care of kernel graph and scheduling on a (multi)device. It takes care of + * all the common steps of path tracing which are not device-specific. The list of tasks includes + * but is not limited to: + * - Kernel graph. + * - Scheduling logic. + * - Queues management. + * - Adaptive stopping. */ +class PathTrace { + public: + /* Render scheduler is used to report timing information and access things like start/finish + * sample. */ + PathTrace(Device *device, + Film *film, + DeviceScene *device_scene, + RenderScheduler &render_scheduler, + TileManager &tile_manager); + ~PathTrace(); + + /* Create devices and load kernels which are created on-demand (for example, denoising devices). + * The progress is reported to the currently configure progress object (via `set_progress`). */ + void load_kernels(); + + /* Allocate working memory. This runs before allocating scene memory so that we can estimate + * more accurately which scene device memory may need to allocated on the host. */ + void alloc_work_memory(); + + /* Check whether now it is a good time to reset rendering. + * Used to avoid very often resets in the viewport, giving it a chance to draw intermediate + * render result. */ + bool ready_to_reset(); + + void reset(const BufferParams &full_params, const BufferParams &big_tile_params); + + void device_free(); + + /* Set progress tracker. 
+ * Used to communicate details about the progress to the outer world, check whether rendering is + * to be canceled. + * + * The path tracer writes to this object, and then at a convenient moment runs + * progress_update_cb() callback. */ + void set_progress(Progress *progress); + + /* NOTE: This is a blocking call. Meaning, it will not return until given number of samples are + * rendered (or until rendering is requested to be cancelled). */ + void render(const RenderWork &render_work); + + /* TODO(sergey): Decide whether denoiser is really a part of path tracer. Currently it is + * convenient to have it here because then its easy to access render buffer. But the downside is + * that this adds too much of entities which can live separately with some clear API. */ + + /* Set denoiser parameters. + * Use this to configure the denoiser before rendering any samples. */ + void set_denoiser_params(const DenoiseParams ¶ms); + + /* Set parameters used for adaptive sampling. + * Use this to configure the adaptive sampler before rendering any samples. */ + void set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling); + + /* Set GPU display which takes care of drawing the render result. */ + void set_gpu_display(unique_ptr<GPUDisplay> gpu_display); + + /* Clear the GPU display by filling it in with all zeroes. */ + void clear_gpu_display(); + + /* Perform drawing of the current state of the GPUDisplay. */ + void draw(); + + /* Cancel rendering process as soon as possible, without waiting for full tile to be sampled. + * Used in cases like reset of render session. + * + * This is a blockign call, which returns as soon as there is no running `render_samples()` call. + */ + void cancel(); + + /* Copy an entire render buffer to/from the path trace. */ + + /* Copy happens via CPU side buffer: data will be copied from every device of the path trace, and + * the data will be copied to the device of the given render buffers. 
*/ + void copy_to_render_buffers(RenderBuffers *render_buffers); + + /* Copy happens via CPU side buffer: data will be copied from the device of the given rendetr + * buffers and will be copied to all devices of the path trace. */ + void copy_from_render_buffers(RenderBuffers *render_buffers); + + /* Copy render buffers of the big tile from the device to hsot. + * Return true if all copies are successful. */ + bool copy_render_tile_from_device(); + + /* Read given full-frame file from disk, perform needed processing and write it to the software + * via the write callback. */ + void process_full_buffer_from_disk(string_view filename); + + /* Get number of samples in the current big tile render buffers. */ + int get_num_render_tile_samples() const; + + /* Get pass data of the entire big tile. + * This call puts pass render result from all devices into the final pixels storage. + * + * NOTE: Expects buffers to be copied to the host using `copy_render_tile_from_device()`. + * + * Returns false if any of the accessor's `get_render_tile_pixels()` returned false. */ + bool get_render_tile_pixels(const PassAccessor &pass_accessor, + const PassAccessor::Destination &destination); + + /* Set pass data for baking. */ + bool set_render_tile_pixels(PassAccessor &pass_accessor, const PassAccessor::Source &source); + + /* Check whether denoiser was run and denoised passes are available. */ + bool has_denoised_result() const; + + /* Get size and offset (relative to the buffer's full x/y) of the currently rendering tile. + * In the case of tiled rendering this will return full-frame after all tiles has been rendered. + * + * NOTE: If the full-frame buffer processing is in progress, returns parameters of the full-frame + * instead. */ + int2 get_render_tile_size() const; + int2 get_render_tile_offset() const; + + /* Get buffer parameters of the current tile. + * + * NOTE: If the full-frame buffer processing is in progress, returns parameters of the full-frame + * instead. 
*/ + const BufferParams &get_render_tile_params() const; + + /* Generate full multi-line report of the rendering process, including rendering parameters, + * times, and so on. */ + string full_report() const; + + /* Callback which communicates an updates state of the render buffer of the current big tile. + * Is called during path tracing to communicate work-in-progress state of the final buffer. */ + function<void(void)> tile_buffer_update_cb; + + /* Callback which communicates final rendered buffer. Is called after pathtracing is done. */ + function<void(void)> tile_buffer_write_cb; + + /* Callback which initializes rendered buffer. Is called before pathtracing starts. + * + * This is used for baking. */ + function<bool(void)> tile_buffer_read_cb; + + /* Callback which is called to report current rendering progress. + * + * It is supposed to be cheaper than buffer update/write, hence can be called more often. + * Additionally, it might be called form the middle of wavefront (meaning, it is not guaranteed + * that the buffer is "uniformly" sampled at the moment of this callback). */ + function<void(void)> progress_update_cb; + + protected: + /* Actual implementation of the rendering pipeline. + * Calls steps in order, checking for the cancel to be requested inbetween. + * + * Is separate from `render()` to simplify dealing with the early outputs and keeping + * `render_cancel_` in the consistent state. */ + void render_pipeline(RenderWork render_work); + + /* Initialize kernel execution on all integrator queues. */ + void render_init_kernel_execution(); + + /* Make sure both allocated and effective buffer parameters of path tracer works are up to date + * with the current big tile parameters, performance-dependent slicing, and resolution divider. 
+ */ + void update_work_buffer_params_if_needed(const RenderWork &render_work); + void update_allocated_work_buffer_params(); + void update_effective_work_buffer_params(const RenderWork &render_work); + + /* Perform various steps of the render work. + * + * Note that some steps might modify the work, forcing some steps to happen within this iteration + * of rendering. */ + void init_render_buffers(const RenderWork &render_work); + void path_trace(RenderWork &render_work); + void adaptive_sample(RenderWork &render_work); + void denoise(const RenderWork &render_work); + void cryptomatte_postprocess(const RenderWork &render_work); + void update_display(const RenderWork &render_work); + void rebalance(const RenderWork &render_work); + void write_tile_buffer(const RenderWork &render_work); + void finalize_full_buffer_on_disk(const RenderWork &render_work); + + /* Get number of samples in the current state of the render buffers. */ + int get_num_samples_in_buffer(); + + /* Check whether user requested to cancel rendering, so that path tracing is to be finished as + * soon as possible. */ + bool is_cancel_requested(); + + /* Write the big tile render buffer via the write callback. */ + void tile_buffer_write(); + + /* Read the big tile render buffer via the read callback. */ + void tile_buffer_read(); + + /* Write current tile into the file on disk. */ + void tile_buffer_write_to_disk(); + + /* Run the progress_update_cb callback if it is needed. */ + void progress_update_if_needed(const RenderWork &render_work); + + void progress_set_status(const string &status, const string &substatus = ""); + + /* Pointer to a device which is configured to be used for path tracing. If multiple devices + * are configured this is a `MultiDevice`. */ + Device *device_ = nullptr; + + /* CPU device for creating temporary render buffers on the CPU side. 
*/ + unique_ptr<Device> cpu_device_; + + DeviceScene *device_scene_; + + RenderScheduler &render_scheduler_; + TileManager &tile_manager_; + + unique_ptr<GPUDisplay> gpu_display_; + + /* Per-compute device descriptors of work which is responsible for path tracing on its configured + * device. */ + vector<unique_ptr<PathTraceWork>> path_trace_works_; + + /* Per-path trace work information needed for multi-device balancing. */ + vector<WorkBalanceInfo> work_balance_infos_; + + /* Render buffer parameters of the full frame and current big tile. */ + BufferParams full_params_; + BufferParams big_tile_params_; + + /* Denoiser which takes care of denoising the big tile. */ + unique_ptr<Denoiser> denoiser_; + + /* State which is common for all the steps of the render work. + * Is brought up to date in the `render()` call and is accessed from all the steps involved into + * rendering the work. */ + struct { + /* Denotes whether render buffers parameters of path trace works are to be reset for the new + * value of the big tile parameters. */ + bool need_reset_params = false; + + /* Divider of the resolution for faster previews. + * + * Allows to re-use same render buffer, but have less pixels rendered into in it. The way to + * think of render buffer in this case is as an over-allocated array: the resolution divider + * affects both resolution and stride as visible by the integrator kernels. */ + int resolution_divider = 0; + + /* Paramaters of the big tile with the current resolution divider applied. */ + BufferParams effective_big_tile_params; + + /* Denosier was run and there are denoised versions of the passes in the render buffers. */ + bool has_denoised_result = false; + + /* Current tile has been written (to either disk or callback. + * Indicates that no more work will be done on this tile. */ + bool tile_written = false; + } render_state_; + + /* Progress object which is used to communicate sample progress. 
*/ + Progress *progress_; + + /* Fields required for canceling render on demand, as quickly as possible. */ + struct { + /* Indicates whether there is an on-going `render_samples()` call. */ + bool is_rendering = false; + + /* Indicates whether rendering is requested to be canceled by `cancel()`. */ + bool is_requested = false; + + /* Synchronization between thread which does `render_samples()` and thread which does + * `cancel()`. */ + thread_mutex mutex; + thread_condition_variable condition; + } render_cancel_; + + /* Indicates whether a render result was drawn after latest session reset. + * Used by `ready_to_reset()` to implement logic which feels the most interactive. */ + bool did_draw_after_reset_ = true; + + /* State of the full frame processing and writing to the software. */ + struct { + RenderBuffers *render_buffers = nullptr; + } full_frame_state_; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/path_trace_work.cpp b/intern/cycles/integrator/path_trace_work.cpp new file mode 100644 index 00000000000..d9634acac10 --- /dev/null +++ b/intern/cycles/integrator/path_trace_work.cpp @@ -0,0 +1,203 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "device/device.h" + +#include "integrator/path_trace_work.h" +#include "integrator/path_trace_work_cpu.h" +#include "integrator/path_trace_work_gpu.h" +#include "render/buffers.h" +#include "render/film.h" +#include "render/gpu_display.h" +#include "render/scene.h" + +#include "kernel/kernel_types.h" + +CCL_NAMESPACE_BEGIN + +unique_ptr<PathTraceWork> PathTraceWork::create(Device *device, + Film *film, + DeviceScene *device_scene, + bool *cancel_requested_flag) +{ + if (device->info.type == DEVICE_CPU) { + return make_unique<PathTraceWorkCPU>(device, film, device_scene, cancel_requested_flag); + } + + return make_unique<PathTraceWorkGPU>(device, film, device_scene, cancel_requested_flag); +} + +PathTraceWork::PathTraceWork(Device *device, + Film *film, + DeviceScene *device_scene, + bool *cancel_requested_flag) + : device_(device), + film_(film), + device_scene_(device_scene), + buffers_(make_unique<RenderBuffers>(device)), + effective_buffer_params_(buffers_->params), + cancel_requested_flag_(cancel_requested_flag) +{ +} + +PathTraceWork::~PathTraceWork() +{ +} + +RenderBuffers *PathTraceWork::get_render_buffers() +{ + return buffers_.get(); +} + +void PathTraceWork::set_effective_buffer_params(const BufferParams &effective_full_params, + const BufferParams &effective_big_tile_params, + const BufferParams &effective_buffer_params) +{ + effective_full_params_ = effective_full_params; + effective_big_tile_params_ = effective_big_tile_params; + effective_buffer_params_ = effective_buffer_params; +} + +bool PathTraceWork::has_multiple_works() const +{ + /* Assume if there are multiple works working on the same big tile none of the works gets the + * entire big tile to work on. 
*/ + return !(effective_big_tile_params_.width == effective_buffer_params_.width && + effective_big_tile_params_.height == effective_buffer_params_.height && + effective_big_tile_params_.full_x == effective_buffer_params_.full_x && + effective_big_tile_params_.full_y == effective_buffer_params_.full_y); +} + +void PathTraceWork::copy_to_render_buffers(RenderBuffers *render_buffers) +{ + copy_render_buffers_from_device(); + + const int64_t width = effective_buffer_params_.width; + const int64_t height = effective_buffer_params_.height; + const int64_t pass_stride = effective_buffer_params_.pass_stride; + const int64_t row_stride = width * pass_stride; + const int64_t data_size = row_stride * height * sizeof(float); + + const int64_t offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y; + const int64_t offset_in_floats = offset_y * row_stride; + + const float *src = buffers_->buffer.data(); + float *dst = render_buffers->buffer.data() + offset_in_floats; + + memcpy(dst, src, data_size); +} + +void PathTraceWork::copy_from_render_buffers(const RenderBuffers *render_buffers) +{ + const int64_t width = effective_buffer_params_.width; + const int64_t height = effective_buffer_params_.height; + const int64_t pass_stride = effective_buffer_params_.pass_stride; + const int64_t row_stride = width * pass_stride; + const int64_t data_size = row_stride * height * sizeof(float); + + const int64_t offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y; + const int64_t offset_in_floats = offset_y * row_stride; + + const float *src = render_buffers->buffer.data() + offset_in_floats; + float *dst = buffers_->buffer.data(); + + memcpy(dst, src, data_size); + + copy_render_buffers_to_device(); +} + +void PathTraceWork::copy_from_denoised_render_buffers(const RenderBuffers *render_buffers) +{ + const int64_t width = effective_buffer_params_.width; + const int64_t offset_y = effective_buffer_params_.full_y - 
effective_big_tile_params_.full_y; + const int64_t offset = offset_y * width; + + render_buffers_host_copy_denoised( + buffers_.get(), effective_buffer_params_, render_buffers, effective_buffer_params_, offset); + + copy_render_buffers_to_device(); +} + +bool PathTraceWork::get_render_tile_pixels(const PassAccessor &pass_accessor, + const PassAccessor::Destination &destination) +{ + const int offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y; + const int width = effective_buffer_params_.width; + + PassAccessor::Destination slice_destination = destination; + slice_destination.offset += offset_y * width; + + return pass_accessor.get_render_tile_pixels(buffers_.get(), slice_destination); +} + +bool PathTraceWork::set_render_tile_pixels(PassAccessor &pass_accessor, + const PassAccessor::Source &source) +{ + const int offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y; + const int width = effective_buffer_params_.width; + + PassAccessor::Source slice_source = source; + slice_source.offset += offset_y * width; + + return pass_accessor.set_render_tile_pixels(buffers_.get(), slice_source); +} + +PassAccessor::PassAccessInfo PathTraceWork::get_display_pass_access_info(PassMode pass_mode) const +{ + const KernelFilm &kfilm = device_scene_->data.film; + const KernelBackground &kbackground = device_scene_->data.background; + + const BufferParams ¶ms = buffers_->params; + + const BufferPass *display_pass = params.get_actual_display_pass(film_->get_display_pass()); + + PassAccessor::PassAccessInfo pass_access_info; + pass_access_info.type = display_pass->type; + pass_access_info.offset = PASS_UNUSED; + + if (pass_mode == PassMode::DENOISED) { + pass_access_info.mode = PassMode::DENOISED; + pass_access_info.offset = params.get_pass_offset(pass_access_info.type, PassMode::DENOISED); + } + + if (pass_access_info.offset == PASS_UNUSED) { + pass_access_info.mode = PassMode::NOISY; + pass_access_info.offset = 
params.get_pass_offset(pass_access_info.type); + } + + pass_access_info.use_approximate_shadow_catcher = kfilm.use_approximate_shadow_catcher; + pass_access_info.use_approximate_shadow_catcher_background = + kfilm.use_approximate_shadow_catcher && !kbackground.transparent; + + return pass_access_info; +} + +PassAccessor::Destination PathTraceWork::get_gpu_display_destination_template( + const GPUDisplay *gpu_display) const +{ + PassAccessor::Destination destination(film_->get_display_pass()); + + const int2 display_texture_size = gpu_display->get_texture_size(); + const int texture_x = effective_buffer_params_.full_x - effective_full_params_.full_x; + const int texture_y = effective_buffer_params_.full_y - effective_full_params_.full_y; + + destination.offset = texture_y * display_texture_size.x + texture_x; + destination.stride = display_texture_size.x; + + return destination; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/path_trace_work.h b/intern/cycles/integrator/path_trace_work.h new file mode 100644 index 00000000000..97b97f3d888 --- /dev/null +++ b/intern/cycles/integrator/path_trace_work.h @@ -0,0 +1,194 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "integrator/pass_accessor.h" +#include "render/buffers.h" +#include "render/pass.h" +#include "util/util_types.h" +#include "util/util_unique_ptr.h" + +CCL_NAMESPACE_BEGIN + +class BufferParams; +class Device; +class DeviceScene; +class Film; +class GPUDisplay; +class RenderBuffers; + +class PathTraceWork { + public: + struct RenderStatistics { + float occupancy = 1.0f; + }; + + /* Create path trace work which fits best the device. + * + * The cancel request flag is used for a cheap check whether cancel is to be performed as soon as + * possible. This could be, for example, request to cancel rendering on camera navigation in + * viewport. */ + static unique_ptr<PathTraceWork> create(Device *device, + Film *film, + DeviceScene *device_scene, + bool *cancel_requested_flag); + + virtual ~PathTraceWork(); + + /* Access the render buffers. + * + * Is only supposed to be used by the PathTrace to update buffer allocation and slicing to + * correspond to the big tile size and relative device performance. */ + RenderBuffers *get_render_buffers(); + + /* Set effective parameters of the big tile and the work itself. */ + void set_effective_buffer_params(const BufferParams &effective_full_params, + const BufferParams &effective_big_tile_params, + const BufferParams &effective_buffer_params); + + /* Check whether the big tile is being worked on by multiple path trace works. */ + bool has_multiple_works() const; + + /* Allocate working memory for execution. Must be called before init_execution(). */ + virtual void alloc_work_memory(){}; + + /* Initialize execution of kernels. + * Will ensure that all device queues are initialized for execution. + * + * This method is to be called after any change in the scene. It is not needed to call it prior + * to every call of the `render_samples()`. */ + virtual void init_execution() = 0; + + /* Render given number of samples as a synchronous blocking call. 
+ * The samples are added to the render buffer associated with this work. */ + virtual void render_samples(RenderStatistics &statistics, int start_sample, int samples_num) = 0; + + /* Copy render result from this work to the corresponding place of the GPU display. + * + * The `pass_mode` indicates whether to access denoised or noisy version of the display pass. The + * noisy pass mode will be passed here when it is known that the buffer does not have denoised + * passes yet (because denoiser did not run). If the denoised pass is requested and denoiser is + * not used then this function will fall-back to the noisy pass instead. */ + virtual void copy_to_gpu_display(GPUDisplay *gpu_display, + PassMode pass_mode, + int num_samples) = 0; + + virtual void destroy_gpu_resources(GPUDisplay *gpu_display) = 0; + + /* Copy data from/to given render buffers. + * Will copy pixels from a corresponding place (from multi-device point of view) of the render + * buffers, and copy work's render buffers to the corresponding place of the destination. */ + + /* Notes: + * - Copies work's render buffer from the device. + * - Copies CPU-side buffer of the given buffer + * - Does not copy the buffer to its device. */ + void copy_to_render_buffers(RenderBuffers *render_buffers); + + /* Notes: + * - Does not copy given render buffers from the device. + * - Copies work's render buffer to its device. */ + void copy_from_render_buffers(const RenderBuffers *render_buffers); + + /* Special version of the `copy_from_render_buffers()` which only copies denoised passes from the + * given render buffers, leaving rest of the passes. + * + * Same notes about device copying applies to this call as well. */ + void copy_from_denoised_render_buffers(const RenderBuffers *render_buffers); + + /* Copy render buffers to/from device using an appropriate device queue when needed so that + * things are executed in order with the `render_samples()`. 
*/ + virtual bool copy_render_buffers_from_device() = 0; + virtual bool copy_render_buffers_to_device() = 0; + + /* Zero render buffers to/from device using an appropriate device queue when needed so that + * things are executed in order with the `render_samples()`. */ + virtual bool zero_render_buffers() = 0; + + /* Access pixels rendered by this work and copy them to the corresponding location in the + * destination. + * + * NOTE: Does not perform copy of buffers from the device. Use `copy_render_tile_from_device()` + * to update host-side data. */ + bool get_render_tile_pixels(const PassAccessor &pass_accessor, + const PassAccessor::Destination &destination); + + /* Set pass data for baking. */ + bool set_render_tile_pixels(PassAccessor &pass_accessor, const PassAccessor::Source &source); + + /* Perform convergence test on the render buffer, and filter the convergence mask. + * Returns number of active pixels (the ones which did not converge yet). */ + virtual int adaptive_sampling_converge_filter_count_active(float threshold, bool reset) = 0; + + /* Run cryptomatte pass post-processing kernels. */ + virtual void cryptomatte_postproces() = 0; + + /* Cheap-ish request to see whether rendering is requested and is to be stopped as soon as + * possible, without waiting for any samples to be finished. */ + inline bool is_cancel_requested() const + { + /* NOTE: Rely on the fact that on x86 CPU reading scalar can happen without atomic even in + * threaded environment. */ + return *cancel_requested_flag_; + } + + /* Access to the device which is used to path trace this work on. 
*/ + Device *get_device() const + { + return device_; + } + + protected: + PathTraceWork(Device *device, + Film *film, + DeviceScene *device_scene, + bool *cancel_requested_flag); + + PassAccessor::PassAccessInfo get_display_pass_access_info(PassMode pass_mode) const; + + /* Get destination which offset and stride are configured so that writing to it will write to a + * proper location of GPU display texture, taking current tile and device slice into account. */ + PassAccessor::Destination get_gpu_display_destination_template( + const GPUDisplay *gpu_display) const; + + /* Device which will be used for path tracing. + * Note that it is an actual render device (and never is a multi-device). */ + Device *device_; + + /* Film is used to access display pass configuration for GPU display update. + * Note that only fields which are not a part of kernel data can be accessed via the Film. */ + Film *film_; + + /* Device side scene storage, that may be used for integrator logic. */ + DeviceScene *device_scene_; + + /* Render buffers where sampling is being accumulated into, allocated for a fraction of the big + * tile which is being rendered by this work. + * It also defines possible subset of a big tile in the case of multi-device rendering. */ + unique_ptr<RenderBuffers> buffers_; + + /* Effective parameters of the full, big tile, and current work render buffer. + * The latter might be different from buffers_->params when there is a resolution divider + * involved. 
*/ + BufferParams effective_full_params_; + BufferParams effective_big_tile_params_; + BufferParams effective_buffer_params_; + + bool *cancel_requested_flag_ = nullptr; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/path_trace_work_cpu.cpp b/intern/cycles/integrator/path_trace_work_cpu.cpp new file mode 100644 index 00000000000..b9a33b64051 --- /dev/null +++ b/intern/cycles/integrator/path_trace_work_cpu.cpp @@ -0,0 +1,281 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "integrator/path_trace_work_cpu.h" + +#include "device/cpu/kernel.h" +#include "device/device.h" + +#include "integrator/pass_accessor_cpu.h" + +#include "render/buffers.h" +#include "render/gpu_display.h" +#include "render/scene.h" + +#include "util/util_atomic.h" +#include "util/util_logging.h" +#include "util/util_tbb.h" + +CCL_NAMESPACE_BEGIN + +/* Create TBB arena for execution of path tracing and rendering tasks. */ +static inline tbb::task_arena local_tbb_arena_create(const Device *device) +{ + /* TODO: limit this to number of threads of CPU device, it may be smaller than + * the system number of threads when we reduce the number of CPU threads in + * CPU + GPU rendering to dedicate some cores to handling the GPU device. */ + return tbb::task_arena(device->info.cpu_threads); +} + +/* Get CPUKernelThreadGlobals for the current thread. 
*/ +static inline CPUKernelThreadGlobals *kernel_thread_globals_get( + vector<CPUKernelThreadGlobals> &kernel_thread_globals) +{ + const int thread_index = tbb::this_task_arena::current_thread_index(); + DCHECK_GE(thread_index, 0); + DCHECK_LE(thread_index, kernel_thread_globals.size()); + + return &kernel_thread_globals[thread_index]; +} + +PathTraceWorkCPU::PathTraceWorkCPU(Device *device, + Film *film, + DeviceScene *device_scene, + bool *cancel_requested_flag) + : PathTraceWork(device, film, device_scene, cancel_requested_flag), + kernels_(*(device->get_cpu_kernels())) +{ + DCHECK_EQ(device->info.type, DEVICE_CPU); +} + +void PathTraceWorkCPU::init_execution() +{ + /* Cache per-thread kernel globals. */ + device_->get_cpu_kernel_thread_globals(kernel_thread_globals_); +} + +void PathTraceWorkCPU::render_samples(RenderStatistics &statistics, + int start_sample, + int samples_num) +{ + const int64_t image_width = effective_buffer_params_.width; + const int64_t image_height = effective_buffer_params_.height; + const int64_t total_pixels_num = image_width * image_height; + + for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) { + kernel_globals.start_profiling(); + } + + tbb::task_arena local_arena = local_tbb_arena_create(device_); + local_arena.execute([&]() { + tbb::parallel_for(int64_t(0), total_pixels_num, [&](int64_t work_index) { + if (is_cancel_requested()) { + return; + } + + const int y = work_index / image_width; + const int x = work_index - y * image_width; + + KernelWorkTile work_tile; + work_tile.x = effective_buffer_params_.full_x + x; + work_tile.y = effective_buffer_params_.full_y + y; + work_tile.w = 1; + work_tile.h = 1; + work_tile.start_sample = start_sample; + work_tile.num_samples = 1; + work_tile.offset = effective_buffer_params_.offset; + work_tile.stride = effective_buffer_params_.stride; + + CPUKernelThreadGlobals *kernel_globals = kernel_thread_globals_get(kernel_thread_globals_); + + 
render_samples_full_pipeline(kernel_globals, work_tile, samples_num); + }); + }); + + for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) { + kernel_globals.stop_profiling(); + } + + statistics.occupancy = 1.0f; +} + +void PathTraceWorkCPU::render_samples_full_pipeline(KernelGlobals *kernel_globals, + const KernelWorkTile &work_tile, + const int samples_num) +{ + const bool has_shadow_catcher = device_scene_->data.integrator.has_shadow_catcher; + const bool has_bake = device_scene_->data.bake.use; + + IntegratorStateCPU integrator_states[2] = {}; + + IntegratorStateCPU *state = &integrator_states[0]; + IntegratorStateCPU *shadow_catcher_state = &integrator_states[1]; + + KernelWorkTile sample_work_tile = work_tile; + float *render_buffer = buffers_->buffer.data(); + + for (int sample = 0; sample < samples_num; ++sample) { + if (is_cancel_requested()) { + break; + } + + if (has_bake) { + if (!kernels_.integrator_init_from_bake( + kernel_globals, state, &sample_work_tile, render_buffer)) { + break; + } + } + else { + if (!kernels_.integrator_init_from_camera( + kernel_globals, state, &sample_work_tile, render_buffer)) { + break; + } + } + + kernels_.integrator_megakernel(kernel_globals, state, render_buffer); + + if (has_shadow_catcher) { + kernels_.integrator_megakernel(kernel_globals, shadow_catcher_state, render_buffer); + } + + ++sample_work_tile.start_sample; + } +} + +void PathTraceWorkCPU::copy_to_gpu_display(GPUDisplay *gpu_display, + PassMode pass_mode, + int num_samples) +{ + half4 *rgba_half = gpu_display->map_texture_buffer(); + if (!rgba_half) { + /* TODO(sergey): Look into using copy_to_gpu_display() if mapping failed. Might be needed for + * some implementations of GPUDisplay which can not map memory? 
*/ + return; + } + + const KernelFilm &kfilm = device_scene_->data.film; + + const PassAccessor::PassAccessInfo pass_access_info = get_display_pass_access_info(pass_mode); + + const PassAccessorCPU pass_accessor(pass_access_info, kfilm.exposure, num_samples); + + PassAccessor::Destination destination = get_gpu_display_destination_template(gpu_display); + destination.pixels_half_rgba = rgba_half; + + tbb::task_arena local_arena = local_tbb_arena_create(device_); + local_arena.execute([&]() { + pass_accessor.get_render_tile_pixels(buffers_.get(), effective_buffer_params_, destination); + }); + + gpu_display->unmap_texture_buffer(); +} + +void PathTraceWorkCPU::destroy_gpu_resources(GPUDisplay * /*gpu_display*/) +{ +} + +bool PathTraceWorkCPU::copy_render_buffers_from_device() +{ + return buffers_->copy_from_device(); +} + +bool PathTraceWorkCPU::copy_render_buffers_to_device() +{ + buffers_->buffer.copy_to_device(); + return true; +} + +bool PathTraceWorkCPU::zero_render_buffers() +{ + buffers_->zero(); + return true; +} + +int PathTraceWorkCPU::adaptive_sampling_converge_filter_count_active(float threshold, bool reset) +{ + const int full_x = effective_buffer_params_.full_x; + const int full_y = effective_buffer_params_.full_y; + const int width = effective_buffer_params_.width; + const int height = effective_buffer_params_.height; + const int offset = effective_buffer_params_.offset; + const int stride = effective_buffer_params_.stride; + + float *render_buffer = buffers_->buffer.data(); + + uint num_active_pixels = 0; + + tbb::task_arena local_arena = local_tbb_arena_create(device_); + + /* Check convergency and do x-filter in a single `parallel_for`, to reduce threading overhead. 
*/ + local_arena.execute([&]() { + tbb::parallel_for(full_y, full_y + height, [&](int y) { + CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0]; + + bool row_converged = true; + uint num_row_pixels_active = 0; + for (int x = 0; x < width; ++x) { + if (!kernels_.adaptive_sampling_convergence_check( + kernel_globals, render_buffer, full_x + x, y, threshold, reset, offset, stride)) { + ++num_row_pixels_active; + row_converged = false; + } + } + + atomic_fetch_and_add_uint32(&num_active_pixels, num_row_pixels_active); + + if (!row_converged) { + kernels_.adaptive_sampling_filter_x( + kernel_globals, render_buffer, y, full_x, width, offset, stride); + } + }); + }); + + if (num_active_pixels) { + local_arena.execute([&]() { + tbb::parallel_for(full_x, full_x + width, [&](int x) { + CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0]; + kernels_.adaptive_sampling_filter_y( + kernel_globals, render_buffer, x, full_y, height, offset, stride); + }); + }); + } + + return num_active_pixels; +} + +void PathTraceWorkCPU::cryptomatte_postproces() +{ + const int width = effective_buffer_params_.width; + const int height = effective_buffer_params_.height; + + float *render_buffer = buffers_->buffer.data(); + + tbb::task_arena local_arena = local_tbb_arena_create(device_); + + /* Check convergency and do x-filter in a single `parallel_for`, to reduce threading overhead. 
*/ + local_arena.execute([&]() { + tbb::parallel_for(0, height, [&](int y) { + CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0]; + int pixel_index = y * width; + + for (int x = 0; x < width; ++x, ++pixel_index) { + kernels_.cryptomatte_postprocess(kernel_globals, render_buffer, pixel_index); + } + }); + }); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/path_trace_work_cpu.h b/intern/cycles/integrator/path_trace_work_cpu.h new file mode 100644 index 00000000000..ab729bbf879 --- /dev/null +++ b/intern/cycles/integrator/path_trace_work_cpu.h @@ -0,0 +1,82 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "kernel/integrator/integrator_state.h" + +#include "device/cpu/kernel_thread_globals.h" +#include "device/device_queue.h" + +#include "integrator/path_trace_work.h" + +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +struct KernelWorkTile; +struct KernelGlobals; + +class CPUKernels; + +/* Implementation of PathTraceWork which schedules work on to queues pixel-by-pixel, + * for CPU devices. + * + * NOTE: For the CPU rendering there are assumptions about TBB arena size and number of concurrent + * queues on the render device which makes this work be only usable on CPU. 
*/ +class PathTraceWorkCPU : public PathTraceWork { + public: + PathTraceWorkCPU(Device *device, + Film *film, + DeviceScene *device_scene, + bool *cancel_requested_flag); + + virtual void init_execution() override; + + virtual void render_samples(RenderStatistics &statistics, + int start_sample, + int samples_num) override; + + virtual void copy_to_gpu_display(GPUDisplay *gpu_display, + PassMode pass_mode, + int num_samples) override; + virtual void destroy_gpu_resources(GPUDisplay *gpu_display) override; + + virtual bool copy_render_buffers_from_device() override; + virtual bool copy_render_buffers_to_device() override; + virtual bool zero_render_buffers() override; + + virtual int adaptive_sampling_converge_filter_count_active(float threshold, bool reset) override; + virtual void cryptomatte_postproces() override; + + protected: + /* Core path tracing routine. Renders given work time on the given queue. */ + void render_samples_full_pipeline(KernelGlobals *kernel_globals, + const KernelWorkTile &work_tile, + const int samples_num); + + /* CPU kernels. */ + const CPUKernels &kernels_; + + /* Copy of kernel globals which is suitable for concurrent access from multiple threads. + * + * More specifically, the `kernel_globals_` is local to each threads and nobody else is + * accessing it, but some "localization" is required to decouple from kernel globals stored + * on the device level. */ + vector<CPUKernelThreadGlobals> kernel_thread_globals_; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp new file mode 100644 index 00000000000..10baf869aa6 --- /dev/null +++ b/intern/cycles/integrator/path_trace_work_gpu.cpp @@ -0,0 +1,933 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "integrator/path_trace_work_gpu.h" + +#include "device/device.h" + +#include "integrator/pass_accessor_gpu.h" +#include "render/buffers.h" +#include "render/gpu_display.h" +#include "render/scene.h" +#include "util/util_logging.h" +#include "util/util_tbb.h" +#include "util/util_time.h" + +#include "kernel/kernel_types.h" + +CCL_NAMESPACE_BEGIN + +PathTraceWorkGPU::PathTraceWorkGPU(Device *device, + Film *film, + DeviceScene *device_scene, + bool *cancel_requested_flag) + : PathTraceWork(device, film, device_scene, cancel_requested_flag), + queue_(device->gpu_queue_create()), + integrator_state_soa_kernel_features_(0), + integrator_queue_counter_(device, "integrator_queue_counter", MEM_READ_WRITE), + integrator_shader_sort_counter_(device, "integrator_shader_sort_counter", MEM_READ_WRITE), + integrator_shader_raytrace_sort_counter_( + device, "integrator_shader_raytrace_sort_counter", MEM_READ_WRITE), + integrator_next_shadow_catcher_path_index_( + device, "integrator_next_shadow_catcher_path_index", MEM_READ_WRITE), + queued_paths_(device, "queued_paths", MEM_READ_WRITE), + num_queued_paths_(device, "num_queued_paths", MEM_READ_WRITE), + work_tiles_(device, "work_tiles", MEM_READ_WRITE), + gpu_display_rgba_half_(device, "display buffer half", MEM_READ_WRITE), + max_num_paths_(queue_->num_concurrent_states(sizeof(IntegratorStateCPU))), + min_num_active_paths_(queue_->num_concurrent_busy_states()), + max_active_path_index_(0) +{ + memset(&integrator_state_gpu_, 0, sizeof(integrator_state_gpu_)); + + /* Limit number of active paths 
to the half of the overall state. This is due to the logic in the + * path compaction which relies on the fact that regeneration does not happen sooner than half of + * the states are available again. */ + min_num_active_paths_ = min(min_num_active_paths_, max_num_paths_ / 2); +} + +void PathTraceWorkGPU::alloc_integrator_soa() +{ + /* IntegrateState allocated as structure of arrays. */ + + /* Check if we already allocated memory for the required features. */ + const uint kernel_features = device_scene_->data.kernel_features; + if ((integrator_state_soa_kernel_features_ & kernel_features) == kernel_features) { + return; + } + integrator_state_soa_kernel_features_ = kernel_features; + + /* Allocate a device only memory buffer before for each struct member, and then + * write the pointers into a struct that resides in constant memory. + * + * TODO: store float3 in separate XYZ arrays. */ +#define KERNEL_STRUCT_BEGIN(name) for (int array_index = 0;; array_index++) { +#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \ + if ((kernel_features & feature) && (integrator_state_gpu_.parent_struct.name == nullptr)) { \ + device_only_memory<type> *array = new device_only_memory<type>(device_, \ + "integrator_state_" #name); \ + array->alloc_to_device(max_num_paths_); \ + integrator_state_soa_.emplace_back(array); \ + integrator_state_gpu_.parent_struct.name = (type *)array->device_pointer; \ + } +#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \ + if ((kernel_features & feature) && \ + (integrator_state_gpu_.parent_struct[array_index].name == nullptr)) { \ + device_only_memory<type> *array = new device_only_memory<type>(device_, \ + "integrator_state_" #name); \ + array->alloc_to_device(max_num_paths_); \ + integrator_state_soa_.emplace_back(array); \ + integrator_state_gpu_.parent_struct[array_index].name = (type *)array->device_pointer; \ + } +#define KERNEL_STRUCT_END(name) \ + break; \ + } +#define KERNEL_STRUCT_END_ARRAY(name, 
array_size) \ + if (array_index == array_size - 1) { \ + break; \ + } \ + } +#include "kernel/integrator/integrator_state_template.h" +#undef KERNEL_STRUCT_BEGIN +#undef KERNEL_STRUCT_MEMBER +#undef KERNEL_STRUCT_ARRAY_MEMBER +#undef KERNEL_STRUCT_END +#undef KERNEL_STRUCT_END_ARRAY +} + +void PathTraceWorkGPU::alloc_integrator_queue() +{ + if (integrator_queue_counter_.size() == 0) { + integrator_queue_counter_.alloc(1); + integrator_queue_counter_.zero_to_device(); + integrator_queue_counter_.copy_from_device(); + integrator_state_gpu_.queue_counter = (IntegratorQueueCounter *) + integrator_queue_counter_.device_pointer; + } + + /* Allocate data for active path index arrays. */ + if (num_queued_paths_.size() == 0) { + num_queued_paths_.alloc(1); + num_queued_paths_.zero_to_device(); + } + + if (queued_paths_.size() == 0) { + queued_paths_.alloc(max_num_paths_); + /* TODO: this could be skip if we had a function to just allocate on device. */ + queued_paths_.zero_to_device(); + } +} + +void PathTraceWorkGPU::alloc_integrator_sorting() +{ + /* Allocate arrays for shader sorting. */ + const int max_shaders = device_scene_->data.max_shaders; + if (integrator_shader_sort_counter_.size() < max_shaders) { + integrator_shader_sort_counter_.alloc(max_shaders); + integrator_shader_sort_counter_.zero_to_device(); + + integrator_shader_raytrace_sort_counter_.alloc(max_shaders); + integrator_shader_raytrace_sort_counter_.zero_to_device(); + + integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = + (int *)integrator_shader_sort_counter_.device_pointer; + integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] = + (int *)integrator_shader_raytrace_sort_counter_.device_pointer; + } +} + +void PathTraceWorkGPU::alloc_integrator_path_split() +{ + if (integrator_next_shadow_catcher_path_index_.size() != 0) { + return; + } + + integrator_next_shadow_catcher_path_index_.alloc(1); + /* TODO(sergey): Use queue? 
*/ + integrator_next_shadow_catcher_path_index_.zero_to_device(); + + integrator_state_gpu_.next_shadow_catcher_path_index = + (int *)integrator_next_shadow_catcher_path_index_.device_pointer; +} + +void PathTraceWorkGPU::alloc_work_memory() +{ + alloc_integrator_soa(); + alloc_integrator_queue(); + alloc_integrator_sorting(); + alloc_integrator_path_split(); +} + +void PathTraceWorkGPU::init_execution() +{ + queue_->init_execution(); + + /* Copy to device side struct in constant memory. */ + device_->const_copy_to( + "__integrator_state", &integrator_state_gpu_, sizeof(integrator_state_gpu_)); +} + +void PathTraceWorkGPU::render_samples(RenderStatistics &statistics, + int start_sample, + int samples_num) +{ + /* Limit number of states for the tile and rely on a greedy scheduling of tiles. This allows to + * add more work (because tiles are smaller, so there is higher chance that more paths will + * become busy after adding new tiles). This is especially important for the shadow catcher which + * schedules work in halves of available number of paths. */ + work_tile_scheduler_.set_max_num_path_states(max_num_paths_ / 8); + + work_tile_scheduler_.reset(effective_buffer_params_, start_sample, samples_num); + + enqueue_reset(); + + int num_iterations = 0; + uint64_t num_busy_accum = 0; + + /* TODO: set a hard limit in case of undetected kernel failures? */ + while (true) { + /* Enqueue work from the scheduler, on start or when there are not enough + * paths to keep the device occupied. */ + bool finished; + if (enqueue_work_tiles(finished)) { + /* Copy stats from the device. */ + queue_->copy_from_device(integrator_queue_counter_); + + if (!queue_->synchronize()) { + break; /* Stop on error. */ + } + } + + if (is_cancel_requested()) { + break; + } + + /* Stop if no more work remaining. */ + if (finished) { + break; + } + + /* Enqueue on of the path iteration kernels. */ + if (enqueue_path_iteration()) { + /* Copy stats from the device. 
*/ + queue_->copy_from_device(integrator_queue_counter_); + + if (!queue_->synchronize()) { + break; /* Stop on error. */ + } + } + + if (is_cancel_requested()) { + break; + } + + num_busy_accum += get_num_active_paths(); + ++num_iterations; + } + + statistics.occupancy = static_cast<float>(num_busy_accum) / num_iterations / max_num_paths_; +} + +DeviceKernel PathTraceWorkGPU::get_most_queued_kernel() const +{ + const IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data(); + + int max_num_queued = 0; + DeviceKernel kernel = DEVICE_KERNEL_NUM; + + for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) { + if (queue_counter->num_queued[i] > max_num_queued) { + kernel = (DeviceKernel)i; + max_num_queued = queue_counter->num_queued[i]; + } + } + + return kernel; +} + +void PathTraceWorkGPU::enqueue_reset() +{ + void *args[] = {&max_num_paths_}; + queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_RESET, max_num_paths_, args); + queue_->zero_to_device(integrator_queue_counter_); + queue_->zero_to_device(integrator_shader_sort_counter_); + queue_->zero_to_device(integrator_shader_raytrace_sort_counter_); + + /* Tiles enqueue need to know number of active paths, which is based on this counter. Zero the + * counter on the host side because `zero_to_device()` is not doing it. */ + if (integrator_queue_counter_.host_pointer) { + memset(integrator_queue_counter_.data(), 0, integrator_queue_counter_.memory_size()); + } +} + +bool PathTraceWorkGPU::enqueue_path_iteration() +{ + /* Find kernel to execute, with max number of queued paths. */ + const IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data(); + + int num_active_paths = 0; + for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) { + num_active_paths += queue_counter->num_queued[i]; + } + + if (num_active_paths == 0) { + return false; + } + + /* Find kernel to execute, with max number of queued paths. 
*/ + const DeviceKernel kernel = get_most_queued_kernel(); + if (kernel == DEVICE_KERNEL_NUM) { + return false; + } + + /* Finish shadows before potentially adding more shadow rays. We can only + * store one shadow ray in the integrator state. */ + if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME) { + if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW]) { + enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW); + return true; + } + else if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW]) { + enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW); + return true; + } + } + + /* Schedule kernel with maximum number of queued items. */ + enqueue_path_iteration(kernel); + return true; +} + +void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel) +{ + void *d_path_index = (void *)NULL; + + /* Create array of path indices for which this kernel is queued to be executed. */ + int work_size = max_active_path_index_; + + IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data(); + int num_queued = queue_counter->num_queued[kernel]; + + if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) { + /* Compute array of active paths, sorted by shader. */ + work_size = num_queued; + d_path_index = (void *)queued_paths_.device_pointer; + + compute_sorted_queued_paths(DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, kernel); + } + else if (num_queued < work_size) { + work_size = num_queued; + d_path_index = (void *)queued_paths_.device_pointer; + + if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW) { + /* Compute array of active shadow paths for specific kernel. 
*/ + compute_queued_paths(DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY, kernel); + } + else { + /* Compute array of active paths for specific kernel. */ + compute_queued_paths(DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY, kernel); + } + } + + DCHECK_LE(work_size, max_num_paths_); + + switch (kernel) { + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST: + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW: + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE: + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: { + /* Ray intersection kernels with integrator state. */ + void *args[] = {&d_path_index, const_cast<int *>(&work_size)}; + + queue_->enqueue(kernel, work_size, args); + break; + } + case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND: + case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT: + case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW: + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE: + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE: + case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME: { + /* Shading kernels with integrator state and render buffer. */ + void *d_render_buffer = (void *)buffers_->buffer.device_pointer; + void *args[] = {&d_path_index, &d_render_buffer, const_cast<int *>(&work_size)}; + + queue_->enqueue(kernel, work_size, args); + break; + } + + default: + LOG(FATAL) << "Unhandled kernel " << device_kernel_as_string(kernel) + << " used for path iteration, should never happen."; + break; + } +} + +void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel) +{ + int d_queued_kernel = queued_kernel; + void *d_counter = integrator_state_gpu_.sort_key_counter[d_queued_kernel]; + assert(d_counter != nullptr); + + /* Compute prefix sum of number of active paths with each shader. 
*/ + { + const int work_size = 1; + int max_shaders = device_scene_->data.max_shaders; + void *args[] = {&d_counter, &max_shaders}; + queue_->enqueue(DEVICE_KERNEL_PREFIX_SUM, work_size, args); + } + + queue_->zero_to_device(num_queued_paths_); + + /* Launch kernel to fill the active paths arrays. */ + { + /* TODO: this could be smaller for terminated paths based on amount of work we want + * to schedule. */ + const int work_size = max_active_path_index_; + + void *d_queued_paths = (void *)queued_paths_.device_pointer; + void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer; + void *args[] = {const_cast<int *>(&work_size), + &d_queued_paths, + &d_num_queued_paths, + &d_counter, + &d_queued_kernel}; + + queue_->enqueue(kernel, work_size, args); + } + + if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE) { + queue_->zero_to_device(integrator_shader_sort_counter_); + } + else if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) { + queue_->zero_to_device(integrator_shader_raytrace_sort_counter_); + } + else { + assert(0); + } +} + +void PathTraceWorkGPU::compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel) +{ + int d_queued_kernel = queued_kernel; + + /* Launch kernel to fill the active paths arrays. */ + const int work_size = max_active_path_index_; + void *d_queued_paths = (void *)queued_paths_.device_pointer; + void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer; + void *args[] = { + const_cast<int *>(&work_size), &d_queued_paths, &d_num_queued_paths, &d_queued_kernel}; + + queue_->zero_to_device(num_queued_paths_); + queue_->enqueue(kernel, work_size, args); +} + +void PathTraceWorkGPU::compact_states(const int num_active_paths) +{ + if (num_active_paths == 0) { + max_active_path_index_ = 0; + } + + /* Compact fragmented path states into the start of the array, moving any paths + * with index higher than the number of active paths into the gaps. 
*/ + if (max_active_path_index_ == num_active_paths) { + return; + } + + void *d_compact_paths = (void *)queued_paths_.device_pointer; + void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer; + + /* Create array with terminated paths that we can write to. */ + { + /* TODO: can the work size be reduced here? */ + int offset = num_active_paths; + int work_size = num_active_paths; + void *args[] = {&work_size, &d_compact_paths, &d_num_queued_paths, &offset}; + queue_->zero_to_device(num_queued_paths_); + queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY, work_size, args); + } + + /* Create array of paths that we need to compact, where the path index is bigger + * than the number of active paths. */ + { + int work_size = max_active_path_index_; + void *args[] = { + &work_size, &d_compact_paths, &d_num_queued_paths, const_cast<int *>(&num_active_paths)}; + queue_->zero_to_device(num_queued_paths_); + queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY, work_size, args); + } + + queue_->copy_from_device(num_queued_paths_); + queue_->synchronize(); + + int num_compact_paths = num_queued_paths_.data()[0]; + + /* Move paths into gaps. */ + if (num_compact_paths > 0) { + int work_size = num_compact_paths; + int active_states_offset = 0; + int terminated_states_offset = num_active_paths; + void *args[] = { + &d_compact_paths, &active_states_offset, &terminated_states_offset, &work_size}; + queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES, work_size, args); + } + + queue_->synchronize(); + + /* Adjust max active path index now we know which part of the array is actually used. */ + max_active_path_index_ = num_active_paths; +} + +bool PathTraceWorkGPU::enqueue_work_tiles(bool &finished) +{ + /* If there are existing paths wait them to go to intersect closest kernel, which will align the + * wavefront of the existing and newely added paths. 
*/ + /* TODO: Check whether counting new intersection kernels here will have positive affect on the + * performance. */ + const DeviceKernel kernel = get_most_queued_kernel(); + if (kernel != DEVICE_KERNEL_NUM && kernel != DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST) { + return false; + } + + int num_active_paths = get_num_active_paths(); + + /* Don't schedule more work if cancelling. */ + if (is_cancel_requested()) { + if (num_active_paths == 0) { + finished = true; + } + return false; + } + + finished = false; + + vector<KernelWorkTile> work_tiles; + + int max_num_camera_paths = max_num_paths_; + int num_predicted_splits = 0; + + if (has_shadow_catcher()) { + /* When there are shadow catchers in the scene bounce from them will split the state. So we + * make sure there is enough space in the path states array to fit split states. + * + * Basically, when adding N new paths we ensure that there is 2*N available path states, so + * that all the new paths can be split. + * + * Note that it is possible that some of the current states can still split, so need to make + * sure there is enough space for them as well. */ + + /* Number of currently in-flight states which can still split. */ + const int num_scheduled_possible_split = shadow_catcher_count_possible_splits(); + + const int num_available_paths = max_num_paths_ - num_active_paths; + const int num_new_paths = num_available_paths / 2; + max_num_camera_paths = max(num_active_paths, + num_active_paths + num_new_paths - num_scheduled_possible_split); + num_predicted_splits += num_scheduled_possible_split + num_new_paths; + } + + /* Schedule when we're out of paths or there are too few paths to keep the + * device occupied. */ + int num_paths = num_active_paths; + if (num_paths == 0 || num_paths < min_num_active_paths_) { + /* Get work tiles until the maximum number of path is reached. 
*/ + while (num_paths < max_num_camera_paths) { + KernelWorkTile work_tile; + if (work_tile_scheduler_.get_work(&work_tile, max_num_camera_paths - num_paths)) { + work_tiles.push_back(work_tile); + num_paths += work_tile.w * work_tile.h * work_tile.num_samples; + } + else { + break; + } + } + + /* If we couldn't get any more tiles, we're done. */ + if (work_tiles.size() == 0 && num_paths == 0) { + finished = true; + return false; + } + } + + /* Initialize paths from work tiles. */ + if (work_tiles.size() == 0) { + return false; + } + + /* Compact state array when number of paths becomes small relative to the + * known maximum path index, which makes computing active index arrays slow. */ + compact_states(num_active_paths); + + if (has_shadow_catcher()) { + integrator_next_shadow_catcher_path_index_.data()[0] = num_paths; + queue_->copy_to_device(integrator_next_shadow_catcher_path_index_); + } + + enqueue_work_tiles((device_scene_->data.bake.use) ? DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE : + DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA, + work_tiles.data(), + work_tiles.size(), + num_active_paths, + num_predicted_splits); + + return true; +} + +void PathTraceWorkGPU::enqueue_work_tiles(DeviceKernel kernel, + const KernelWorkTile work_tiles[], + const int num_work_tiles, + const int num_active_paths, + const int num_predicted_splits) +{ + /* Copy work tiles to device. 
*/ + if (work_tiles_.size() < num_work_tiles) { + work_tiles_.alloc(num_work_tiles); + } + + int path_index_offset = num_active_paths; + int max_tile_work_size = 0; + for (int i = 0; i < num_work_tiles; i++) { + KernelWorkTile &work_tile = work_tiles_.data()[i]; + work_tile = work_tiles[i]; + + const int tile_work_size = work_tile.w * work_tile.h * work_tile.num_samples; + + work_tile.path_index_offset = path_index_offset; + work_tile.work_size = tile_work_size; + + path_index_offset += tile_work_size; + + max_tile_work_size = max(max_tile_work_size, tile_work_size); + } + + queue_->copy_to_device(work_tiles_); + + void *d_work_tiles = (void *)work_tiles_.device_pointer; + void *d_render_buffer = (void *)buffers_->buffer.device_pointer; + + /* Launch kernel. */ + void *args[] = {&d_work_tiles, + const_cast<int *>(&num_work_tiles), + &d_render_buffer, + const_cast<int *>(&max_tile_work_size)}; + + queue_->enqueue(kernel, max_tile_work_size * num_work_tiles, args); + + max_active_path_index_ = path_index_offset + num_predicted_splits; +} + +int PathTraceWorkGPU::get_num_active_paths() +{ + /* TODO: this is wrong, does not account for duplicates with shadow! */ + IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data(); + + int num_paths = 0; + for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) { + DCHECK_GE(queue_counter->num_queued[i], 0) + << "Invalid number of queued states for kernel " + << device_kernel_as_string(static_cast<DeviceKernel>(i)); + num_paths += queue_counter->num_queued[i]; + } + + return num_paths; +} + +bool PathTraceWorkGPU::should_use_graphics_interop() +{ + /* There are few aspects with the graphics interop when using multiple devices caused by the fact + * that the GPUDisplay has a single texture: + * + * CUDA will return `CUDA_ERROR_NOT_SUPPORTED` from `cuGraphicsGLRegisterBuffer()` when + * attempting to register OpenGL PBO which has been mapped. 
Which makes sense, because + * otherwise one would run into a conflict of where the source of truth is. */ + if (has_multiple_works()) { + return false; + } + + if (!interop_use_checked_) { + Device *device = queue_->device; + interop_use_ = device->should_use_graphics_interop(); + + if (interop_use_) { + VLOG(2) << "Will be using graphics interop GPU display update."; + } + else { + VLOG(2) << "Will be using naive GPU display update."; + } + + interop_use_checked_ = true; + } + + return interop_use_; +} + +void PathTraceWorkGPU::copy_to_gpu_display(GPUDisplay *gpu_display, + PassMode pass_mode, + int num_samples) +{ + if (device_->have_error()) { + /* Don't attempt to update GPU display if the device has errors: the error state will make + * wrong decisions to happen about interop, causing more chained bugs. */ + return; + } + + if (!buffers_->buffer.device_pointer) { + LOG(WARNING) << "Request for GPU display update without allocated render buffers."; + return; + } + + if (should_use_graphics_interop()) { + if (copy_to_gpu_display_interop(gpu_display, pass_mode, num_samples)) { + return; + } + + /* If error happens when trying to use graphics interop fallback to the native implementation + * and don't attempt to use interop for the further updates. 
*/ + interop_use_ = false; + } + + copy_to_gpu_display_naive(gpu_display, pass_mode, num_samples); +} + +void PathTraceWorkGPU::copy_to_gpu_display_naive(GPUDisplay *gpu_display, + PassMode pass_mode, + int num_samples) +{ + const int full_x = effective_buffer_params_.full_x; + const int full_y = effective_buffer_params_.full_y; + const int width = effective_buffer_params_.width; + const int height = effective_buffer_params_.height; + const int final_width = buffers_->params.width; + const int final_height = buffers_->params.height; + + const int texture_x = full_x - effective_full_params_.full_x; + const int texture_y = full_y - effective_full_params_.full_y; + + /* Re-allocate display memory if needed, and make sure the device pointer is allocated. + * + * NOTE: allocation happens to the final resolution so that no re-allocation happens on every + * change of the resolution divider. However, if the display becomes smaller, shrink the + * allocated memory as well. */ + if (gpu_display_rgba_half_.data_width != final_width || + gpu_display_rgba_half_.data_height != final_height) { + gpu_display_rgba_half_.alloc(final_width, final_height); + /* TODO(sergey): There should be a way to make sure device-side memory is allocated without + * transfering zeroes to the device. 
*/ + queue_->zero_to_device(gpu_display_rgba_half_); + } + + PassAccessor::Destination destination(film_->get_display_pass()); + destination.d_pixels_half_rgba = gpu_display_rgba_half_.device_pointer; + + get_render_tile_film_pixels(destination, pass_mode, num_samples); + + gpu_display_rgba_half_.copy_from_device(); + + gpu_display->copy_pixels_to_texture( + gpu_display_rgba_half_.data(), texture_x, texture_y, width, height); +} + +bool PathTraceWorkGPU::copy_to_gpu_display_interop(GPUDisplay *gpu_display, + PassMode pass_mode, + int num_samples) +{ + if (!device_graphics_interop_) { + device_graphics_interop_ = queue_->graphics_interop_create(); + } + + const DeviceGraphicsInteropDestination graphics_interop_dst = + gpu_display->graphics_interop_get(); + device_graphics_interop_->set_destination(graphics_interop_dst); + + const device_ptr d_rgba_half = device_graphics_interop_->map(); + if (!d_rgba_half) { + return false; + } + + PassAccessor::Destination destination = get_gpu_display_destination_template(gpu_display); + destination.d_pixels_half_rgba = d_rgba_half; + + get_render_tile_film_pixels(destination, pass_mode, num_samples); + + device_graphics_interop_->unmap(); + + return true; +} + +void PathTraceWorkGPU::destroy_gpu_resources(GPUDisplay *gpu_display) +{ + if (!device_graphics_interop_) { + return; + } + gpu_display->graphics_interop_activate(); + device_graphics_interop_ = nullptr; + gpu_display->graphics_interop_deactivate(); +} + +void PathTraceWorkGPU::get_render_tile_film_pixels(const PassAccessor::Destination &destination, + PassMode pass_mode, + int num_samples) +{ + const KernelFilm &kfilm = device_scene_->data.film; + + const PassAccessor::PassAccessInfo pass_access_info = get_display_pass_access_info(pass_mode); + const PassAccessorGPU pass_accessor(queue_.get(), pass_access_info, kfilm.exposure, num_samples); + + pass_accessor.get_render_tile_pixels(buffers_.get(), effective_buffer_params_, destination); +} + +int 
PathTraceWorkGPU::adaptive_sampling_converge_filter_count_active(float threshold, bool reset) +{ + const int num_active_pixels = adaptive_sampling_convergence_check_count_active(threshold, reset); + + if (num_active_pixels) { + enqueue_adaptive_sampling_filter_x(); + enqueue_adaptive_sampling_filter_y(); + queue_->synchronize(); + } + + return num_active_pixels; +} + +int PathTraceWorkGPU::adaptive_sampling_convergence_check_count_active(float threshold, bool reset) +{ + device_vector<uint> num_active_pixels(device_, "num_active_pixels", MEM_READ_WRITE); + num_active_pixels.alloc(1); + + queue_->zero_to_device(num_active_pixels); + + const int work_size = effective_buffer_params_.width * effective_buffer_params_.height; + + void *args[] = {&buffers_->buffer.device_pointer, + const_cast<int *>(&effective_buffer_params_.full_x), + const_cast<int *>(&effective_buffer_params_.full_y), + const_cast<int *>(&effective_buffer_params_.width), + const_cast<int *>(&effective_buffer_params_.height), + &threshold, + &reset, + &effective_buffer_params_.offset, + &effective_buffer_params_.stride, + &num_active_pixels.device_pointer}; + + queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK, work_size, args); + + queue_->copy_from_device(num_active_pixels); + queue_->synchronize(); + + return num_active_pixels.data()[0]; +} + +void PathTraceWorkGPU::enqueue_adaptive_sampling_filter_x() +{ + const int work_size = effective_buffer_params_.height; + + void *args[] = {&buffers_->buffer.device_pointer, + &effective_buffer_params_.full_x, + &effective_buffer_params_.full_y, + &effective_buffer_params_.width, + &effective_buffer_params_.height, + &effective_buffer_params_.offset, + &effective_buffer_params_.stride}; + + queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X, work_size, args); +} + +void PathTraceWorkGPU::enqueue_adaptive_sampling_filter_y() +{ + const int work_size = effective_buffer_params_.width; + + void *args[] = 
{&buffers_->buffer.device_pointer, + &effective_buffer_params_.full_x, + &effective_buffer_params_.full_y, + &effective_buffer_params_.width, + &effective_buffer_params_.height, + &effective_buffer_params_.offset, + &effective_buffer_params_.stride}; + + queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y, work_size, args); +} + +void PathTraceWorkGPU::cryptomatte_postproces() +{ + const int work_size = effective_buffer_params_.width * effective_buffer_params_.height; + + void *args[] = {&buffers_->buffer.device_pointer, + const_cast<int *>(&work_size), + &effective_buffer_params_.offset, + &effective_buffer_params_.stride}; + + queue_->enqueue(DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS, work_size, args); +} + +bool PathTraceWorkGPU::copy_render_buffers_from_device() +{ + queue_->copy_from_device(buffers_->buffer); + + /* Synchronize so that the CPU-side buffer is available at the exit of this function. */ + return queue_->synchronize(); +} + +bool PathTraceWorkGPU::copy_render_buffers_to_device() +{ + queue_->copy_to_device(buffers_->buffer); + + /* NOTE: The direct device access to the buffers only happens within this path trace work. The + * rest of communication happens via API calls which involves `copy_render_buffers_from_device()` + * which will perform synchronization as needed. 
*/ + + return true; +} + +bool PathTraceWorkGPU::zero_render_buffers() +{ + queue_->zero_to_device(buffers_->buffer); + + return true; +} + +bool PathTraceWorkGPU::has_shadow_catcher() const +{ + return device_scene_->data.integrator.has_shadow_catcher; +} + +int PathTraceWorkGPU::shadow_catcher_count_possible_splits() +{ + if (max_active_path_index_ == 0) { + return 0; + } + + if (!has_shadow_catcher()) { + return 0; + } + + queue_->zero_to_device(num_queued_paths_); + + const int work_size = max_active_path_index_; + void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer; + void *args[] = {const_cast<int *>(&work_size), &d_num_queued_paths}; + + queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS, work_size, args); + queue_->copy_from_device(num_queued_paths_); + queue_->synchronize(); + + return num_queued_paths_.data()[0]; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h new file mode 100644 index 00000000000..38788122b0d --- /dev/null +++ b/intern/cycles/integrator/path_trace_work_gpu.h @@ -0,0 +1,165 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "kernel/integrator/integrator_state.h" + +#include "device/device_graphics_interop.h" +#include "device/device_memory.h" +#include "device/device_queue.h" + +#include "integrator/path_trace_work.h" +#include "integrator/work_tile_scheduler.h" + +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +struct KernelWorkTile; + +/* Implementation of PathTraceWork which schedules work to the device in tiles which are sized + * to match device queue's number of path states. + * This implementation suits best devices which have a lot of integrator states, such as GPU. */ +class PathTraceWorkGPU : public PathTraceWork { + public: + PathTraceWorkGPU(Device *device, + Film *film, + DeviceScene *device_scene, + bool *cancel_requested_flag); + + virtual void alloc_work_memory() override; + virtual void init_execution() override; + + virtual void render_samples(RenderStatistics &statistics, + int start_sample, + int samples_num) override; + + virtual void copy_to_gpu_display(GPUDisplay *gpu_display, + PassMode pass_mode, + int num_samples) override; + virtual void destroy_gpu_resources(GPUDisplay *gpu_display) override; + + virtual bool copy_render_buffers_from_device() override; + virtual bool copy_render_buffers_to_device() override; + virtual bool zero_render_buffers() override; + + virtual int adaptive_sampling_converge_filter_count_active(float threshold, bool reset) override; + virtual void cryptomatte_postproces() override; + + protected: + void alloc_integrator_soa(); + void alloc_integrator_queue(); + void alloc_integrator_sorting(); + void alloc_integrator_path_split(); + + /* Returns DEVICE_KERNEL_NUM if there are no scheduled kernels. 
*/ + DeviceKernel get_most_queued_kernel() const; + + void enqueue_reset(); + + bool enqueue_work_tiles(bool &finished); + void enqueue_work_tiles(DeviceKernel kernel, + const KernelWorkTile work_tiles[], + const int num_work_tiles, + const int num_active_paths, + const int num_predicted_splits); + + bool enqueue_path_iteration(); + void enqueue_path_iteration(DeviceKernel kernel); + + void compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel); + void compute_sorted_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel); + + void compact_states(const int num_active_paths); + + int get_num_active_paths(); + + /* Check whether graphics interop can be used for the GPUDisplay update. */ + bool should_use_graphics_interop(); + + /* Naive implementation of the `copy_to_gpu_display()` which performs film conversion on the + * device, then copies pixels to the host and pushes them to the `gpu_display`. */ + void copy_to_gpu_display_naive(GPUDisplay *gpu_display, PassMode pass_mode, int num_samples); + + /* Implementation of `copy_to_gpu_display()` which uses driver's OpenGL/GPU interoperability + * functionality, avoiding copy of pixels to the host. */ + bool copy_to_gpu_display_interop(GPUDisplay *gpu_display, PassMode pass_mode, int num_samples); + + /* Synchronously run film conversion kernel and store display result in the given destination. */ + void get_render_tile_film_pixels(const PassAccessor::Destination &destination, + PassMode pass_mode, + int num_samples); + + int adaptive_sampling_convergence_check_count_active(float threshold, bool reset); + void enqueue_adaptive_sampling_filter_x(); + void enqueue_adaptive_sampling_filter_y(); + + bool has_shadow_catcher() const; + + /* Count how many currently scheduled paths can still split. */ + int shadow_catcher_count_possible_splits(); + + /* Integrator queue. */ + unique_ptr<DeviceQueue> queue_; + + /* Scheduler which gives work to path tracing threads. 
*/ + WorkTileScheduler work_tile_scheduler_; + + /* Integrate state for paths. */ + IntegratorStateGPU integrator_state_gpu_; + /* SoA arrays for integrator state. */ + vector<unique_ptr<device_memory>> integrator_state_soa_; + uint integrator_state_soa_kernel_features_; + /* Keep track of number of queued kernels. */ + device_vector<IntegratorQueueCounter> integrator_queue_counter_; + /* Shader sorting. */ + device_vector<int> integrator_shader_sort_counter_; + device_vector<int> integrator_shader_raytrace_sort_counter_; + /* Path split. */ + device_vector<int> integrator_next_shadow_catcher_path_index_; + + /* Temporary buffer to get an array of queued path for a particular kernel. */ + device_vector<int> queued_paths_; + device_vector<int> num_queued_paths_; + + /* Temporary buffer for passing work tiles to kernel. */ + device_vector<KernelWorkTile> work_tiles_; + + /* Temporary buffer used by the copy_to_gpu_display() whenever graphics interoperability is not + * available. Is allocated on-demand. */ + device_vector<half4> gpu_display_rgba_half_; + + unique_ptr<DeviceGraphicsInterop> device_graphics_interop_; + + /* Cached result of device->should_use_graphics_interop(). */ + bool interop_use_checked_ = false; + bool interop_use_ = false; + + /* Maximum number of concurrent integrator states. */ + int max_num_paths_; + + /* Minimum number of paths which keeps the device bust. If the actual number of paths falls below + * this value more work will be scheduled. */ + int min_num_active_paths_; + + /* Maximum path index, effective number of paths used may be smaller than + * the size of the integrator_state_ buffer so can avoid iterating over the + * full buffer. 
*/ + int max_active_path_index_; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/render_scheduler.cpp b/intern/cycles/integrator/render_scheduler.cpp new file mode 100644 index 00000000000..4eb1dd941f9 --- /dev/null +++ b/intern/cycles/integrator/render_scheduler.cpp @@ -0,0 +1,1187 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "integrator/render_scheduler.h" + +#include "render/session.h" +#include "render/tile.h" +#include "util/util_logging.h" +#include "util/util_math.h" +#include "util/util_time.h" + +CCL_NAMESPACE_BEGIN + +/* -------------------------------------------------------------------- + * Render scheduler. 
+ */ + +RenderScheduler::RenderScheduler(TileManager &tile_manager, const SessionParams ¶ms) + : headless_(params.headless), + background_(params.background), + pixel_size_(params.pixel_size), + tile_manager_(tile_manager), + default_start_resolution_divider_(pixel_size_ * 8) +{ + use_progressive_noise_floor_ = !background_; +} + +void RenderScheduler::set_need_schedule_cryptomatte(bool need_schedule_cryptomatte) +{ + need_schedule_cryptomatte_ = need_schedule_cryptomatte; +} + +void RenderScheduler::set_need_schedule_rebalance(bool need_schedule_rebalance) +{ + need_schedule_rebalance_works_ = need_schedule_rebalance; +} + +bool RenderScheduler::is_background() const +{ + return background_; +} + +void RenderScheduler::set_denoiser_params(const DenoiseParams ¶ms) +{ + denoiser_params_ = params; +} + +void RenderScheduler::set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling) +{ + adaptive_sampling_ = adaptive_sampling; +} + +bool RenderScheduler::is_adaptive_sampling_used() const +{ + return adaptive_sampling_.use; +} + +void RenderScheduler::set_start_sample(int start_sample) +{ + start_sample_ = start_sample; +} + +int RenderScheduler::get_start_sample() const +{ + return start_sample_; +} + +void RenderScheduler::set_num_samples(int num_samples) +{ + num_samples_ = num_samples; +} + +int RenderScheduler::get_num_samples() const +{ + return num_samples_; +} + +void RenderScheduler::set_time_limit(double time_limit) +{ + time_limit_ = time_limit; +} + +double RenderScheduler::get_time_limit() const +{ + return time_limit_; +} + +int RenderScheduler::get_rendered_sample() const +{ + DCHECK_GT(get_num_rendered_samples(), 0); + + return start_sample_ + get_num_rendered_samples() - 1; +} + +int RenderScheduler::get_num_rendered_samples() const +{ + return state_.num_rendered_samples; +} + +void RenderScheduler::reset(const BufferParams &buffer_params, int num_samples) +{ + buffer_params_ = buffer_params; + + update_start_resolution_divider(); + + 
set_num_samples(num_samples); + + /* In background mode never do lower resolution render preview, as it is not really supported + * by the software. */ + if (background_) { + state_.resolution_divider = 1; + } + else { + /* NOTE: Divide by 2 because of the way how scheduling works: it advances resolution divider + * first and then initialized render work. */ + state_.resolution_divider = start_resolution_divider_ * 2; + } + + state_.num_rendered_samples = 0; + state_.last_display_update_time = 0.0; + state_.last_display_update_sample = -1; + + state_.last_rebalance_time = 0.0; + state_.num_rebalance_requested = 0; + state_.num_rebalance_changes = 0; + state_.last_rebalance_changed = false; + state_.need_rebalance_at_next_work = false; + + /* TODO(sergey): Choose better initial value. */ + /* NOTE: The adaptive sampling settings might not be available here yet. */ + state_.adaptive_sampling_threshold = 0.4f; + + state_.last_work_tile_was_denoised = false; + state_.tile_result_was_written = false; + state_.postprocess_work_scheduled = false; + state_.full_frame_work_scheduled = false; + state_.full_frame_was_written = false; + + state_.path_trace_finished = false; + + state_.start_render_time = 0.0; + state_.end_render_time = 0.0; + state_.time_limit_reached = false; + + state_.occupancy_num_samples = 0; + state_.occupancy = 1.0f; + + first_render_time_.path_trace_per_sample = 0.0; + first_render_time_.denoise_time = 0.0; + first_render_time_.display_update_time = 0.0; + + path_trace_time_.reset(); + denoise_time_.reset(); + adaptive_filter_time_.reset(); + display_update_time_.reset(); + rebalance_time_.reset(); +} + +void RenderScheduler::reset_for_next_tile() +{ + reset(buffer_params_, num_samples_); +} + +bool RenderScheduler::render_work_reschedule_on_converge(RenderWork &render_work) +{ + /* Move to the next resolution divider. Assume adaptive filtering is not needed during + * navigation. 
*/ + if (state_.resolution_divider != pixel_size_) { + return false; + } + + if (render_work_reschedule_on_idle(render_work)) { + return true; + } + + state_.path_trace_finished = true; + + bool denoiser_delayed, denoiser_ready_to_display; + render_work.tile.denoise = work_need_denoise(denoiser_delayed, denoiser_ready_to_display); + + render_work.display.update = work_need_update_display(denoiser_delayed); + render_work.display.use_denoised_result = denoiser_ready_to_display; + + return false; +} + +bool RenderScheduler::render_work_reschedule_on_idle(RenderWork &render_work) +{ + if (!use_progressive_noise_floor_) { + return false; + } + + /* Move to the next resolution divider. Assume adaptive filtering is not needed during + * navigation. */ + if (state_.resolution_divider != pixel_size_) { + return false; + } + + if (adaptive_sampling_.use) { + if (state_.adaptive_sampling_threshold > adaptive_sampling_.threshold) { + state_.adaptive_sampling_threshold = max(state_.adaptive_sampling_threshold / 2, + adaptive_sampling_.threshold); + + render_work.adaptive_sampling.threshold = state_.adaptive_sampling_threshold; + render_work.adaptive_sampling.reset = true; + + return true; + } + } + + return false; +} + +void RenderScheduler::render_work_reschedule_on_cancel(RenderWork &render_work) +{ + VLOG(3) << "Schedule work for cancel."; + + /* Un-schedule samples: they will not be rendered and should not be counted. */ + state_.num_rendered_samples -= render_work.path_trace.num_samples; + + const bool has_rendered_samples = get_num_rendered_samples() != 0; + + /* Reset all fields of the previous work, canelling things like adaptive sampling filtering and + * denoising. + * However, need to preserve write requests, since those will not be possible to recover and + * writes are only to happen once. 
*/ + const bool tile_write = render_work.tile.write; + const bool full_write = render_work.full.write; + + render_work = RenderWork(); + + render_work.tile.write = tile_write; + render_work.full.write = full_write; + + /* Do not write tile if it has zero samples it it, treat it similarly to all other tiles which + * got cancelled. */ + if (!state_.tile_result_was_written && has_rendered_samples) { + render_work.tile.write = true; + } + + if (!state_.full_frame_was_written) { + render_work.full.write = true; + } + + /* Update current tile, but only if any sample was rendered. + * Allows to have latest state of tile visible while full buffer is being processed. + * + * Note that if there are no samples in the current tile its render buffer might have pixels + * remained from previous state. + * + * If the full result was written, then there is no way any updates were made to the render + * buffers. And the buffers might have been freed from the device, so display update is not + * possible. 
*/ + if (has_rendered_samples && !state_.full_frame_was_written) { + render_work.display.update = true; + } +} + +bool RenderScheduler::done() const +{ + if (state_.resolution_divider != pixel_size_) { + return false; + } + + if (state_.path_trace_finished || state_.time_limit_reached) { + return true; + } + + return get_num_rendered_samples() >= num_samples_; +} + +RenderWork RenderScheduler::get_render_work() +{ + check_time_limit_reached(); + + const double time_now = time_dt(); + + if (done()) { + RenderWork render_work; + render_work.resolution_divider = state_.resolution_divider; + + if (!set_postprocess_render_work(&render_work)) { + set_full_frame_render_work(&render_work); + } + + if (!render_work) { + state_.end_render_time = time_now; + } + + update_state_for_render_work(render_work); + + return render_work; + } + + RenderWork render_work; + + if (state_.resolution_divider != pixel_size_) { + state_.resolution_divider = max(state_.resolution_divider / 2, pixel_size_); + state_.num_rendered_samples = 0; + state_.last_display_update_sample = -1; + } + + render_work.resolution_divider = state_.resolution_divider; + + render_work.path_trace.start_sample = get_start_sample_to_path_trace(); + render_work.path_trace.num_samples = get_num_samples_to_path_trace(); + + render_work.init_render_buffers = (render_work.path_trace.start_sample == get_start_sample()); + + /* NOTE: Rebalance scheduler requires current number of samples to not be advanced forward. */ + render_work.rebalance = work_need_rebalance(); + + /* NOTE: Advance number of samples now, so that filter and denoising check can see that all the + * samples are rendered. 
*/ + state_.num_rendered_samples += render_work.path_trace.num_samples; + + render_work.adaptive_sampling.filter = work_need_adaptive_filter(); + render_work.adaptive_sampling.threshold = work_adaptive_threshold(); + render_work.adaptive_sampling.reset = false; + + bool denoiser_delayed, denoiser_ready_to_display; + render_work.tile.denoise = work_need_denoise(denoiser_delayed, denoiser_ready_to_display); + + render_work.tile.write = done(); + + render_work.display.update = work_need_update_display(denoiser_delayed); + render_work.display.use_denoised_result = denoiser_ready_to_display; + + if (done()) { + set_postprocess_render_work(&render_work); + } + + update_state_for_render_work(render_work); + + return render_work; +} + +void RenderScheduler::update_state_for_render_work(const RenderWork &render_work) +{ + const double time_now = time_dt(); + + if (render_work.rebalance) { + state_.last_rebalance_time = time_now; + ++state_.num_rebalance_requested; + } + + /* A fallback display update time, for the case there is an error of display update, or when + * there is no display at all. 
*/ + if (render_work.display.update) { + state_.last_display_update_time = time_now; + state_.last_display_update_sample = state_.num_rendered_samples; + } + + state_.last_work_tile_was_denoised = render_work.tile.denoise; + state_.tile_result_was_written |= render_work.tile.write; + state_.full_frame_was_written |= render_work.full.write; +} + +bool RenderScheduler::set_postprocess_render_work(RenderWork *render_work) +{ + if (state_.postprocess_work_scheduled) { + return false; + } + state_.postprocess_work_scheduled = true; + + bool any_scheduled = false; + + if (need_schedule_cryptomatte_) { + render_work->cryptomatte.postprocess = true; + any_scheduled = true; + } + + if (denoiser_params_.use && !state_.last_work_tile_was_denoised) { + render_work->tile.denoise = true; + any_scheduled = true; + } + + if (!state_.tile_result_was_written) { + render_work->tile.write = true; + any_scheduled = true; + } + + if (any_scheduled) { + render_work->display.update = true; + } + + return any_scheduled; +} + +void RenderScheduler::set_full_frame_render_work(RenderWork *render_work) +{ + if (state_.full_frame_work_scheduled) { + return; + } + + if (!tile_manager_.has_multiple_tiles()) { + /* There is only single tile, so all work has been performed already. */ + return; + } + + if (!tile_manager_.done()) { + /* There are still tiles to be rendered. */ + return; + } + + if (state_.full_frame_was_written) { + return; + } + + state_.full_frame_work_scheduled = true; + + render_work->full.write = true; +} + +/* Knowing time which it took to complete a task at the current resolution divider approximate how + * long it would have taken to complete it at a final resolution. 
*/ +static double approximate_final_time(const RenderWork &render_work, double time) +{ + if (render_work.resolution_divider == 1) { + return time; + } + + const double resolution_divider_sq = render_work.resolution_divider * + render_work.resolution_divider; + return time * resolution_divider_sq; +} + +void RenderScheduler::report_work_begin(const RenderWork &render_work) +{ + /* Start counting render time when rendering samples at their final resolution. + * + * NOTE: The work might have the path trace part be all zero: this happens when a post-processing + * work is scheduled after the path tracing. Checking for just a start sample doesn't work here + * because it might be wrongly 0. Check for whether path tracing is actually happening as it is + * expected to happen in the first work. */ + if (render_work.resolution_divider == pixel_size_ && render_work.path_trace.num_samples != 0 && + render_work.path_trace.start_sample == get_start_sample()) { + state_.start_render_time = time_dt(); + } +} + +void RenderScheduler::report_path_trace_time(const RenderWork &render_work, + double time, + bool is_cancelled) +{ + path_trace_time_.add_wall(time); + + if (is_cancelled) { + return; + } + + const double final_time_approx = approximate_final_time(render_work, time); + + if (work_is_usable_for_first_render_estimation(render_work)) { + first_render_time_.path_trace_per_sample = final_time_approx / + render_work.path_trace.num_samples; + } + + if (work_report_reset_average(render_work)) { + path_trace_time_.reset_average(); + } + + path_trace_time_.add_average(final_time_approx, render_work.path_trace.num_samples); + + VLOG(4) << "Average path tracing time: " << path_trace_time_.get_average() << " seconds."; +} + +void RenderScheduler::report_path_trace_occupancy(const RenderWork &render_work, float occupancy) +{ + state_.occupancy_num_samples = render_work.path_trace.num_samples; + state_.occupancy = occupancy; + VLOG(4) << "Measured path tracing occupancy: " << 
occupancy; +} + +void RenderScheduler::report_adaptive_filter_time(const RenderWork &render_work, + double time, + bool is_cancelled) +{ + adaptive_filter_time_.add_wall(time); + + if (is_cancelled) { + return; + } + + const double final_time_approx = approximate_final_time(render_work, time); + + if (work_report_reset_average(render_work)) { + adaptive_filter_time_.reset_average(); + } + + adaptive_filter_time_.add_average(final_time_approx, render_work.path_trace.num_samples); + + VLOG(4) << "Average adaptive sampling filter time: " << adaptive_filter_time_.get_average() + << " seconds."; +} + +void RenderScheduler::report_denoise_time(const RenderWork &render_work, double time) +{ + denoise_time_.add_wall(time); + + const double final_time_approx = approximate_final_time(render_work, time); + + if (work_is_usable_for_first_render_estimation(render_work)) { + first_render_time_.denoise_time = final_time_approx; + } + + if (work_report_reset_average(render_work)) { + denoise_time_.reset_average(); + } + + denoise_time_.add_average(final_time_approx); + + VLOG(4) << "Average denoising time: " << denoise_time_.get_average() << " seconds."; +} + +void RenderScheduler::report_display_update_time(const RenderWork &render_work, double time) +{ + display_update_time_.add_wall(time); + + const double final_time_approx = approximate_final_time(render_work, time); + + if (work_is_usable_for_first_render_estimation(render_work)) { + first_render_time_.display_update_time = final_time_approx; + } + + if (work_report_reset_average(render_work)) { + display_update_time_.reset_average(); + } + + display_update_time_.add_average(final_time_approx); + + VLOG(4) << "Average display update time: " << display_update_time_.get_average() << " seconds."; + + /* Move the display update moment further in time, so that logic which checks when last update + * did happen have more reliable point in time (without path tracing and denoising parts of the + * render work). 
*/ + state_.last_display_update_time = time_dt(); +} + +void RenderScheduler::report_rebalance_time(const RenderWork &render_work, + double time, + bool balance_changed) +{ + rebalance_time_.add_wall(time); + + if (work_report_reset_average(render_work)) { + rebalance_time_.reset_average(); + } + + rebalance_time_.add_average(time); + + if (balance_changed) { + ++state_.num_rebalance_changes; + } + + state_.last_rebalance_changed = balance_changed; + + VLOG(4) << "Average rebalance time: " << rebalance_time_.get_average() << " seconds."; +} + +string RenderScheduler::full_report() const +{ + const double render_wall_time = state_.end_render_time - state_.start_render_time; + const int num_rendered_samples = get_num_rendered_samples(); + + string result = "\nRender Scheduler Summary\n\n"; + + { + string mode; + if (headless_) { + mode = "Headless"; + } + else if (background_) { + mode = "Background"; + } + else { + mode = "Interactive"; + } + result += "Mode: " + mode + "\n"; + } + + result += "Resolution: " + to_string(buffer_params_.width) + "x" + + to_string(buffer_params_.height) + "\n"; + + result += "\nAdaptive sampling:\n"; + result += " Use: " + string_from_bool(adaptive_sampling_.use) + "\n"; + if (adaptive_sampling_.use) { + result += " Step: " + to_string(adaptive_sampling_.adaptive_step) + "\n"; + result += " Min Samples: " + to_string(adaptive_sampling_.min_samples) + "\n"; + result += " Threshold: " + to_string(adaptive_sampling_.threshold) + "\n"; + } + + result += "\nDenoiser:\n"; + result += " Use: " + string_from_bool(denoiser_params_.use) + "\n"; + if (denoiser_params_.use) { + result += " Type: " + string(denoiserTypeToHumanReadable(denoiser_params_.type)) + "\n"; + result += " Start Sample: " + to_string(denoiser_params_.start_sample) + "\n"; + + string passes = "Color"; + if (denoiser_params_.use_pass_albedo) { + passes += ", Albedo"; + } + if (denoiser_params_.use_pass_normal) { + passes += ", Normal"; + } + + result += " Passes: " + passes + 
"\n"; + } + + if (state_.num_rebalance_requested) { + result += "\nRebalancer:\n"; + result += " Number of requested rebalances: " + to_string(state_.num_rebalance_requested) + + "\n"; + result += " Number of performed rebalances: " + to_string(state_.num_rebalance_changes) + + "\n"; + } + + result += "\nTime (in seconds):\n"; + result += string_printf(" %20s %20s %20s\n", "", "Wall", "Average"); + result += string_printf(" %20s %20f %20f\n", + "Path Tracing", + path_trace_time_.get_wall(), + path_trace_time_.get_average()); + + if (adaptive_sampling_.use) { + result += string_printf(" %20s %20f %20f\n", + "Adaptive Filter", + adaptive_filter_time_.get_wall(), + adaptive_filter_time_.get_average()); + } + + if (denoiser_params_.use) { + result += string_printf( + " %20s %20f %20f\n", "Denoiser", denoise_time_.get_wall(), denoise_time_.get_average()); + } + + result += string_printf(" %20s %20f %20f\n", + "Display Update", + display_update_time_.get_wall(), + display_update_time_.get_average()); + + if (state_.num_rebalance_requested) { + result += string_printf(" %20s %20f %20f\n", + "Rebalance", + rebalance_time_.get_wall(), + rebalance_time_.get_average()); + } + + const double total_time = path_trace_time_.get_wall() + adaptive_filter_time_.get_wall() + + denoise_time_.get_wall() + display_update_time_.get_wall(); + result += "\n Total: " + to_string(total_time) + "\n"; + + result += string_printf( + "\nRendered %d samples in %f seconds\n", num_rendered_samples, render_wall_time); + + /* When adaptive sampling is used the average time becomes meaningless, because different samples + * will likely render different number of pixels. 
*/ + if (!adaptive_sampling_.use) { + result += string_printf("Average time per sample: %f seconds\n", + render_wall_time / num_rendered_samples); + } + + return result; +} + +double RenderScheduler::guess_display_update_interval_in_seconds() const +{ + return guess_display_update_interval_in_seconds_for_num_samples(state_.num_rendered_samples); +} + +double RenderScheduler::guess_display_update_interval_in_seconds_for_num_samples( + int num_rendered_samples) const +{ + double update_interval = guess_display_update_interval_in_seconds_for_num_samples_no_limit( + num_rendered_samples); + + if (time_limit_ != 0.0 && state_.start_render_time != 0.0) { + const double remaining_render_time = max(0.0, + time_limit_ - (time_dt() - state_.start_render_time)); + + update_interval = min(update_interval, remaining_render_time); + } + + return update_interval; +} + +/* TODO(sergey): This is just a quick implementation, exact values might need to be tweaked based + * on a more careful experiments with viewport rendering. */ +double RenderScheduler::guess_display_update_interval_in_seconds_for_num_samples_no_limit( + int num_rendered_samples) const +{ + /* TODO(sergey): Need a decision on whether this should be using number of samples rendered + * within the current render session, or use absolute number of samples with the start sample + * taken into account. It will depend on whether the start sample offset clears the render + * buffer. */ + + if (state_.need_rebalance_at_next_work) { + return 0.1; + } + if (state_.last_rebalance_changed) { + return 0.2; + } + + if (headless_) { + /* In headless mode do rare updates, so that the device occupancy is high, but there are still + * progress messages printed to the logs. */ + return 30.0; + } + + if (background_) { + if (num_rendered_samples < 32) { + return 1.0; + } + return 2.0; + } + + /* Render time and number of samples rendered are used to figure out the display update interval. 
+ * Render time is used to allow for fast display updates in the first few seconds of rendering + * on fast devices. Number of samples rendered is used to allow for potentially quicker display + * updates on slow devices during the first few samples. */ + const double render_time = path_trace_time_.get_wall(); + if (render_time < 1) { + return 0.1; + } + if (render_time < 2) { + return 0.25; + } + if (render_time < 4) { + return 0.5; + } + if (render_time < 8 || num_rendered_samples < 32) { + return 1.0; + } + return 2.0; +} + +int RenderScheduler::calculate_num_samples_per_update() const +{ + const double time_per_sample_average = path_trace_time_.get_average(); + const double num_samples_in_second = pixel_size_ * pixel_size_ / time_per_sample_average; + + const double update_interval_in_seconds = guess_display_update_interval_in_seconds(); + + return max(int(num_samples_in_second * update_interval_in_seconds), 1); +} + +int RenderScheduler::get_start_sample_to_path_trace() const +{ + return start_sample_ + state_.num_rendered_samples; +} + +/* Round number of samples to the closest power of two. + * Rounding might happen to higher or lower value depending on which one is closer. Such behavior + * allows to have number of samples to be power of two without diverging from the planned number of + * samples too much. 
*/ +static inline uint round_num_samples_to_power_of_2(const uint num_samples) +{ + if (num_samples == 1) { + return 1; + } + + if (is_power_of_two(num_samples)) { + return num_samples; + } + + const uint num_samples_up = next_power_of_two(num_samples); + const uint num_samples_down = num_samples_up - (num_samples_up >> 1); + + const uint delta_up = num_samples_up - num_samples; + const uint delta_down = num_samples - num_samples_down; + + if (delta_up <= delta_down) { + return num_samples_up; + } + + return num_samples_down; +} + +int RenderScheduler::get_num_samples_to_path_trace() const +{ + if (state_.resolution_divider != pixel_size_) { + return get_num_samples_during_navigation(state_.resolution_divider); + } + + /* Always start full resolution render with a single sample. Gives more instant feedback to + * artists, and allows to gather information for a subsequent path tracing works. Do it in the + * headless mode as well, to give some estimate of how long samples are taking. */ + if (state_.num_rendered_samples == 0) { + return 1; + } + + const int num_samples_per_update = calculate_num_samples_per_update(); + const int path_trace_start_sample = get_start_sample_to_path_trace(); + + /* Round number of samples to a power of two, so that division of path states into tiles goes in + * a more integer manner. + * This might make it so updates happens more rarely due to rounding up. In the test scenes this + * is not huge deal because it is not seen that more than 8 samples can be rendered between + * updates. If that becomes a problem we can add some extra rules like never allow to round up + * more than N samples. 
*/ + const int num_samples_pot = round_num_samples_to_power_of_2(num_samples_per_update); + + const int max_num_samples_to_render = start_sample_ + num_samples_ - path_trace_start_sample; + + int num_samples_to_render = min(num_samples_pot, max_num_samples_to_render); + + /* When enough statistics is available and doing an offlien rendering prefer to keep device + * occupied. */ + if (state_.occupancy_num_samples && (background_ || headless_)) { + /* Keep occupancy at about 0.5 (this is more of an empirical figure which seems to match scenes + * with good performance without forcing occupancy to be higher). */ + int num_samples_to_occupy = state_.occupancy_num_samples; + if (state_.occupancy < 0.5f) { + num_samples_to_occupy = lround(state_.occupancy_num_samples * 0.7f / state_.occupancy); + } + + num_samples_to_render = max(num_samples_to_render, + min(num_samples_to_occupy, max_num_samples_to_render)); + } + + /* If adaptive sampling is not use, render as many samples per update as possible, keeping the + * device fully occupied, without much overhead of display updates. */ + if (!adaptive_sampling_.use) { + return num_samples_to_render; + } + + /* TODO(sergey): Add extra "clamping" here so that none of the filtering points is missing. This + * is to ensure that the final render is pixel-matched regardless of how many samples per second + * compute device can do. */ + + return adaptive_sampling_.align_samples(path_trace_start_sample, num_samples_to_render); +} + +int RenderScheduler::get_num_samples_during_navigation(int resolution_divider) const +{ + /* Special trick for fast navigation: schedule multiple samples during fast navigation + * (which will prefer to use lower resolution to keep up with refresh rate). This gives more + * usable visual feedback for artists. There are a couple of tricks though. 
*/ + + if (is_denoise_active_during_update()) { + /* When denoising is used during navigation prefer using a higher resolution with less samples + * (scheduling less samples here will make it so the resolution_divider calculation will use a + * lower value for the divider). This is because both OpenImageDenoiser and OptiX denoiser + * give visually better results on a higher resolution image with less samples. */ + return 1; + } + + if (resolution_divider <= pixel_size_) { + /* When resolution divider is at or below pixel size, schedule one sample. This doesn't effect + * the sample count at this resolution division, but instead assists in the calculation of + * the resolution divider. */ + return 1; + } + + if (resolution_divider == pixel_size_ * 2) { + /* When resolution divider is the previous step to the final resolution, schedule two samples. + * This is so that rendering on lower resolution does not exceed time that it takes to render + * first sample at the full resolution. */ + return 2; + } + + /* Always render 4 samples, even if scene is configured for less. + * The idea here is to have enough information on the screen. Resolution divider of 2 allows us + * to have 4 time extra samples, so verall worst case timing is the same as the final resolution + * at one sample. */ + return 4; +} + +bool RenderScheduler::work_need_adaptive_filter() const +{ + return adaptive_sampling_.need_filter(get_rendered_sample()); +} + +float RenderScheduler::work_adaptive_threshold() const +{ + if (!use_progressive_noise_floor_) { + return adaptive_sampling_.threshold; + } + + return max(state_.adaptive_sampling_threshold, adaptive_sampling_.threshold); +} + +bool RenderScheduler::work_need_denoise(bool &delayed, bool &ready_to_display) +{ + delayed = false; + ready_to_display = true; + + if (!denoiser_params_.use) { + /* Denoising is disabled, no need to scheduler work for it. */ + return false; + } + + if (done()) { + /* Always denoise at the last sample. 
*/ + return true; + } + + if (background_) { + /* Background render, only denoise when rendering the last sample. */ + /* TODO(sergey): Follow similar logic to viewport, giving an overview of how final denoised + * image looks like even for the background rendering. */ + return false; + } + + /* Viewport render. */ + + /* Navigation might render multiple samples at a lower resolution. Those are not to be counted as + * final samples. */ + const int num_samples_finished = state_.resolution_divider == pixel_size_ ? + state_.num_rendered_samples : + 1; + + /* Immediately denoise when we reach the start sample or last sample. */ + if (num_samples_finished == denoiser_params_.start_sample || + num_samples_finished == num_samples_) { + return true; + } + + /* Do not denoise until the sample at which denoising should start is reached. */ + if (num_samples_finished < denoiser_params_.start_sample) { + ready_to_display = false; + return false; + } + + /* Avoid excessive denoising in viewport after reaching a certain sample count and render time. + */ + /* TODO(sergey): Consider making time interval and sample configurable. */ + delayed = (path_trace_time_.get_wall() > 4 && num_samples_finished >= 20 && + (time_dt() - state_.last_display_update_time) < 1.0); + + return !delayed; +} + +bool RenderScheduler::work_need_update_display(const bool denoiser_delayed) +{ + if (headless_) { + /* Force disable display update in headless mode. There will be nothing to display the + * in-progress result. */ + return false; + } + + if (denoiser_delayed) { + /* If denoiser has been delayed the display can not be updated as it will not contain + * up-to-date state of the render result. */ + return false; + } + + if (!adaptive_sampling_.use) { + /* When adaptive sampling is not used the work is scheduled in a way that they keep render + * device busy for long enough, so that the display update can happen right after the + * rendering. 
*/ + return true; + } + + if (done() || state_.last_display_update_sample == -1) { + /* Make sure an initial and final results of adaptive sampling is communicated ot the display. + */ + return true; + } + + /* For the development purposes of adaptive sampling it might be very useful to see all updates + * of active pixels after convergence check. However, it would cause a slowdown for regular usage + * users. Possibly, make it a debug panel option to allow rapid update to ease development + * without need to re-compiled. */ + // if (work_need_adaptive_filter()) { + // return true; + // } + + /* When adaptive sampling is used, its possible that only handful of samples of a very simple + * scene will be scheduled to a powerful device (in order to not "miss" any of filtering points). + * We take care of skipping updates here based on when previous display update did happen. */ + const double update_interval = guess_display_update_interval_in_seconds_for_num_samples( + state_.last_display_update_sample); + return (time_dt() - state_.last_display_update_time) > update_interval; +} + +bool RenderScheduler::work_need_rebalance() +{ + /* This is the minimum time, as the rebalancing can not happen more often than the path trace + * work. */ + static const double kRebalanceIntervalInSeconds = 1; + + if (!need_schedule_rebalance_works_) { + return false; + } + + if (state_.resolution_divider != pixel_size_) { + /* Don't rebalance at a non-final resolution divider. Some reasons for this: + * - It will introduce unnecessary during navigation. + * - Per-render device timing information is not very reliable yet. 
*/ + return false; + } + + if (state_.num_rendered_samples == 0) { + state_.need_rebalance_at_next_work = true; + return false; + } + + if (state_.need_rebalance_at_next_work) { + state_.need_rebalance_at_next_work = false; + return true; + } + + if (state_.last_rebalance_changed) { + return true; + } + + return (time_dt() - state_.last_rebalance_time) > kRebalanceIntervalInSeconds; +} + +void RenderScheduler::update_start_resolution_divider() +{ + if (start_resolution_divider_ == 0) { + /* Resolution divider has never been calculated before: use default resolution, so that we have + * somewhat good initial behavior, giving a chance to collect real numbers. */ + start_resolution_divider_ = default_start_resolution_divider_; + VLOG(3) << "Initial resolution divider is " << start_resolution_divider_; + return; + } + + if (first_render_time_.path_trace_per_sample == 0.0) { + /* Not enough information to calculate better resolution, keep the existing one. */ + return; + } + + const double desired_update_interval_in_seconds = + guess_viewport_navigation_update_interval_in_seconds(); + + const double actual_time_per_update = first_render_time_.path_trace_per_sample + + first_render_time_.denoise_time + + first_render_time_.display_update_time; + + /* Allow some percent of tolerance, so that if the render time is close enough to the higher + * resolution we prefer to use it instead of going way lower resolution and time way below the + * desired one. */ + const int resolution_divider_for_update = calculate_resolution_divider_for_time( + desired_update_interval_in_seconds * 1.4, actual_time_per_update); + + /* TODO(sergey): Need to add hysteresis to avoid resolution divider bouncing around when actual + * render time is somewhere on a boundary between two resolutions. */ + + /* Never increase resolution to higher than the pixel size (which is possible if the scene is + * simple and compute device is fast). 
*/ + start_resolution_divider_ = max(resolution_divider_for_update, pixel_size_); + + VLOG(3) << "Calculated resolution divider is " << start_resolution_divider_; +} + +double RenderScheduler::guess_viewport_navigation_update_interval_in_seconds() const +{ + if (is_denoise_active_during_update()) { + /* Use lower value than the non-denoised case to allow having more pixels to reconstruct the + * image from. With the faster updates and extra compute required the resolution becomes too + * low to give usable feedback. */ + /* NOTE: Based on performance of OpenImageDenoiser on CPU. For OptiX denoiser or other denoiser + * on GPU the value might need to become lower for faster navigation. */ + return 1.0 / 12.0; + } + + /* For the best match with the Blender's viewport the refresh ratio should be 60fps. This will + * avoid "jelly" effects. However, on a non-trivial scenes this can only be achieved with high + * values of the resolution divider which does not give very pleasant updates during navigation. + * Choose less frequent updates to allow more noise-free and higher resolution updates. */ + + /* TODO(sergey): Can look into heuristic which will allow to have 60fps if the resolution divider + * is not too high. Alternatively, synchronize Blender's overlays updates to Cycles updates. 
*/ + + return 1.0 / 30.0; +} + +bool RenderScheduler::is_denoise_active_during_update() const +{ + if (!denoiser_params_.use) { + return false; + } + + if (denoiser_params_.start_sample > 1) { + return false; + } + + return true; +} + +bool RenderScheduler::work_is_usable_for_first_render_estimation(const RenderWork &render_work) +{ + return render_work.resolution_divider == pixel_size_ && + render_work.path_trace.start_sample == start_sample_; +} + +bool RenderScheduler::work_report_reset_average(const RenderWork &render_work) +{ + /* When rendering at a non-final resolution divider time average is not very useful because it + * will either bias average down (due to lower render times on the smaller images) or will give + * incorrect result when trying to estimate time which would have spent on the final resolution. + * + * So we only accumulate average for the latest resolution divider which was rendered. */ + return render_work.resolution_divider != pixel_size_; +} + +void RenderScheduler::check_time_limit_reached() +{ + if (time_limit_ == 0.0) { + /* No limit is enforced. */ + return; + } + + if (state_.start_render_time == 0.0) { + /* Rendering did not start yet. */ + return; + } + + const double current_time = time_dt(); + + if (current_time - state_.start_render_time < time_limit_) { + /* Time limit is not reached yet. */ + return; + } + + state_.time_limit_reached = true; + state_.end_render_time = current_time; +} + +/* -------------------------------------------------------------------- + * Utility functions. + */ + +int RenderScheduler::calculate_resolution_divider_for_time(double desired_time, double actual_time) +{ + /* TODO(sergey): There should a non-iterative analytical formula here. */ + + int resolution_divider = 1; + + /* This algorithm iterates through resolution dividers until a divider is found that achieves + * the desired render time. 
A limit of default_start_resolution_divider_ is put in place as the + * maximum resolution divider to avoid an unreadable viewport due to a low resolution. + * pre_resolution_division_samples and post_resolution_division_samples are used in this + * calculation to better predict the performance impact of changing resolution divisions as + * the sample count can also change between resolution divisions. */ + while (actual_time > desired_time && resolution_divider < default_start_resolution_divider_) { + int pre_resolution_division_samples = get_num_samples_during_navigation(resolution_divider); + resolution_divider = resolution_divider * 2; + int post_resolution_division_samples = get_num_samples_during_navigation(resolution_divider); + actual_time /= 4.0 * pre_resolution_division_samples / post_resolution_division_samples; + } + + return resolution_divider; +} + +int calculate_resolution_divider_for_resolution(int width, int height, int resolution) +{ + if (resolution == INT_MAX) { + return 1; + } + + int resolution_divider = 1; + while (width * height > resolution * resolution) { + width = max(1, width / 2); + height = max(1, height / 2); + + resolution_divider <<= 1; + } + + return resolution_divider; +} + +int calculate_resolution_for_divider(int width, int height, int resolution_divider) +{ + const int pixel_area = width * height; + const int resolution = lround(sqrt(pixel_area)); + + return resolution / resolution_divider; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/render_scheduler.h b/intern/cycles/integrator/render_scheduler.h new file mode 100644 index 00000000000..9c2d107e46d --- /dev/null +++ b/intern/cycles/integrator/render_scheduler.h @@ -0,0 +1,466 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "integrator/adaptive_sampling.h" +#include "integrator/denoiser.h" /* For DenoiseParams. */ +#include "render/buffers.h" +#include "util/util_string.h" + +CCL_NAMESPACE_BEGIN + +class SessionParams; +class TileManager; + +class RenderWork { + public: + int resolution_divider = 1; + + /* Initialize render buffers. + * Includes steps like zero-ing the buffer on the device, and optional reading of pixels from the + * baking target. */ + bool init_render_buffers = false; + + /* Path tracing samples information. */ + struct { + int start_sample = 0; + int num_samples = 0; + } path_trace; + + struct { + /* Check for convergency and filter the mask. */ + bool filter = false; + + float threshold = 0.0f; + + /* Reset convergency flag when filtering, forcing a re-check of whether pixel did converge. */ + bool reset = false; + } adaptive_sampling; + + struct { + bool postprocess = false; + } cryptomatte; + + /* Work related on the current tile. */ + struct { + /* Write render buffers of the current tile. + * + * It is up to the path trace to decide whether writing should happen via user-provided + * callback into the rendering software, or via tile manager into a partial file. */ + bool write = false; + + bool denoise = false; + } tile; + + /* Work related on the full-frame render buffer. */ + struct { + /* Write full render result. + * Implies reading the partial file from disk. */ + bool write = false; + } full; + + /* Display which is used to visualize render result. 
*/ + struct { + /* Display needs to be updated for the new render. */ + bool update = false; + + /* Display can use denoised result if available. */ + bool use_denoised_result = true; + } display; + + /* Re-balance multi-device scheduling after rendering this work. + * Note that the scheduler does not know anything about devices, so if there is only a single + * device used, then it is up to the PathTracer to ignore the balancing. */ + bool rebalance = false; + + /* Conversion to bool, to simplify checks about whether there is anything to be done for this + * work. */ + inline operator bool() const + { + return path_trace.num_samples || adaptive_sampling.filter || display.update || tile.denoise || + tile.write || full.write; + } +}; + +class RenderScheduler { + public: + RenderScheduler(TileManager &tile_manager, const SessionParams &params); + + /* Specify whether cryptomatte-related works are to be scheduled. */ + void set_need_schedule_cryptomatte(bool need_schedule_cryptomatte); + + /* Allows to disable work re-balancing, allowing to schedule as much to a single device + * as possible. */ + void set_need_schedule_rebalance(bool need_schedule_rebalance); + + bool is_background() const; + + void set_denoiser_params(const DenoiseParams &params); + void set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling); + + bool is_adaptive_sampling_used() const; + + /* Start sample for path tracing. + * The scheduler will schedule work using this sample as the first one. */ + void set_start_sample(int start_sample); + int get_start_sample() const; + + /* Number of samples to render, starting from start sample. + * The scheduler will schedule work in the range of + * [start_sample, start_sample + num_samples - 1], inclusively. */ + void set_num_samples(int num_samples); + int get_num_samples() const; + + /* Time limit for the path tracing tasks, in seconds. + * Zero disables the limit. 
*/ + void set_time_limit(double time_limit); + double get_time_limit() const; + + /* Get sample up to which rendering has been done. + * This is an absolute 0-based value. + * + * For example, if start sample is 10 and 5 samples were rendered, then this call will + * return 14. + * + * If there were no samples rendered, then the behavior is undefined. */ + int get_rendered_sample() const; + + /* Get number of samples rendered within the current scheduling session. + * + * For example, if start sample is 10 and 5 samples were rendered, then this call will + * return 5. + * + * Note that this is based on the scheduling information. In practice this means that if someone + * requested for work to render the scheduler considers the work done. */ + int get_num_rendered_samples() const; + + /* Reset scheduler, indicating that rendering will happen from scratch. + * Resets current rendered state, as well as scheduling information. */ + void reset(const BufferParams &buffer_params, int num_samples); + + /* Reset scheduler upon switching to a next tile. + * Will keep the same number of samples and full-frame render parameters, but will reset progress + * and allow scheduling render works from the beginning of the new tile. */ + void reset_for_next_tile(); + + /* Reschedule adaptive sampling work when all pixels did converge. + * If there is nothing else to be done for the adaptive sampling (pixels did converge to the + * final threshold) then false is returned and the render scheduler will stop scheduling path + * tracing works. Otherwise will modify the work's adaptive sampling settings to continue with + * a lower threshold. */ + bool render_work_reschedule_on_converge(RenderWork &render_work); + + /* Reschedule adaptive sampling work when the device is mostly on idle, but not all pixels yet + * converged. 
+ * If re-scheduling is not possible (adaptive sampling is happening with the final threshold, and + * the path tracer is to finish the current pixels) then false is returned. */ + bool render_work_reschedule_on_idle(RenderWork &render_work); + + /* Reschedule work when rendering has been requested to cancel. + * + * Will skip all work which is not needed anymore because no more samples will be added (for + * example, adaptive sampling filtering and convergence check will be skipped). + * Will enable all work needed to make sure all passes are communicated to the software. + * + * NOTE: Should be used before passing work to `PathTrace::render_samples()`. */ + void render_work_reschedule_on_cancel(RenderWork &render_work); + + RenderWork get_render_work(); + + /* Report that the path tracer started to work, after scene update and loading kernels. */ + void report_work_begin(const RenderWork &render_work); + + /* Report time (in seconds) which corresponding part of work took. */ + void report_path_trace_time(const RenderWork &render_work, double time, bool is_cancelled); + void report_path_trace_occupancy(const RenderWork &render_work, float occupancy); + void report_adaptive_filter_time(const RenderWork &render_work, double time, bool is_cancelled); + void report_denoise_time(const RenderWork &render_work, double time); + void report_display_update_time(const RenderWork &render_work, double time); + void report_rebalance_time(const RenderWork &render_work, double time, bool balance_changed); + + /* Generate full multi-line report of the rendering process, including rendering parameters, + * times, and so on. */ + string full_report() const; + + protected: + /* Check whether all work has been scheduled and time limit was not exceeded. + * + * NOTE: Tricky bit: if the time limit was reached the done() is considered to be true, but some + * extra work needs to be scheduled to denoise and write final result. 
*/ + bool done() const; + + /* Update scheduling state for a newly scheduled work. + * Takes care of things like checking whether work was ever denoised, tile was written and states + * like that. */ + void update_state_for_render_work(const RenderWork &render_work); + + /* Returns true if any work was scheduled. */ + bool set_postprocess_render_work(RenderWork *render_work); + + /* Set work which is to be performed after all tiles have been rendered. */ + void set_full_frame_render_work(RenderWork *render_work); + + /* Update start resolution divider based on the accumulated timing information, preserving a + * nice navigation feel. */ + void update_start_resolution_divider(); + + /* Calculate desired update interval in seconds based on the current timings and settings. + * Will give an interval which provides good feeling updates during viewport navigation. */ + double guess_viewport_navigation_update_interval_in_seconds() const; + + /* Check whether denoising is active during interactive update while resolution divider is not + * unit. */ + bool is_denoise_active_during_update() const; + + /* Heuristic which aims to give perceptually pleasant update of display interval in a way that at + * lower samples and near the beginning of rendering, updates happen more often, but with higher + * number of samples and later in the render, updates happen less often but device occupancy + * goes higher. */ + double guess_display_update_interval_in_seconds() const; + double guess_display_update_interval_in_seconds_for_num_samples(int num_rendered_samples) const; + double guess_display_update_interval_in_seconds_for_num_samples_no_limit( + int num_rendered_samples) const; + + /* Calculate number of samples which can be rendered within current desired update interval which + * is calculated by `guess_update_interval_in_seconds()`. 
*/ + int calculate_num_samples_per_update() const; + + /* Get start sample and the number of samples which are to be path traced in the current work. */ + int get_start_sample_to_path_trace() const; + int get_num_samples_to_path_trace() const; + + /* Calculate how many samples there are to be rendered for the very first path trace after reset. + */ + int get_num_samples_during_navigation(int resolution_divider) const; + + /* Whether adaptive sampling convergence check and filter is to happen. */ + bool work_need_adaptive_filter() const; + + /* Calculate threshold for adaptive sampling. */ + float work_adaptive_threshold() const; + + /* Check whether current work needs denoising. + * Denoising is not needed if the denoiser is not configured, or when denoising is happening too + * often. + * + * The delayed will be true when the denoiser is configured for use, but it was delayed for a + * later sample, to reduce overhead. + * + * ready_to_display will be false if we may have a denoised result that is outdated due to + * increased samples. */ + bool work_need_denoise(bool &delayed, bool &ready_to_display); + + /* Check whether current work needs to update display. + * + * The `denoiser_delayed` is what `work_need_denoise()` returned as delayed denoiser flag. */ + bool work_need_update_display(const bool denoiser_delayed); + + /* Check whether it is time to perform rebalancing for the render work. */ + bool work_need_rebalance(); + + /* Check whether timings of the given work are usable to store timings in the `first_render_time_` + * for the resolution divider calculation. */ + bool work_is_usable_for_first_render_estimation(const RenderWork &render_work); + + /* Check whether timing report about the given work needs to reset accumulated average time. 
*/ + bool work_report_reset_average(const RenderWork &render_work); + + /* CHeck whether render time limit has been reached (or exceeded), and if so store related + * information in the state so that rendering is considered finished, and is possible to report + * average render time information. */ + void check_time_limit_reached(); + + /* Helper class to keep track of task timing. + * + * Contains two parts: wall time and average. The wall time is an actual wall time of how long it + * took to complete all tasks of a type. Is always advanced when PathTracer reports time update. + * + * The average time is used for scheduling purposes. It is estimated to be a time of how long it + * takes to perform task on the final resolution. */ + class TimeWithAverage { + public: + inline void reset() + { + total_wall_time_ = 0.0; + + average_time_accumulator_ = 0.0; + num_average_times_ = 0; + } + + inline void add_wall(double time) + { + total_wall_time_ += time; + } + + inline void add_average(double time, int num_measurements = 1) + { + average_time_accumulator_ += time; + num_average_times_ += num_measurements; + } + + inline double get_wall() const + { + return total_wall_time_; + } + + inline double get_average() const + { + if (num_average_times_ == 0) { + return 0; + } + return average_time_accumulator_ / num_average_times_; + } + + inline void reset_average() + { + average_time_accumulator_ = 0.0; + num_average_times_ = 0; + } + + protected: + double total_wall_time_ = 0.0; + + double average_time_accumulator_ = 0.0; + int num_average_times_ = 0; + }; + + struct { + int resolution_divider = 1; + + /* Number of rendered samples on top of the start sample. */ + int num_rendered_samples = 0; + + /* Point in time the latest GPUDisplay work has been scheduled. */ + double last_display_update_time = 0.0; + /* Value of -1 means display was never updated. */ + int last_display_update_sample = -1; + + /* Point in time at which last rebalance has been performed. 
*/ + double last_rebalance_time = 0.0; + + /* Number of rebalance works which have been requested to be performed. + * The path tracer might ignore the work if there is a single device rendering. */ + int num_rebalance_requested = 0; + + /* Number of rebalance works handled which did change balance across devices. */ + int num_rebalance_changes = 0; + + bool need_rebalance_at_next_work = false; + + /* Denotes whether the latest performed rebalance work caused an actual rebalance of work across + * devices. */ + bool last_rebalance_changed = false; + + /* Threshold for adaptive sampling which will be scheduled to work when not using progressive + * noise floor. */ + float adaptive_sampling_threshold = 0.0f; + + bool last_work_tile_was_denoised = false; + bool tile_result_was_written = false; + bool postprocess_work_scheduled = false; + bool full_frame_work_scheduled = false; + bool full_frame_was_written = false; + + bool path_trace_finished = false; + bool time_limit_reached = false; + + /* Time at which rendering started and finished. */ + double start_render_time = 0.0; + double end_render_time = 0.0; + + /* Measured occupancy of the render devices, normalized to the number of samples. + * + * In a way it is "trailing": when scheduling new work this occupancy is measured when the + * previous work was rendered. */ + int occupancy_num_samples = 0; + float occupancy = 1.0f; + } state_; + + /* Timing of tasks which were performed at the very first render work at 100% of the + * resolution. This timing information is used to estimate resolution divider for fast + * navigation. */ + struct { + double path_trace_per_sample; + double denoise_time; + double display_update_time; + } first_render_time_; + + TimeWithAverage path_trace_time_; + TimeWithAverage adaptive_filter_time_; + TimeWithAverage denoise_time_; + TimeWithAverage display_update_time_; + TimeWithAverage rebalance_time_; + + /* Whether cryptomatte-related work will be scheduled. 
*/ + bool need_schedule_cryptomatte_ = false; + + /* Whether to schedule device load rebalance works. + * Rebalancing requires some special treatment for update intervals and such, so if it's known + * that the rebalance will be ignored (due to single-device rendering i.e.) is better to fully + * ignore rebalancing logic. */ + bool need_schedule_rebalance_works_ = false; + + /* Path tracing work will be scheduled for samples from within + * [start_sample_, start_sample_ + num_samples_ - 1] range, inclusively. */ + int start_sample_ = 0; + int num_samples_ = 0; + + /* Limit in seconds for how long path tracing is allowed to happen. + * Zero means no limit is applied. */ + double time_limit_ = 0.0; + + /* Headless rendering without interface. */ + bool headless_; + + /* Background (offline) rendering. */ + bool background_; + + /* Pixel size is used to force lower resolution render for final pass. Useful for retina or other + * types of hi-dpi displays. */ + int pixel_size_ = 1; + + TileManager &tile_manager_; + + BufferParams buffer_params_; + DenoiseParams denoiser_params_; + + AdaptiveSampling adaptive_sampling_; + + /* Progressively lower adaptive sampling threshold level, keeping the image at a uniform noise + * level. */ + bool use_progressive_noise_floor_ = false; + + /* Default value for the resolution divider which will be used when there is no render time + * information available yet. + * It is also what defines the upper limit of the automatically calculated resolution divider. */ + int default_start_resolution_divider_ = 1; + + /* Initial resolution divider which will be used on render scheduler reset. */ + int start_resolution_divider_ = 0; + + /* Calculate smallest resolution divider which will bring down actual rendering time below the + * desired one. This call assumes linear dependency of render time from number of pixels + * (quadratic dependency from the resolution divider): resolution divider of 2 brings render time + * down by a factor of 4. 
*/ + int calculate_resolution_divider_for_time(double desired_time, double actual_time); +}; + +int calculate_resolution_divider_for_resolution(int width, int height, int resolution); + +int calculate_resolution_for_divider(int width, int height, int resolution_divider); + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/shader_eval.cpp b/intern/cycles/integrator/shader_eval.cpp new file mode 100644 index 00000000000..465b4a8d4da --- /dev/null +++ b/intern/cycles/integrator/shader_eval.cpp @@ -0,0 +1,173 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "integrator/shader_eval.h" + +#include "device/device.h" +#include "device/device_queue.h" + +#include "device/cpu/kernel.h" +#include "device/cpu/kernel_thread_globals.h" + +#include "util/util_logging.h" +#include "util/util_progress.h" +#include "util/util_tbb.h" + +CCL_NAMESPACE_BEGIN + +ShaderEval::ShaderEval(Device *device, Progress &progress) : device_(device), progress_(progress) +{ + DCHECK_NE(device_, nullptr); +} + +bool ShaderEval::eval(const ShaderEvalType type, + const int max_num_points, + const function<int(device_vector<KernelShaderEvalInput> &)> &fill_input, + const function<void(device_vector<float4> &)> &read_output) +{ + bool first_device = true; + bool success = true; + + device_->foreach_device([&](Device *device) { + if (!first_device) { + LOG(ERROR) << "Multi-devices are not yet fully implemented, will evaluate shader on a " + "single device."; + return; + } + first_device = false; + + device_vector<KernelShaderEvalInput> input(device, "ShaderEval input", MEM_READ_ONLY); + device_vector<float4> output(device, "ShaderEval output", MEM_READ_WRITE); + + /* Allocate and copy device buffers. */ + DCHECK_EQ(input.device, device); + DCHECK_EQ(output.device, device); + DCHECK_LE(output.size(), input.size()); + + input.alloc(max_num_points); + int num_points = fill_input(input); + if (num_points == 0) { + return; + } + + input.copy_to_device(); + output.alloc(num_points); + output.zero_to_device(); + + /* Evaluate on CPU or GPU. */ + success = (device->info.type == DEVICE_CPU) ? eval_cpu(device, type, input, output) : + eval_gpu(device, type, input, output); + + /* Copy data back from device if not cancelled. 
*/ + if (success) { + output.copy_from_device(0, 1, output.size()); + read_output(output); + } + + input.free(); + output.free(); + }); + + return success; +} + +bool ShaderEval::eval_cpu(Device *device, + const ShaderEvalType type, + device_vector<KernelShaderEvalInput> &input, + device_vector<float4> &output) +{ + vector<CPUKernelThreadGlobals> kernel_thread_globals; + device->get_cpu_kernel_thread_globals(kernel_thread_globals); + + /* Find required kernel function. */ + const CPUKernels &kernels = *(device->get_cpu_kernels()); + + /* Simple parallel_for over all work items. */ + const int64_t work_size = output.size(); + KernelShaderEvalInput *input_data = input.data(); + float4 *output_data = output.data(); + bool success = true; + + tbb::task_arena local_arena(device->info.cpu_threads); + local_arena.execute([&]() { + tbb::parallel_for(int64_t(0), work_size, [&](int64_t work_index) { + /* TODO: is this fast enough? */ + if (progress_.get_cancel()) { + success = false; + return; + } + + const int thread_index = tbb::this_task_arena::current_thread_index(); + KernelGlobals *kg = &kernel_thread_globals[thread_index]; + + switch (type) { + case SHADER_EVAL_DISPLACE: + kernels.shader_eval_displace(kg, input_data, output_data, work_index); + break; + case SHADER_EVAL_BACKGROUND: + kernels.shader_eval_background(kg, input_data, output_data, work_index); + break; + } + }); + }); + + return success; +} + +bool ShaderEval::eval_gpu(Device *device, + const ShaderEvalType type, + device_vector<KernelShaderEvalInput> &input, + device_vector<float4> &output) +{ + /* Find required kernel function. */ + DeviceKernel kernel; + switch (type) { + case SHADER_EVAL_DISPLACE: + kernel = DEVICE_KERNEL_SHADER_EVAL_DISPLACE; + break; + case SHADER_EVAL_BACKGROUND: + kernel = DEVICE_KERNEL_SHADER_EVAL_BACKGROUND; + break; + }; + + /* Create device queue. 
*/ + unique_ptr<DeviceQueue> queue = device->gpu_queue_create(); + queue->init_execution(); + + /* Execute work on GPU in chunk, so we can cancel. + * TODO : query appropriate size from device.*/ + const int chunk_size = 65536; + + const int work_size = output.size(); + void *d_input = (void *)input.device_pointer; + void *d_output = (void *)output.device_pointer; + + for (int d_offset = 0; d_offset < work_size; d_offset += chunk_size) { + int d_work_size = min(chunk_size, work_size - d_offset); + void *args[] = {&d_input, &d_output, &d_offset, &d_work_size}; + + queue->enqueue(kernel, d_work_size, args); + queue->synchronize(); + + if (progress_.get_cancel()) { + return false; + } + } + + return true; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/shader_eval.h b/intern/cycles/integrator/shader_eval.h new file mode 100644 index 00000000000..7dbf334b8d7 --- /dev/null +++ b/intern/cycles/integrator/shader_eval.h @@ -0,0 +1,61 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "device/device_memory.h" + +#include "kernel/kernel_types.h" + +#include "util/util_function.h" + +CCL_NAMESPACE_BEGIN + +class Device; +class Progress; + +enum ShaderEvalType { + SHADER_EVAL_DISPLACE, + SHADER_EVAL_BACKGROUND, +}; + +/* ShaderEval class performs shader evaluation for background light and displacement. 
*/ +class ShaderEval { + public: + ShaderEval(Device *device, Progress &progress); + + /* Evaluate shader at points specified by KernelShaderEvalInput and write out + * RGBA colors to output. */ + bool eval(const ShaderEvalType type, + const int max_num_points, + const function<int(device_vector<KernelShaderEvalInput> &)> &fill_input, + const function<void(device_vector<float4> &)> &read_output); + + protected: + bool eval_cpu(Device *device, + const ShaderEvalType type, + device_vector<KernelShaderEvalInput> &input, + device_vector<float4> &output); + bool eval_gpu(Device *device, + const ShaderEvalType type, + device_vector<KernelShaderEvalInput> &input, + device_vector<float4> &output); + + Device *device_; + Progress &progress_; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/tile.cpp b/intern/cycles/integrator/tile.cpp new file mode 100644 index 00000000000..3387b7bedf1 --- /dev/null +++ b/intern/cycles/integrator/tile.cpp @@ -0,0 +1,108 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "integrator/tile.h" + +#include "util/util_logging.h" +#include "util/util_math.h" + +CCL_NAMESPACE_BEGIN + +std::ostream &operator<<(std::ostream &os, const TileSize &tile_size) +{ + os << "size: (" << tile_size.width << ", " << tile_size.height << ")"; + os << ", num_samples: " << tile_size.num_samples; + return os; +} + +ccl_device_inline uint round_down_to_power_of_two(uint x) +{ + if (is_power_of_two(x)) { + return x; + } + + return prev_power_of_two(x); +} + +ccl_device_inline uint round_up_to_power_of_two(uint x) +{ + if (is_power_of_two(x)) { + return x; + } + + return next_power_of_two(x); +} + +TileSize tile_calculate_best_size(const int2 &image_size, + const int num_samples, + const int max_num_path_states) +{ + if (max_num_path_states == 1) { + /* Simple case: avoid any calculation, which could cause rounding issues. */ + return TileSize(1, 1, 1); + } + + const int64_t num_pixels = image_size.x * image_size.y; + const int64_t num_pixel_samples = num_pixels * num_samples; + + if (max_num_path_states >= num_pixel_samples) { + /* Image fully fits into the state (could be border render, for example). */ + return TileSize(image_size.x, image_size.y, num_samples); + } + + /* The idea here is to keep number of samples per tile as much as possible to improve coherency + * across threads. + * + * Some general ideas: + * - Prefer smaller tiles with more samples, which improves spatial coherency of paths. + * - Keep values a power of two, for more integer fit into the maximum number of paths. */ + + TileSize tile_size; + + /* Calculate tile size as if it is the most possible one to fit an entire range of samples. + * The idea here is to keep tiles as small as possible, and keep device occupied by scheduling + * multiple tiles with the same coordinates rendering different samples. 
*/ + const int num_path_states_per_sample = max_num_path_states / num_samples; + if (num_path_states_per_sample != 0) { + tile_size.width = round_down_to_power_of_two(lround(sqrt(num_path_states_per_sample))); + tile_size.height = tile_size.width; + } + else { + tile_size.width = tile_size.height = 1; + } + + if (num_samples == 1) { + tile_size.num_samples = 1; + } + else { + /* Heuristic here is to have more uniform division of the sample range: for example prefer + * [32 <38 times>, 8] over [1024, 200]. This allows to greedily add more tiles early on. */ + tile_size.num_samples = min(round_up_to_power_of_two(lround(sqrt(num_samples / 2))), + static_cast<uint>(num_samples)); + + const int tile_area = tile_size.width / tile_size.height; + tile_size.num_samples = min(tile_size.num_samples, max_num_path_states / tile_area); + } + + DCHECK_GE(tile_size.width, 1); + DCHECK_GE(tile_size.height, 1); + DCHECK_GE(tile_size.num_samples, 1); + DCHECK_LE(tile_size.width * tile_size.height * tile_size.num_samples, max_num_path_states); + + return tile_size; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/tile.h b/intern/cycles/integrator/tile.h new file mode 100644 index 00000000000..d0824843ddb --- /dev/null +++ b/intern/cycles/integrator/tile.h @@ -0,0 +1,56 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include <ostream> + +#include "util/util_types.h" + +CCL_NAMESPACE_BEGIN + +struct TileSize { + TileSize() = default; + + inline TileSize(int width, int height, int num_samples) + : width(width), height(height), num_samples(num_samples) + { + } + + inline bool operator==(const TileSize &other) const + { + return width == other.width && height == other.height && num_samples == other.num_samples; + } + inline bool operator!=(const TileSize &other) const + { + return !(*this == other); + } + + int width = 0, height = 0; + int num_samples = 0; +}; + +std::ostream &operator<<(std::ostream &os, const TileSize &tile_size); + +/* Calculate tile size which is best suitable for rendering image of a given size with given number + * of active path states. + * Will attempt to provide best guess to keep path tracing threads of a device as localized as + * possible, and have as many threads active for every tile as possible. */ +TileSize tile_calculate_best_size(const int2 &image_size, + const int num_samples, + const int max_num_path_states); + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/work_balancer.cpp b/intern/cycles/integrator/work_balancer.cpp new file mode 100644 index 00000000000..9f96fe3632b --- /dev/null +++ b/intern/cycles/integrator/work_balancer.cpp @@ -0,0 +1,99 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "integrator/work_balancer.h" + +#include "util/util_math.h" + +#include "util/util_logging.h" + +CCL_NAMESPACE_BEGIN + +void work_balance_do_initial(vector<WorkBalanceInfo> &work_balance_infos) +{ + const int num_infos = work_balance_infos.size(); + + if (num_infos == 1) { + work_balance_infos[0].weight = 1.0; + return; + } + + /* There is no statistics available, so start with an equal distribution. */ + const double weight = 1.0 / num_infos; + for (WorkBalanceInfo &balance_info : work_balance_infos) { + balance_info.weight = weight; + } +} + +static double calculate_total_time(const vector<WorkBalanceInfo> &work_balance_infos) +{ + double total_time = 0; + for (const WorkBalanceInfo &info : work_balance_infos) { + total_time += info.time_spent; + } + return total_time; +} + +/* The balance is based on equalizing time which devices spent performing a task. Assume that + * average of the observed times is usable for estimating whether more or less work is to be + * scheduled, and how difference in the work scheduling is needed. */ + +bool work_balance_do_rebalance(vector<WorkBalanceInfo> &work_balance_infos) +{ + const int num_infos = work_balance_infos.size(); + + const double total_time = calculate_total_time(work_balance_infos); + const double time_average = total_time / num_infos; + + double total_weight = 0; + vector<double> new_weights; + new_weights.reserve(num_infos); + + /* Equalize the overall average time. This means that we don't make it so every work will perform + * amount of work based on the current average, but that after the weights changes the time will + * equalize. + * Can think of it that if one of the devices is 10% faster than another, then one device needs + * to do 5% less of the current work, and another needs to do 5% more. 
*/ + const double lerp_weight = 1.0 / num_infos; + + bool has_big_difference = false; + + for (const WorkBalanceInfo &info : work_balance_infos) { + const double time_target = lerp(info.time_spent, time_average, lerp_weight); + const double new_weight = info.weight * time_target / info.time_spent; + new_weights.push_back(new_weight); + total_weight += new_weight; + + if (std::fabs(1.0 - time_target / time_average) > 0.02) { + has_big_difference = true; + } + } + + if (!has_big_difference) { + return false; + } + + const double total_weight_inv = 1.0 / total_weight; + for (int i = 0; i < num_infos; ++i) { + WorkBalanceInfo &info = work_balance_infos[i]; + info.weight = new_weights[i] * total_weight_inv; + info.time_spent = 0; + } + + return true; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/work_balancer.h b/intern/cycles/integrator/work_balancer.h new file mode 100644 index 00000000000..94e20ecf054 --- /dev/null +++ b/intern/cycles/integrator/work_balancer.h @@ -0,0 +1,42 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +struct WorkBalanceInfo { + /* Time spent performing corresponding work. */ + double time_spent = 0; + + /* Average occupancy of the device while performing the work. 
*/ + float occupancy = 1.0f; + + /* Normalized weight, which is ready to be used for work balancing (like calculating fraction of + * the big tile which is to be rendered on the device). */ + double weight = 1.0; +}; + +/* Balance work for an initial render interation, before any statistics is known. */ +void work_balance_do_initial(vector<WorkBalanceInfo> &work_balance_infos); + +/* Rebalance work after statistics has been accumulated. + * Returns true if the balancing did change. */ +bool work_balance_do_rebalance(vector<WorkBalanceInfo> &work_balance_infos); + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/work_tile_scheduler.cpp b/intern/cycles/integrator/work_tile_scheduler.cpp new file mode 100644 index 00000000000..3fc99d5b74d --- /dev/null +++ b/intern/cycles/integrator/work_tile_scheduler.cpp @@ -0,0 +1,138 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "integrator/work_tile_scheduler.h" + +#include "device/device_queue.h" +#include "integrator/tile.h" +#include "render/buffers.h" +#include "util/util_atomic.h" +#include "util/util_logging.h" + +CCL_NAMESPACE_BEGIN + +WorkTileScheduler::WorkTileScheduler() +{ +} + +void WorkTileScheduler::set_max_num_path_states(int max_num_path_states) +{ + max_num_path_states_ = max_num_path_states; +} + +void WorkTileScheduler::reset(const BufferParams &buffer_params, int sample_start, int samples_num) +{ + /* Image buffer parameters. 
*/ + image_full_offset_px_.x = buffer_params.full_x; + image_full_offset_px_.y = buffer_params.full_y; + + image_size_px_ = make_int2(buffer_params.width, buffer_params.height); + + offset_ = buffer_params.offset; + stride_ = buffer_params.stride; + + /* Samples parameters. */ + sample_start_ = sample_start; + samples_num_ = samples_num; + + /* Initialize new scheduling. */ + reset_scheduler_state(); +} + +void WorkTileScheduler::reset_scheduler_state() +{ + tile_size_ = tile_calculate_best_size(image_size_px_, samples_num_, max_num_path_states_); + + VLOG(3) << "Will schedule tiles of size " << tile_size_; + + if (VLOG_IS_ON(3)) { + /* The logging is based on multiple tiles scheduled, ignoring overhead of multi-tile scheduling + * and purely focusing on the number of used path states. */ + const int num_path_states_in_tile = tile_size_.width * tile_size_.height * + tile_size_.num_samples; + const int num_tiles = max_num_path_states_ / num_path_states_in_tile; + VLOG(3) << "Number of unused path states: " + << max_num_path_states_ - num_tiles * num_path_states_in_tile; + } + + num_tiles_x_ = divide_up(image_size_px_.x, tile_size_.width); + num_tiles_y_ = divide_up(image_size_px_.y, tile_size_.height); + + total_tiles_num_ = num_tiles_x_ * num_tiles_y_; + num_tiles_per_sample_range_ = divide_up(samples_num_, tile_size_.num_samples); + + next_work_index_ = 0; + total_work_size_ = total_tiles_num_ * num_tiles_per_sample_range_; +} + +bool WorkTileScheduler::get_work(KernelWorkTile *work_tile_, const int max_work_size) +{ + /* Note that the `max_work_size` can be higher than the `max_num_path_states_`: this is because + * the path trace work can decice to use smaller tile sizes and greedily schedule multiple tiles, + * improving overall device occupancy. + * So the `max_num_path_states_` is a "scheduling unit", and the `max_work_size` is a "scheduling + * limit". 
*/ + + DCHECK_NE(max_num_path_states_, 0); + + const int work_index = atomic_fetch_and_add_int32(&next_work_index_, 1); + if (work_index >= total_work_size_) { + return false; + } + + const int sample_range_index = work_index % num_tiles_per_sample_range_; + const int start_sample = sample_range_index * tile_size_.num_samples; + const int tile_index = work_index / num_tiles_per_sample_range_; + const int tile_y = tile_index / num_tiles_x_; + const int tile_x = tile_index - tile_y * num_tiles_x_; + + KernelWorkTile work_tile; + work_tile.x = tile_x * tile_size_.width; + work_tile.y = tile_y * tile_size_.height; + work_tile.w = tile_size_.width; + work_tile.h = tile_size_.height; + work_tile.start_sample = sample_start_ + start_sample; + work_tile.num_samples = min(tile_size_.num_samples, samples_num_ - start_sample); + work_tile.offset = offset_; + work_tile.stride = stride_; + + work_tile.w = min(work_tile.w, image_size_px_.x - work_tile.x); + work_tile.h = min(work_tile.h, image_size_px_.y - work_tile.y); + + work_tile.x += image_full_offset_px_.x; + work_tile.y += image_full_offset_px_.y; + + const int tile_work_size = work_tile.w * work_tile.h * work_tile.num_samples; + + DCHECK_GT(tile_work_size, 0); + + if (max_work_size && tile_work_size > max_work_size) { + /* The work did not fit into the requested limit of the work size. Unschedule the tile, + * allowing others (or ourselves later one) to pick it up. + * + * TODO: Such temporary decrement is not ideal, since it might lead to situation when another + * device sees there is nothing to be done, finishing its work and leaving all work to be + * done by us. 
*/ + atomic_fetch_and_add_int32(&next_work_index_, -1); + return false; + } + + *work_tile_ = work_tile; + + return true; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/work_tile_scheduler.h b/intern/cycles/integrator/work_tile_scheduler.h new file mode 100644 index 00000000000..e4c8f701259 --- /dev/null +++ b/intern/cycles/integrator/work_tile_scheduler.h @@ -0,0 +1,98 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "integrator/tile.h" +#include "util/util_types.h" + +CCL_NAMESPACE_BEGIN + +class BufferParams; + +struct KernelWorkTile; + +/* Scheduler of device work tiles. + * Takes care of feeding multiple devices running in parallel a work which needs to be done. */ +class WorkTileScheduler { + public: + WorkTileScheduler(); + + /* MAximum path states which are allowed to be used by a single scheduled work tile. + * + * Affects the scheduled work size: the work size will be as big as possible, but will not exceed + * this number of states. */ + void set_max_num_path_states(int max_num_path_states); + + /* Scheduling will happen for pixels within a big tile denotes by its parameters. */ + void reset(const BufferParams &buffer_params, int sample_start, int samples_num); + + /* Get work for a device. + * Returns true if there is still work to be done and initialize the work tile to all + * parameters of this work. 
If there is nothing remaining to be done, returns false and the + * work tile is kept unchanged. + * + * Optionally pass max_work_size to do nothing if there is no tile small enough. */ + bool get_work(KernelWorkTile *work_tile, const int max_work_size = 0); + + protected: + void reset_scheduler_state(); + + /* Maximum allowed path states to be used. + * + * TODO(sergey): Naming can be improved. The fact that this is a limiting factor based on the + * number of path states is kind of a detail. Is there a more generic term from the scheduler + * point of view? */ + int max_num_path_states_ = 0; + + /* Offset in pixels within a global buffer. */ + int2 image_full_offset_px_ = make_int2(0, 0); + + /* dimensions of the currently rendering image in pixels. */ + int2 image_size_px_ = make_int2(0, 0); + + /* Offset and stride of the buffer within which scheduing is happenning. + * Will be passed over to the KernelWorkTile. */ + int offset_, stride_; + + /* Start sample of index and number of samples which are to be rendered. + * The scheduler will cover samples range of [start, start + num] over the entire image + * (splitting into a smaller work tiles). */ + int sample_start_ = 0; + int samples_num_ = 0; + + /* Tile size which be scheduled for rendering. */ + TileSize tile_size_; + + /* Number of tiles in X and Y axis of the image. */ + int num_tiles_x_, num_tiles_y_; + + /* Total number of tiles on the image. + * Pre-calculated as `num_tiles_x_ * num_tiles_y_` and re-used in the `get_work()`. + * + * TODO(sergey): Is this an over-optimization? Maybe it's unmeasurable to calculate the value + * in the `get_work()`? */ + int total_tiles_num_ = 0; + + /* In the case when the number of sam[les in the `tile_size_` is lower than samples_num_ denotes + * how many tiles are to be "stacked" to cover the entire requested range of samples. 
*/ + int num_tiles_per_sample_range_ = 0; + + int next_work_index_ = 0; + int total_work_size_ = 0; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 0ce33c51778..4196539a9b1 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -22,68 +22,22 @@ set(INC_SYS ) -set(SRC_CPU_KERNELS - kernels/cpu/kernel.cpp - kernels/cpu/kernel_sse2.cpp - kernels/cpu/kernel_sse3.cpp - kernels/cpu/kernel_sse41.cpp - kernels/cpu/kernel_avx.cpp - kernels/cpu/kernel_avx2.cpp - kernels/cpu/kernel_split.cpp - kernels/cpu/kernel_split_sse2.cpp - kernels/cpu/kernel_split_sse3.cpp - kernels/cpu/kernel_split_sse41.cpp - kernels/cpu/kernel_split_avx.cpp - kernels/cpu/kernel_split_avx2.cpp - kernels/cpu/filter.cpp - kernels/cpu/filter_sse2.cpp - kernels/cpu/filter_sse3.cpp - kernels/cpu/filter_sse41.cpp - kernels/cpu/filter_avx.cpp - kernels/cpu/filter_avx2.cpp +set(SRC_DEVICE_CPU + device/cpu/kernel.cpp + device/cpu/kernel_sse2.cpp + device/cpu/kernel_sse3.cpp + device/cpu/kernel_sse41.cpp + device/cpu/kernel_avx.cpp + device/cpu/kernel_avx2.cpp ) -set(SRC_CUDA_KERNELS - kernels/cuda/kernel.cu - kernels/cuda/kernel_split.cu - kernels/cuda/filter.cu +set(SRC_DEVICE_CUDA + device/cuda/kernel.cu ) -set(SRC_OPENCL_KERNELS - kernels/opencl/kernel_adaptive_stopping.cl - kernels/opencl/kernel_adaptive_filter_x.cl - kernels/opencl/kernel_adaptive_filter_y.cl - kernels/opencl/kernel_adaptive_adjust_samples.cl - kernels/opencl/kernel_bake.cl - kernels/opencl/kernel_base.cl - kernels/opencl/kernel_displace.cl - kernels/opencl/kernel_background.cl - kernels/opencl/kernel_state_buffer_size.cl - kernels/opencl/kernel_split_bundle.cl - kernels/opencl/kernel_data_init.cl - kernels/opencl/kernel_path_init.cl - kernels/opencl/kernel_queue_enqueue.cl - kernels/opencl/kernel_scene_intersect.cl - kernels/opencl/kernel_lamp_emission.cl - kernels/opencl/kernel_do_volume.cl - 
kernels/opencl/kernel_indirect_background.cl - kernels/opencl/kernel_shader_setup.cl - kernels/opencl/kernel_shader_sort.cl - kernels/opencl/kernel_shader_eval.cl - kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl - kernels/opencl/kernel_subsurface_scatter.cl - kernels/opencl/kernel_direct_lighting.cl - kernels/opencl/kernel_shadow_blocked_ao.cl - kernels/opencl/kernel_shadow_blocked_dl.cl - kernels/opencl/kernel_enqueue_inactive.cl - kernels/opencl/kernel_next_iteration_setup.cl - kernels/opencl/kernel_indirect_subsurface.cl - kernels/opencl/kernel_buffer_update.cl - kernels/opencl/filter.cl -) - -set(SRC_OPTIX_KERNELS - kernels/optix/kernel_optix.cu +set(SRC_DEVICE_OPTIX + device/optix/kernel.cu + device/optix/kernel_shader_raytrace.cu ) set(SRC_BVH_HEADERS @@ -105,63 +59,56 @@ set(SRC_HEADERS kernel_bake.h kernel_camera.h kernel_color.h - kernel_compat_cpu.h - kernel_compat_cuda.h - kernel_compat_optix.h - kernel_compat_opencl.h kernel_differential.h kernel_emission.h kernel_film.h - kernel_globals.h kernel_id_passes.h kernel_jitter.h kernel_light.h kernel_light_background.h kernel_light_common.h + kernel_lookup_table.h kernel_math.h kernel_montecarlo.h kernel_passes.h - kernel_path.h - kernel_path_branched.h - kernel_path_common.h kernel_path_state.h - kernel_path_surface.h - kernel_path_subsurface.h - kernel_path_volume.h kernel_profiling.h kernel_projection.h - kernel_queues.h kernel_random.h kernel_shader.h - kernel_shadow.h - kernel_subsurface.h + kernel_shadow_catcher.h kernel_textures.h kernel_types.h - kernel_volume.h kernel_work_stealing.h kernel_write_passes.h ) -set(SRC_KERNELS_CPU_HEADERS - kernel.h - kernels/cpu/kernel_cpu.h - kernels/cpu/kernel_cpu_impl.h - kernels/cpu/kernel_cpu_image.h - kernels/cpu/filter_cpu.h - kernels/cpu/filter_cpu_impl.h +set(SRC_DEVICE_CPU_HEADERS + device/cpu/compat.h + device/cpu/image.h + device/cpu/globals.h + device/cpu/kernel.h + device/cpu/kernel_arch.h + device/cpu/kernel_arch_impl.h ) - 
-set(SRC_KERNELS_CUDA_HEADERS - kernels/cuda/kernel_config.h - kernels/cuda/kernel_cuda_image.h +set(SRC_DEVICE_GPU_HEADERS + device/gpu/image.h + device/gpu/kernel.h + device/gpu/parallel_active_index.h + device/gpu/parallel_prefix_sum.h + device/gpu/parallel_reduce.h + device/gpu/parallel_sorted_index.h ) -set(SRC_KERNELS_OPTIX_HEADERS +set(SRC_DEVICE_CUDA_HEADERS + device/cuda/compat.h + device/cuda/config.h + device/cuda/globals.h ) -set(SRC_KERNELS_OPENCL_HEADERS - kernels/opencl/kernel_split_function.h - kernels/opencl/kernel_opencl_image.h +set(SRC_DEVICE_OPTIX_HEADERS + device/optix/compat.h + device/optix/globals.h ) set(SRC_CLOSURE_HEADERS @@ -259,25 +206,32 @@ set(SRC_GEOM_HEADERS geom/geom_object.h geom/geom_patch.h geom/geom_primitive.h + geom/geom_shader_data.h geom/geom_subd_triangle.h geom/geom_triangle.h geom/geom_triangle_intersect.h geom/geom_volume.h ) -set(SRC_FILTER_HEADERS - filter/filter.h - filter/filter_defines.h - filter/filter_features.h - filter/filter_features_sse.h - filter/filter_kernel.h - filter/filter_nlm_cpu.h - filter/filter_nlm_gpu.h - filter/filter_prefilter.h - filter/filter_reconstruction.h - filter/filter_transform.h - filter/filter_transform_gpu.h - filter/filter_transform_sse.h +set(SRC_INTEGRATOR_HEADERS + integrator/integrator_init_from_bake.h + integrator/integrator_init_from_camera.h + integrator/integrator_intersect_closest.h + integrator/integrator_intersect_shadow.h + integrator/integrator_intersect_subsurface.h + integrator/integrator_intersect_volume_stack.h + integrator/integrator_megakernel.h + integrator/integrator_shade_background.h + integrator/integrator_shade_light.h + integrator/integrator_shade_shadow.h + integrator/integrator_shade_surface.h + integrator/integrator_shade_volume.h + integrator/integrator_state.h + integrator/integrator_state_flow.h + integrator/integrator_state_template.h + integrator/integrator_state_util.h + integrator/integrator_subsurface.h + integrator/integrator_volume_stack.h ) 
set(SRC_UTIL_HEADERS @@ -333,36 +287,6 @@ set(SRC_UTIL_HEADERS ../util/util_types_vector3_impl.h ) -set(SRC_SPLIT_HEADERS - split/kernel_adaptive_adjust_samples.h - split/kernel_adaptive_filter_x.h - split/kernel_adaptive_filter_y.h - split/kernel_adaptive_stopping.h - split/kernel_branched.h - split/kernel_buffer_update.h - split/kernel_data_init.h - split/kernel_direct_lighting.h - split/kernel_do_volume.h - split/kernel_enqueue_inactive.h - split/kernel_holdout_emission_blurring_pathtermination_ao.h - split/kernel_indirect_background.h - split/kernel_indirect_subsurface.h - split/kernel_lamp_emission.h - split/kernel_next_iteration_setup.h - split/kernel_path_init.h - split/kernel_queue_enqueue.h - split/kernel_scene_intersect.h - split/kernel_shader_setup.h - split/kernel_shader_sort.h - split/kernel_shader_eval.h - split/kernel_shadow_blocked_ao.h - split/kernel_shadow_blocked_dl.h - split/kernel_split_common.h - split/kernel_split_data.h - split/kernel_split_data_types.h - split/kernel_subsurface_scatter.h -) - set(LIB ) @@ -393,21 +317,17 @@ if(WITH_CYCLES_CUDA_BINARIES) endif() # build for each arch - set(cuda_sources kernels/cuda/kernel.cu kernels/cuda/kernel_split.cu + set(cuda_sources device/cuda/kernel.cu ${SRC_HEADERS} - ${SRC_KERNELS_CUDA_HEADERS} + ${SRC_DEVICE_GPU_HEADERS} + ${SRC_DEVICE_CUDA_HEADERS} ${SRC_BVH_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} + ${SRC_INTEGRATOR_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS} ) - set(cuda_filter_sources kernels/cuda/filter.cu - ${SRC_HEADERS} - ${SRC_KERNELS_CUDA_HEADERS} - ${SRC_FILTER_HEADERS} - ${SRC_UTIL_HEADERS} - ) set(cuda_cubins) macro(CYCLES_CUDA_KERNEL_ADD arch prev_arch name flags sources experimental) @@ -427,7 +347,7 @@ if(WITH_CYCLES_CUDA_BINARIES) endif() endif() - set(cuda_kernel_src "/kernels/cuda/${name}.cu") + set(cuda_kernel_src "/device/cuda/${name}.cu") set(cuda_flags ${flags} -D CCL_NAMESPACE_BEGIN= @@ -435,7 +355,7 @@ if(WITH_CYCLES_CUDA_BINARIES) -D NVCC -m ${CUDA_BITS} -I 
${CMAKE_CURRENT_SOURCE_DIR}/.. - -I ${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda + -I ${CMAKE_CURRENT_SOURCE_DIR}/device/cuda --use_fast_math -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_file}) @@ -523,14 +443,8 @@ if(WITH_CYCLES_CUDA_BINARIES) endif() if(DEFINED cuda_nvcc_executable AND DEFINED cuda_toolkit_root_dir) # Compile regular kernel - CYCLES_CUDA_KERNEL_ADD(${arch} ${prev_arch} filter "" "${cuda_filter_sources}" FALSE) CYCLES_CUDA_KERNEL_ADD(${arch} ${prev_arch} kernel "" "${cuda_sources}" FALSE) - if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES) - # Compile split kernel - CYCLES_CUDA_KERNEL_ADD(${arch} ${prev_arch} kernel_split "-D __SPLIT__" "${cuda_sources}" FALSE) - endif() - if(WITH_CYCLES_CUDA_BUILD_SERIAL) set(prev_arch ${arch}) endif() @@ -547,15 +461,15 @@ endif() # OptiX PTX modules if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES) - macro(CYCLES_OPTIX_KERNEL_ADD name flags) - set(input "kernels/optix/kernel_optix.cu") + macro(CYCLES_OPTIX_KERNEL_ADD name input flags) set(output "${CMAKE_CURRENT_BINARY_DIR}/${name}.ptx") set(cuda_flags ${flags} -I "${OPTIX_INCLUDE_DIR}" -I "${CMAKE_CURRENT_SOURCE_DIR}/.." 
- -I "${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda" + -I "${CMAKE_CURRENT_SOURCE_DIR}/device/cuda" --use_fast_math + -Wno-deprecated-gpu-targets -o ${output}) if(WITH_NANOVDB) @@ -580,11 +494,13 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES) DEPENDS ${input} ${SRC_HEADERS} - ${SRC_KERNELS_CUDA_HEADERS} - ${SRC_KERNELS_OPTIX_HEADERS} + ${SRC_DEVICE_GPU_HEADERS} + ${SRC_DEVICE_CUDA_HEADERS} + ${SRC_DEVICE_OPTIX_HEADERS} ${SRC_BVH_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} + ${SRC_INTEGRATOR_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS} COMMAND ${CUBIN_CC_ENV} @@ -603,11 +519,13 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES) DEPENDS ${input} ${SRC_HEADERS} - ${SRC_KERNELS_CUDA_HEADERS} - ${SRC_KERNELS_OPTIX_HEADERS} + ${SRC_DEVICE_GPU_HEADERS} + ${SRC_DEVICE_CUDA_HEADERS} + ${SRC_DEVICE_OPTIX_HEADERS} ${SRC_BVH_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} + ${SRC_INTEGRATOR_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS} COMMAND @@ -624,8 +542,14 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES) delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${output}" ${CYCLES_INSTALL_PATH}/lib) endmacro() - CYCLES_OPTIX_KERNEL_ADD(kernel_optix "-D __NO_SHADER_RAYTRACE__") - CYCLES_OPTIX_KERNEL_ADD(kernel_optix_shader_raytrace "--keep-device-functions") + CYCLES_OPTIX_KERNEL_ADD( + kernel_optix + "device/optix/kernel.cu" + "") + CYCLES_OPTIX_KERNEL_ADD( + kernel_optix_shader_raytrace + "device/optix/kernel_shader_raytrace.cu" + "--keep-device-functions") add_custom_target(cycles_kernel_optix ALL DEPENDS ${optix_ptx}) cycles_set_solution_folder(cycles_kernel_optix) @@ -659,62 +583,47 @@ if(WITH_COMPILER_ASAN) endif() endif() -set_source_files_properties(kernels/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}") -set_source_files_properties(kernels/cpu/kernel_split.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}") -set_source_files_properties(kernels/cpu/filter.cpp PROPERTIES COMPILE_FLAGS 
"${CYCLES_KERNEL_FLAGS}") +set_source_files_properties(device/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}") if(CXX_HAS_SSE) - set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") - set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") - set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") - set_source_files_properties(kernels/cpu/kernel_split_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") - set_source_files_properties(kernels/cpu/kernel_split_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") - set_source_files_properties(kernels/cpu/kernel_split_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") - set_source_files_properties(kernels/cpu/filter_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") - set_source_files_properties(kernels/cpu/filter_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") - set_source_files_properties(kernels/cpu/filter_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") + set_source_files_properties(device/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") + set_source_files_properties(device/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") + set_source_files_properties(device/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") endif() if(CXX_HAS_AVX) - set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") - set_source_files_properties(kernels/cpu/kernel_split_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") - set_source_files_properties(kernels/cpu/filter_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") + set_source_files_properties(device/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS 
"${CYCLES_AVX_KERNEL_FLAGS}") endif() if(CXX_HAS_AVX2) - set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") - set_source_files_properties(kernels/cpu/kernel_split_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") - set_source_files_properties(kernels/cpu/filter_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") + set_source_files_properties(device/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") endif() cycles_add_library(cycles_kernel "${LIB}" - ${SRC_CPU_KERNELS} - ${SRC_CUDA_KERNELS} - ${SRC_OPTIX_KERNELS} - ${SRC_OPENCL_KERNELS} + ${SRC_DEVICE_CPU} + ${SRC_DEVICE_CUDA} + ${SRC_DEVICE_OPTIX} ${SRC_HEADERS} - ${SRC_KERNELS_CPU_HEADERS} - ${SRC_KERNELS_CUDA_HEADERS} - ${SRC_KERNELS_OPTIX_HEADERS} - ${SRC_KERNELS_OPENCL_HEADERS} + ${SRC_DEVICE_CPU_HEADERS} + ${SRC_DEVICE_GPU_HEADERS} + ${SRC_DEVICE_CUDA_HEADERS} + ${SRC_DEVICE_OPTIX_HEADERS} ${SRC_BVH_HEADERS} ${SRC_CLOSURE_HEADERS} - ${SRC_FILTER_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} - ${SRC_SPLIT_HEADERS} + ${SRC_INTEGRATOR_HEADERS} ) source_group("bvh" FILES ${SRC_BVH_HEADERS}) source_group("closure" FILES ${SRC_CLOSURE_HEADERS}) -source_group("filter" FILES ${SRC_FILTER_HEADERS}) source_group("geom" FILES ${SRC_GEOM_HEADERS}) +source_group("integrator" FILES ${SRC_INTEGRATOR_HEADERS}) source_group("kernel" FILES ${SRC_HEADERS}) -source_group("kernel\\split" FILES ${SRC_SPLIT_HEADERS}) -source_group("kernels\\cpu" FILES ${SRC_CPU_KERNELS} ${SRC_KERNELS_CPU_HEADERS}) -source_group("kernels\\cuda" FILES ${SRC_CUDA_KERNELS} ${SRC_KERNELS_CUDA_HEADERS}) -source_group("kernels\\opencl" FILES ${SRC_OPENCL_KERNELS} ${SRC_KERNELS_OPENCL_HEADERS}) -source_group("kernels\\optix" FILES ${SRC_OPTIX_KERNELS} ${SRC_KERNELS_OPTIX_HEADERS}) +source_group("device\\cpu" FILES ${SRC_DEVICE_CPU} ${SRC_DEVICE_CPU_HEADERS}) +source_group("device\\gpu" FILES ${SRC_DEVICE_GPU_HEADERS}) 
+source_group("device\\cuda" FILES ${SRC_DEVICE_CUDA} ${SRC_DEVICE_CUDA_HEADERS}) +source_group("device\\optix" FILES ${SRC_DEVICE_OPTIX} ${SRC_DEVICE_OPTIX_HEADERS}) source_group("svm" FILES ${SRC_SVM_HEADERS}) if(WITH_CYCLES_CUDA) @@ -724,31 +633,20 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES) add_dependencies(cycles_kernel cycles_kernel_optix) endif() -# OpenCL kernel - -# set(KERNEL_PREPROCESSED ${CMAKE_CURRENT_BINARY_DIR}/kernel_preprocessed.cl) -# add_custom_command( -# OUTPUT ${KERNEL_PREPROCESSED} -# COMMAND gcc -x c++ -E ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cl -I ${CMAKE_CURRENT_SOURCE_DIR}/../util/ -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -o ${KERNEL_PREPROCESSED} -# DEPENDS ${SRC_KERNEL} ${SRC_UTIL_HEADERS}) -# add_custom_target(cycles_kernel_preprocess ALL DEPENDS ${KERNEL_PREPROCESSED}) -# delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${KERNEL_PREPROCESSED}" ${CYCLES_INSTALL_PATH}/kernel) +# Install kernel source for runtime compilation -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_OPENCL_KERNELS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CUDA_KERNELS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_OPTIX_KERNELS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/optix) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_CUDA}" ${CYCLES_INSTALL_PATH}/source/kernel/device/cuda) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_OPTIX}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_OPENCL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} 
"${SRC_KERNELS_OPTIX_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/optix) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_GPU_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/gpu) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/cuda) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_OPTIX_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_FILTER_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/filter) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/svm) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_INTEGRATOR_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/integrator) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/util) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SPLIT_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/split) - if(WITH_NANOVDB) set(SRC_NANOVDB_HEADERS diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h index acf29cf1baf..539e9fd05fb 100644 --- a/intern/cycles/kernel/bvh/bvh.h +++ b/intern/cycles/kernel/bvh/bvh.h @@ -25,6 +25,8 @@ * the code has been extended and modified to support more primitives and work * with CPU/CUDA/OpenCL. 
*/ +#pragma once + #ifdef __EMBREE__ # include "kernel/bvh/bvh_embree.h" #endif @@ -152,13 +154,11 @@ ccl_device_inline bool scene_intersect_valid(const Ray *ray) return isfinite_safe(ray->P.x) && isfinite_safe(ray->D.x) && len_squared(ray->D) != 0.0f; } -ccl_device_intersect bool scene_intersect(KernelGlobals *kg, +ccl_device_intersect bool scene_intersect(const KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect) { - PROFILING_INIT(kg, PROFILING_INTERSECT); - #ifdef __KERNEL_OPTIX__ uint p0 = 0; uint p1 = 0; @@ -238,15 +238,13 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg, } #ifdef __BVH_LOCAL__ -ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg, +ccl_device_intersect bool scene_intersect_local(const KernelGlobals *kg, const Ray *ray, LocalIntersection *local_isect, int local_object, uint *lcg_state, int max_hits) { - PROFILING_INIT(kg, PROFILING_INTERSECT_LOCAL); - # ifdef __KERNEL_OPTIX__ uint p0 = ((uint64_t)lcg_state) & 0xFFFFFFFF; uint p1 = (((uint64_t)lcg_state) >> 32) & 0xFFFFFFFF; @@ -313,8 +311,8 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg, float3 dir = ray->D; float3 idir = ray->D; Transform ob_itfm; - rtc_ray.tfar = bvh_instance_motion_push( - kg, local_object, ray, &P, &dir, &idir, ray->t, &ob_itfm); + rtc_ray.tfar = ray->t * + bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir, &ob_itfm); /* bvh_instance_motion_push() returns the inverse transform but * it's not needed here. 
*/ (void)ob_itfm; @@ -353,15 +351,13 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg, #endif #ifdef __SHADOW_RECORD_ALL__ -ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, +ccl_device_intersect bool scene_intersect_shadow_all(const KernelGlobals *kg, const Ray *ray, Intersection *isect, uint visibility, uint max_hits, uint *num_hits) { - PROFILING_INIT(kg, PROFILING_INTERSECT_SHADOW_ALL); - # ifdef __KERNEL_OPTIX__ uint p0 = ((uint64_t)isect) & 0xFFFFFFFF; uint p1 = (((uint64_t)isect) >> 32) & 0xFFFFFFFF; @@ -401,17 +397,13 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_SHADOW_ALL); ctx.isect_s = isect; ctx.max_hits = max_hits; - ctx.num_hits = 0; IntersectContext rtc_ctx(&ctx); RTCRay rtc_ray; kernel_embree_setup_ray(*ray, rtc_ray, visibility); rtcOccluded1(kernel_data.bvh.scene, &rtc_ctx.context, &rtc_ray); - if (ctx.num_hits > max_hits) { - return true; - } *num_hits = ctx.num_hits; - return rtc_ray.tfar == -INFINITY; + return ctx.opaque_hit; } # endif /* __EMBREE__ */ @@ -439,13 +431,11 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, #endif /* __SHADOW_RECORD_ALL__ */ #ifdef __VOLUME__ -ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg, +ccl_device_intersect bool scene_intersect_volume(const KernelGlobals *kg, const Ray *ray, Intersection *isect, const uint visibility) { - PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME); - # ifdef __KERNEL_OPTIX__ uint p0 = 0; uint p1 = 0; @@ -498,14 +488,12 @@ ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg, #endif /* __VOLUME__ */ #ifdef __VOLUME_RECORD_ALL__ -ccl_device_intersect uint scene_intersect_volume_all(KernelGlobals *kg, +ccl_device_intersect uint scene_intersect_volume_all(const KernelGlobals *kg, const Ray *ray, Intersection *isect, const uint max_hits, const uint visibility) { - PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME_ALL); 
- if (!scene_intersect_valid(ray)) { return false; } diff --git a/intern/cycles/kernel/bvh/bvh_embree.h b/intern/cycles/kernel/bvh/bvh_embree.h index 4605c3ea51d..092d770dcac 100644 --- a/intern/cycles/kernel/bvh/bvh_embree.h +++ b/intern/cycles/kernel/bvh/bvh_embree.h @@ -14,14 +14,13 @@ * limitations under the License. */ +#pragma once + #include <embree3/rtcore_ray.h> #include <embree3/rtcore_scene.h> -// clang-format off -#include "kernel/kernel_compat_cpu.h" -#include "kernel/split/kernel_split_data_types.h" -#include "kernel/kernel_globals.h" -// clang-format on +#include "kernel/device/cpu/compat.h" +#include "kernel/device/cpu/globals.h" #include "util/util_vector.h" @@ -36,25 +35,29 @@ struct CCLIntersectContext { RAY_VOLUME_ALL = 4, } RayType; - KernelGlobals *kg; + const KernelGlobals *kg; RayType type; /* for shadow rays */ Intersection *isect_s; int max_hits; int num_hits; + float max_t; + bool opaque_hit; /* for SSS Rays: */ LocalIntersection *local_isect; int local_object_id; uint *lcg_state; - CCLIntersectContext(KernelGlobals *kg_, RayType type_) + CCLIntersectContext(const KernelGlobals *kg_, RayType type_) { kg = kg_; type = type_; max_hits = 1; num_hits = 0; + max_t = FLT_MAX; + opaque_hit = false; isect_s = NULL; local_isect = NULL; local_object_id = -1; @@ -98,7 +101,7 @@ ccl_device_inline void kernel_embree_setup_rayhit(const Ray &ray, rayhit.hit.primID = RTC_INVALID_GEOMETRY_ID; } -ccl_device_inline void kernel_embree_convert_hit(KernelGlobals *kg, +ccl_device_inline void kernel_embree_convert_hit(const KernelGlobals *kg, const RTCRay *ray, const RTCHit *hit, Intersection *isect) @@ -123,7 +126,7 @@ ccl_device_inline void kernel_embree_convert_hit(KernelGlobals *kg, isect->type = kernel_tex_fetch(__prim_type, isect->prim); } -ccl_device_inline void kernel_embree_convert_sss_hit(KernelGlobals *kg, +ccl_device_inline void kernel_embree_convert_sss_hit(const KernelGlobals *kg, const RTCRay *ray, const RTCHit *hit, Intersection *isect, diff 
--git a/intern/cycles/kernel/bvh/bvh_local.h b/intern/cycles/kernel/bvh/bvh_local.h index 4006c9c1632..90b9f410b29 100644 --- a/intern/cycles/kernel/bvh/bvh_local.h +++ b/intern/cycles/kernel/bvh/bvh_local.h @@ -36,7 +36,7 @@ ccl_device #else ccl_device_inline #endif - bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, + bool BVH_FUNCTION_FULL_NAME(BVH)(const KernelGlobals *kg, const Ray *ray, LocalIntersection *local_isect, int local_object, @@ -74,9 +74,9 @@ ccl_device_inline if (!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { #if BVH_FEATURE(BVH_MOTION) Transform ob_itfm; - isect_t = bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir, isect_t, &ob_itfm); + isect_t *= bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir, &ob_itfm); #else - isect_t = bvh_instance_push(kg, local_object, ray, &P, &dir, &idir, isect_t); + isect_t *= bvh_instance_push(kg, local_object, ray, &P, &dir, &idir); #endif object = local_object; } @@ -196,7 +196,7 @@ ccl_device_inline return false; } -ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, +ccl_device_inline bool BVH_FUNCTION_NAME(const KernelGlobals *kg, const Ray *ray, LocalIntersection *local_isect, int local_object, diff --git a/intern/cycles/kernel/bvh/bvh_nodes.h b/intern/cycles/kernel/bvh/bvh_nodes.h index 5367bdb633c..15cd0f22213 100644 --- a/intern/cycles/kernel/bvh/bvh_nodes.h +++ b/intern/cycles/kernel/bvh/bvh_nodes.h @@ -16,7 +16,7 @@ // TODO(sergey): Look into avoid use of full Transform and use 3x3 matrix and // 3-vector which might be faster. 
-ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg, +ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(const KernelGlobals *kg, int node_addr, int child) { @@ -28,7 +28,7 @@ ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *k return space; } -ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg, +ccl_device_forceinline int bvh_aligned_node_intersect(const KernelGlobals *kg, const float3 P, const float3 idir, const float t, @@ -76,7 +76,7 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg, #endif } -ccl_device_forceinline bool bvh_unaligned_node_intersect_child(KernelGlobals *kg, +ccl_device_forceinline bool bvh_unaligned_node_intersect_child(const KernelGlobals *kg, const float3 P, const float3 dir, const float t, @@ -102,7 +102,7 @@ ccl_device_forceinline bool bvh_unaligned_node_intersect_child(KernelGlobals *kg return tnear <= tfar; } -ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg, +ccl_device_forceinline int bvh_unaligned_node_intersect(const KernelGlobals *kg, const float3 P, const float3 dir, const float3 idir, @@ -134,7 +134,7 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg, return mask; } -ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg, +ccl_device_forceinline int bvh_node_intersect(const KernelGlobals *kg, const float3 P, const float3 dir, const float3 idir, diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h index 2e94b1d7c37..0ae36fccf9b 100644 --- a/intern/cycles/kernel/bvh/bvh_shadow_all.h +++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h @@ -36,7 +36,7 @@ ccl_device #else ccl_device_inline #endif - bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, + bool BVH_FUNCTION_FULL_NAME(BVH)(const KernelGlobals *kg, const Ray *ray, Intersection *isect_array, const uint visibility, @@ -68,10 +68,10 @@ ccl_device_inline Transform 
ob_itfm; #endif - int num_hits_in_instance = 0; + float t_world_to_instance = 1.0f; *num_hits = 0; - isect_array->t = tmax; + Intersection *isect = isect_array; /* traversal loop */ do { @@ -147,13 +147,14 @@ ccl_device_inline switch (p_type) { case PRIMITIVE_TRIANGLE: { - hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr); + hit = triangle_intersect( + kg, isect, P, dir, isect_t, visibility, object, prim_addr); break; } #if BVH_FEATURE(BVH_MOTION) case PRIMITIVE_MOTION_TRIANGLE: { hit = motion_triangle_intersect( - kg, isect_array, P, dir, ray->time, visibility, object, prim_addr); + kg, isect, P, dir, isect_t, ray->time, visibility, object, prim_addr); break; } #endif @@ -163,8 +164,16 @@ ccl_device_inline case PRIMITIVE_CURVE_RIBBON: case PRIMITIVE_MOTION_CURVE_RIBBON: { const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr); - hit = curve_intersect( - kg, isect_array, P, dir, visibility, object, prim_addr, ray->time, curve_type); + hit = curve_intersect(kg, + isect, + P, + dir, + isect_t, + visibility, + object, + prim_addr, + ray->time, + curve_type); break; } #endif @@ -176,27 +185,49 @@ ccl_device_inline /* shadow ray early termination */ if (hit) { + /* Convert intersection distance to world space. */ + isect->t /= t_world_to_instance; + /* detect if this surface has a shader with transparent shadows */ /* todo: optimize so primitive visibility flag indicates if * the primitive has a transparent shadow shader? */ - const int flags = intersection_get_shader_flags(kg, isect_array); + const int flags = intersection_get_shader_flags(kg, isect); - /* if no transparent shadows, all light is blocked */ - if (!(flags & SD_HAS_TRANSPARENT_SHADOW)) { - return true; - } - /* if maximum number of hits reached, block all light */ - else if (*num_hits == max_hits) { + if (!(flags & SD_HAS_TRANSPARENT_SHADOW) || max_hits == 0) { + /* If no transparent shadows, all light is blocked and we can + * stop immediately. 
*/ return true; } - /* move on to next entry in intersections array */ - isect_array++; + /* Increase the number of hits, possibly beyond max_hits, we will + * simply not record those and only keep the max_hits closest. */ (*num_hits)++; - num_hits_in_instance++; - isect_array->t = isect_t; + if (*num_hits >= max_hits) { + /* If maximum number of hits reached, find the intersection with + * the largest distance to potentially replace when another hit + * is found. */ + const int num_recorded_hits = min(max_hits, *num_hits); + float max_recorded_t = isect_array[0].t; + int max_recorded_hit = 0; + + for (int i = 1; i < num_recorded_hits; i++) { + if (isect_array[i].t > max_recorded_t) { + max_recorded_t = isect_array[i].t; + max_recorded_hit = i; + } + } + + isect = isect_array + max_recorded_hit; + + /* Limit the ray distance and stop counting hits beyond this. */ + isect_t = max_recorded_t * t_world_to_instance; + } + else { + /* Still have space for intersection, use next hit. */ + isect = isect + 1; + } } prim_addr++; @@ -207,13 +238,14 @@ ccl_device_inline object = kernel_tex_fetch(__prim_object, -prim_addr - 1); #if BVH_FEATURE(BVH_MOTION) - isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); + t_world_to_instance = bvh_instance_motion_push( + kg, object, ray, &P, &dir, &idir, &ob_itfm); #else - isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); + t_world_to_instance = bvh_instance_push(kg, object, ray, &P, &dir, &idir); #endif - num_hits_in_instance = 0; - isect_array->t = isect_t; + /* Convert intersection to object space. */ + isect_t *= t_world_to_instance; ++stack_ptr; kernel_assert(stack_ptr < BVH_STACK_SIZE); @@ -228,32 +260,19 @@ ccl_device_inline kernel_assert(object != OBJECT_NONE); /* Instance pop. 
*/ - if (num_hits_in_instance) { - float t_fac; - #if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm); + bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); #else - bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac); + bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); #endif - /* scale isect->t to adjust for instancing */ - for (int i = 0; i < num_hits_in_instance; i++) { - (isect_array - i - 1)->t *= t_fac; - } - } - else { -#if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); -#else - bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); -#endif - } - - isect_t = tmax; - isect_array->t = isect_t; + /* Restore world space ray length. If max number of hits exceeded this + * distance is reduced to recorded only the closest hits. If not use + * the original ray length. */ + isect_t = (max_hits && *num_hits > max_hits) ? 
isect->t : tmax; object = OBJECT_NONE; + t_world_to_instance = 1.0f; node_addr = traversal_stack[stack_ptr]; --stack_ptr; } @@ -262,7 +281,7 @@ ccl_device_inline return false; } -ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, +ccl_device_inline bool BVH_FUNCTION_NAME(const KernelGlobals *kg, const Ray *ray, Intersection *isect_array, const uint visibility, diff --git a/intern/cycles/kernel/bvh/bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h index 89250a8d60a..a26d8c514f3 100644 --- a/intern/cycles/kernel/bvh/bvh_traversal.h +++ b/intern/cycles/kernel/bvh/bvh_traversal.h @@ -31,7 +31,7 @@ * BVH_MOTION: motion blur rendering */ -ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, +ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(const KernelGlobals *kg, const Ray *ray, Intersection *isect, const uint visibility) @@ -136,7 +136,8 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, case PRIMITIVE_TRIANGLE: { for (; prim_addr < prim_addr2; prim_addr++) { kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - if (triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr)) { + if (triangle_intersect( + kg, isect, P, dir, isect->t, visibility, object, prim_addr)) { /* shadow ray early termination */ if (visibility & PATH_RAY_SHADOW_OPAQUE) return true; @@ -149,7 +150,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, for (; prim_addr < prim_addr2; prim_addr++) { kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); if (motion_triangle_intersect( - kg, isect, P, dir, ray->time, visibility, object, prim_addr)) { + kg, isect, P, dir, isect->t, ray->time, visibility, object, prim_addr)) { /* shadow ray early termination */ if (visibility & PATH_RAY_SHADOW_OPAQUE) return true; @@ -166,8 +167,16 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, for (; prim_addr < prim_addr2; prim_addr++) { const uint curve_type = 
kernel_tex_fetch(__prim_type, prim_addr); kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL)); - const bool hit = curve_intersect( - kg, isect, P, dir, visibility, object, prim_addr, ray->time, curve_type); + const bool hit = curve_intersect(kg, + isect, + P, + dir, + isect->t, + visibility, + object, + prim_addr, + ray->time, + curve_type); if (hit) { /* shadow ray early termination */ if (visibility & PATH_RAY_SHADOW_OPAQUE) @@ -184,10 +193,9 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, object = kernel_tex_fetch(__prim_object, -prim_addr - 1); #if BVH_FEATURE(BVH_MOTION) - isect->t = bvh_instance_motion_push( - kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); + isect->t *= bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &ob_itfm); #else - isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t); + isect->t *= bvh_instance_push(kg, object, ray, &P, &dir, &idir); #endif ++stack_ptr; @@ -218,7 +226,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, return (isect->prim != PRIM_NONE); } -ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, +ccl_device_inline bool BVH_FUNCTION_NAME(const KernelGlobals *kg, const Ray *ray, Intersection *isect, const uint visibility) diff --git a/intern/cycles/kernel/bvh/bvh_types.h b/intern/cycles/kernel/bvh/bvh_types.h index 98e6ec25d15..6039e707fc3 100644 --- a/intern/cycles/kernel/bvh/bvh_types.h +++ b/intern/cycles/kernel/bvh/bvh_types.h @@ -14,8 +14,7 @@ * limitations under the License. 
*/ -#ifndef __BVH_TYPES__ -#define __BVH_TYPES__ +#pragma once CCL_NAMESPACE_BEGIN @@ -43,5 +42,3 @@ CCL_NAMESPACE_BEGIN #define BVH_FEATURE(f) (((BVH_FUNCTION_FEATURES) & (f)) != 0) CCL_NAMESPACE_END - -#endif /* __BVH_TYPES__ */ diff --git a/intern/cycles/kernel/bvh/bvh_util.h b/intern/cycles/kernel/bvh/bvh_util.h index b1faebce957..21384457b16 100644 --- a/intern/cycles/kernel/bvh/bvh_util.h +++ b/intern/cycles/kernel/bvh/bvh_util.h @@ -71,86 +71,6 @@ ccl_device_inline float3 ray_offset(float3 P, float3 Ng) #endif } -/* This function should be used to compute a modified ray start position for - * rays leaving from a surface. The algorithm slightly distorts flat surface - * of a triangle. Surface is lifted by amount h along normal n in the incident - * point. */ - -ccl_device_inline float3 smooth_surface_offset(KernelGlobals *kg, ShaderData *sd, float3 Ng) -{ - float3 V[3], N[3]; - triangle_vertices_and_normals(kg, sd->prim, V, N); - - const float u = sd->u, v = sd->v; - const float w = 1 - u - v; - float3 P = V[0] * u + V[1] * v + V[2] * w; /* Local space */ - float3 n = N[0] * u + N[1] * v + N[2] * w; /* We get away without normalization */ - - object_normal_transform(kg, sd, &n); /* Normal x scale, world space */ - - /* Parabolic approximation */ - float a = dot(N[2] - N[0], V[0] - V[2]); - float b = dot(N[2] - N[1], V[1] - V[2]); - float c = dot(N[1] - N[0], V[1] - V[0]); - float h = a * u * (u - 1) + (a + b + c) * u * v + b * v * (v - 1); - - /* Check flipped normals */ - if (dot(n, Ng) > 0) { - /* Local linear envelope */ - float h0 = max(max(dot(V[1] - V[0], N[0]), dot(V[2] - V[0], N[0])), 0.0f); - float h1 = max(max(dot(V[0] - V[1], N[1]), dot(V[2] - V[1], N[1])), 0.0f); - float h2 = max(max(dot(V[0] - V[2], N[2]), dot(V[1] - V[2], N[2])), 0.0f); - h0 = max(dot(V[0] - P, N[0]) + h0, 0.0f); - h1 = max(dot(V[1] - P, N[1]) + h1, 0.0f); - h2 = max(dot(V[2] - P, N[2]) + h2, 0.0f); - h = max(min(min(h0, h1), h2), h * 0.5f); - } - else { - float h0 = 
max(max(dot(V[0] - V[1], N[0]), dot(V[0] - V[2], N[0])), 0.0f); - float h1 = max(max(dot(V[1] - V[0], N[1]), dot(V[1] - V[2], N[1])), 0.0f); - float h2 = max(max(dot(V[2] - V[0], N[2]), dot(V[2] - V[1], N[2])), 0.0f); - h0 = max(dot(P - V[0], N[0]) + h0, 0.0f); - h1 = max(dot(P - V[1], N[1]) + h1, 0.0f); - h2 = max(dot(P - V[2], N[2]) + h2, 0.0f); - h = min(-min(min(h0, h1), h2), h * 0.5f); - } - - return n * h; -} - -/* Ray offset to avoid shadow terminator artifact. */ - -ccl_device_inline float3 ray_offset_shadow(KernelGlobals *kg, ShaderData *sd, float3 L) -{ - float NL = dot(sd->N, L); - bool transmit = (NL < 0.0f); - float3 Ng = (transmit ? -sd->Ng : sd->Ng); - float3 P = ray_offset(sd->P, Ng); - - if ((sd->type & PRIMITIVE_ALL_TRIANGLE) && (sd->shader & SHADER_SMOOTH_NORMAL)) { - const float offset_cutoff = - kernel_tex_fetch(__objects, sd->object).shadow_terminator_geometry_offset; - /* Do ray offset (heavy stuff) only for close to be terminated triangles: - * offset_cutoff = 0.1f means that 10-20% of rays will be affected. Also - * make a smooth transition near the threshold. */ - if (offset_cutoff > 0.0f) { - float NgL = dot(Ng, L); - float offset_amount = 0.0f; - if (NL < offset_cutoff) { - offset_amount = clamp(2.0f - (NgL + NL) / offset_cutoff, 0.0f, 1.0f); - } - else { - offset_amount = clamp(1.0f - NgL / offset_cutoff, 0.0f, 1.0f); - } - if (offset_amount > 0.0f) { - P += smooth_surface_offset(kg, sd, Ng) * offset_amount; - } - } - } - - return P; -} - #if defined(__VOLUME_RECORD_ALL__) || (defined(__SHADOW_RECORD_ALL__) && defined(__KERNEL_CPU__)) /* ToDo: Move to another file? */ ccl_device int intersections_compare(const void *a, const void *b) @@ -193,10 +113,10 @@ ccl_device_inline void sort_intersections(Intersection *hits, uint num_hits) } #endif /* __SHADOW_RECORD_ALL__ | __VOLUME_RECORD_ALL__ */ -/* Utility to quickly get a shader flags from an intersection. */ +/* Utility to quickly get flags from an intersection. 
*/ -ccl_device_forceinline int intersection_get_shader_flags(KernelGlobals *ccl_restrict kg, - const Intersection *isect) +ccl_device_forceinline int intersection_get_shader_flags(const KernelGlobals *ccl_restrict kg, + const Intersection *ccl_restrict isect) { const int prim = kernel_tex_fetch(__prim_index, isect->prim); int shader = 0; @@ -217,14 +137,14 @@ ccl_device_forceinline int intersection_get_shader_flags(KernelGlobals *ccl_rest return kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags; } -ccl_device_forceinline int intersection_get_shader(KernelGlobals *ccl_restrict kg, - const Intersection *isect) +ccl_device_forceinline int intersection_get_shader_from_isect_prim( + const KernelGlobals *ccl_restrict kg, const int isect_prim) { - const int prim = kernel_tex_fetch(__prim_index, isect->prim); + const int prim = kernel_tex_fetch(__prim_index, isect_prim); int shader = 0; #ifdef __HAIR__ - if (kernel_tex_fetch(__prim_type, isect->prim) & PRIMITIVE_ALL_TRIANGLE) + if (kernel_tex_fetch(__prim_type, isect_prim) & PRIMITIVE_ALL_TRIANGLE) #endif { shader = kernel_tex_fetch(__tri_shader, prim); @@ -239,7 +159,13 @@ ccl_device_forceinline int intersection_get_shader(KernelGlobals *ccl_restrict k return shader & SHADER_MASK; } -ccl_device_forceinline int intersection_get_object(KernelGlobals *ccl_restrict kg, +ccl_device_forceinline int intersection_get_shader(const KernelGlobals *ccl_restrict kg, + const Intersection *ccl_restrict isect) +{ + return intersection_get_shader_from_isect_prim(kg, isect->prim); +} + +ccl_device_forceinline int intersection_get_object(const KernelGlobals *ccl_restrict kg, const Intersection *ccl_restrict isect) { if (isect->object != OBJECT_NONE) { @@ -249,4 +175,12 @@ ccl_device_forceinline int intersection_get_object(KernelGlobals *ccl_restrict k return kernel_tex_fetch(__prim_object, isect->prim); } +ccl_device_forceinline int intersection_get_object_flags(const KernelGlobals *ccl_restrict kg, + const Intersection 
*ccl_restrict isect) +{ + const int object = intersection_get_object(kg, isect); + + return kernel_tex_fetch(__object_flag, object); +} + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/bvh/bvh_volume.h b/intern/cycles/kernel/bvh/bvh_volume.h index 1f2ea47269b..0411d9c522d 100644 --- a/intern/cycles/kernel/bvh/bvh_volume.h +++ b/intern/cycles/kernel/bvh/bvh_volume.h @@ -35,7 +35,7 @@ ccl_device #else ccl_device_inline #endif - bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, + bool BVH_FUNCTION_FULL_NAME(BVH)(const KernelGlobals *kg, const Ray *ray, Intersection *isect, const uint visibility) @@ -147,7 +147,7 @@ ccl_device_inline if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { continue; } - triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr); + triangle_intersect(kg, isect, P, dir, isect->t, visibility, object, prim_addr); } break; } @@ -165,7 +165,7 @@ ccl_device_inline continue; } motion_triangle_intersect( - kg, isect, P, dir, ray->time, visibility, object, prim_addr); + kg, isect, P, dir, isect->t, ray->time, visibility, object, prim_addr); } break; } @@ -181,10 +181,9 @@ ccl_device_inline int object_flag = kernel_tex_fetch(__object_flag, object); if (object_flag & SD_OBJECT_HAS_VOLUME) { #if BVH_FEATURE(BVH_MOTION) - isect->t = bvh_instance_motion_push( - kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); + isect->t *= bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &ob_itfm); #else - isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t); + isect->t *= bvh_instance_push(kg, object, ray, &P, &dir, &idir); #endif ++stack_ptr; @@ -222,7 +221,7 @@ ccl_device_inline return (isect->prim != PRIM_NONE); } -ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, +ccl_device_inline bool BVH_FUNCTION_NAME(const KernelGlobals *kg, const Ray *ray, Intersection *isect, const uint visibility) diff --git a/intern/cycles/kernel/bvh/bvh_volume_all.h b/intern/cycles/kernel/bvh/bvh_volume_all.h index 
a8664cc4331..4874270f15d 100644 --- a/intern/cycles/kernel/bvh/bvh_volume_all.h +++ b/intern/cycles/kernel/bvh/bvh_volume_all.h @@ -35,7 +35,7 @@ ccl_device #else ccl_device_inline #endif - uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, + uint BVH_FUNCTION_FULL_NAME(BVH)(const KernelGlobals *kg, const Ray *ray, Intersection *isect_array, const uint max_hits, @@ -150,7 +150,8 @@ ccl_device_inline if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { continue; } - hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr); + hit = triangle_intersect( + kg, isect_array, P, dir, isect_t, visibility, object, prim_addr); if (hit) { /* Move on to next entry in intersections array. */ isect_array++; @@ -190,7 +191,7 @@ ccl_device_inline continue; } hit = motion_triangle_intersect( - kg, isect_array, P, dir, ray->time, visibility, object, prim_addr); + kg, isect_array, P, dir, isect_t, ray->time, visibility, object, prim_addr); if (hit) { /* Move on to next entry in intersections array. 
*/ isect_array++; @@ -228,10 +229,9 @@ ccl_device_inline int object_flag = kernel_tex_fetch(__object_flag, object); if (object_flag & SD_OBJECT_HAS_VOLUME) { #if BVH_FEATURE(BVH_MOTION) - isect_t = bvh_instance_motion_push( - kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); + isect_t *= bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &ob_itfm); #else - isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); + isect_t *= bvh_instance_push(kg, object, ray, &P, &dir, &idir); #endif num_hits_in_instance = 0; @@ -289,7 +289,7 @@ ccl_device_inline return num_hits; } -ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg, +ccl_device_inline uint BVH_FUNCTION_NAME(const KernelGlobals *kg, const Ray *ray, Intersection *isect_array, const uint max_hits, diff --git a/intern/cycles/kernel/closure/alloc.h b/intern/cycles/kernel/closure/alloc.h index 99a5a675976..72a8c2ba090 100644 --- a/intern/cycles/kernel/closure/alloc.h +++ b/intern/cycles/kernel/closure/alloc.h @@ -14,6 +14,8 @@ * limitations under the License. */ +#pragma once + CCL_NAMESPACE_BEGIN ccl_device ShaderClosure *closure_alloc(ShaderData *sd, int size, ClosureType type, float3 weight) diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h index 6f2f2ebb202..4eb8bcae997 100644 --- a/intern/cycles/kernel/closure/bsdf.h +++ b/intern/cycles/kernel/closure/bsdf.h @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#pragma once + // clang-format off #include "kernel/closure/bsdf_ashikhmin_velvet.h" #include "kernel/closure/bsdf_diffuse.h" @@ -109,7 +111,7 @@ ccl_device_inline float shift_cos_in(float cos_in, const float frequency_multipl return val; } -ccl_device_inline int bsdf_sample(KernelGlobals *kg, +ccl_device_inline int bsdf_sample(const KernelGlobals *kg, ShaderData *sd, const ShaderClosure *sc, float randu, @@ -429,21 +431,6 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg, break; # endif /* __PRINCIPLED__ */ #endif -#ifdef __VOLUME__ - case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: - label = volume_henyey_greenstein_sample(sc, - sd->I, - sd->dI.dx, - sd->dI.dy, - randu, - randv, - eval, - omega_in, - &domega_in->dx, - &domega_in->dy, - pdf); - break; -#endif default: label = LABEL_NONE; break; @@ -482,15 +469,16 @@ ccl_device ccl_device_inline #endif float3 - bsdf_eval(KernelGlobals *kg, + bsdf_eval(const KernelGlobals *kg, ShaderData *sd, const ShaderClosure *sc, const float3 omega_in, + const bool is_transmission, float *pdf) { - float3 eval; + float3 eval = zero_float3(); - if (dot(sd->N, omega_in) >= 0.0f) { + if (!is_transmission) { switch (sc->type) { case CLOSURE_BSDF_DIFFUSE_ID: case CLOSURE_BSDF_BSSRDF_ID: @@ -570,13 +558,7 @@ ccl_device_inline break; # endif /* __PRINCIPLED__ */ #endif -#ifdef __VOLUME__ - case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: - eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf); - break; -#endif default: - eval = make_float3(0.0f, 0.0f, 0.0f); break; } if (CLOSURE_IS_BSDF_DIFFUSE(sc->type)) { @@ -663,13 +645,7 @@ ccl_device_inline break; # endif /* __PRINCIPLED__ */ #endif -#ifdef __VOLUME__ - case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: - eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf); - break; -#endif default: - eval = make_float3(0.0f, 0.0f, 0.0f); break; } if (CLOSURE_IS_BSDF_DIFFUSE(sc->type)) { @@ -682,7 +658,7 @@ ccl_device_inline return eval; } -ccl_device void bsdf_blur(KernelGlobals 
*kg, ShaderClosure *sc, float roughness) +ccl_device void bsdf_blur(const KernelGlobals *kg, ShaderClosure *sc, float roughness) { /* ToDo: do we want to blur volume closures? */ #ifdef __SVM__ @@ -715,55 +691,4 @@ ccl_device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness) #endif } -ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b) -{ -#ifdef __SVM__ - switch (a->type) { - case CLOSURE_BSDF_TRANSPARENT_ID: - return true; - case CLOSURE_BSDF_DIFFUSE_ID: - case CLOSURE_BSDF_BSSRDF_ID: - case CLOSURE_BSDF_TRANSLUCENT_ID: - return bsdf_diffuse_merge(a, b); - case CLOSURE_BSDF_OREN_NAYAR_ID: - return bsdf_oren_nayar_merge(a, b); - case CLOSURE_BSDF_REFLECTION_ID: - case CLOSURE_BSDF_REFRACTION_ID: - case CLOSURE_BSDF_MICROFACET_GGX_ID: - case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID: - case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID: - case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: - case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID: - case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID: - case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID: - case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID: - case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: - case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: - case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID: - return bsdf_microfacet_merge(a, b); - case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: - return bsdf_ashikhmin_velvet_merge(a, b); - case CLOSURE_BSDF_DIFFUSE_TOON_ID: - case CLOSURE_BSDF_GLOSSY_TOON_ID: - return bsdf_toon_merge(a, b); - case CLOSURE_BSDF_HAIR_REFLECTION_ID: - case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: - return bsdf_hair_merge(a, b); -# ifdef __PRINCIPLED__ - case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID: - case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID: - return bsdf_principled_diffuse_merge(a, b); -# endif -# ifdef __VOLUME__ - case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: - return volume_henyey_greenstein_merge(a, b); -# endif - default: - return false; - } -#else - return false; -#endif -} - CCL_NAMESPACE_END diff --git 
a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h index 9814a7cf5c9..be6383e521a 100644 --- a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h +++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h @@ -14,20 +14,19 @@ * limitations under the License. */ -#ifndef __BSDF_ASHIKHMIN_SHIRLEY_H__ -#define __BSDF_ASHIKHMIN_SHIRLEY_H__ - /* -ASHIKHMIN SHIRLEY BSDF - -Implementation of -Michael Ashikhmin and Peter Shirley: "An Anisotropic Phong BRDF Model" (2000) - -The Fresnel factor is missing to get a separable bsdf (intensity*color), as is -the case with all other microfacet-based BSDF implementations in Cycles. + * ASHIKHMIN SHIRLEY BSDF + * + * Implementation of + * Michael Ashikhmin and Peter Shirley: "An Anisotropic Phong BRDF Model" (2000) + * + * The Fresnel factor is missing to get a separable bsdf (intensity*color), as is + * the case with all other microfacet-based BSDF implementations in Cycles. + * + * Other than that, the implementation directly follows the paper. + */ -Other than that, the implementation directly follows the paper. -*/ +#pragma once CCL_NAMESPACE_BEGIN @@ -240,5 +239,3 @@ ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc, } CCL_NAMESPACE_END - -#endif /* __BSDF_ASHIKHMIN_SHIRLEY_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h index 3d3f20edab3..f51027f5701 100644 --- a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h +++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h @@ -30,8 +30,9 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __BSDF_ASHIKHMIN_VELVET_H__ -#define __BSDF_ASHIKHMIN_VELVET_H__ +#pragma once + +#include "kernel/kernel_montecarlo.h" CCL_NAMESPACE_BEGIN @@ -54,14 +55,6 @@ ccl_device int bsdf_ashikhmin_velvet_setup(VelvetBsdf *bsdf) return SD_BSDF | SD_BSDF_HAS_EVAL; } -ccl_device bool bsdf_ashikhmin_velvet_merge(const ShaderClosure *a, const ShaderClosure *b) -{ - const VelvetBsdf *bsdf_a = (const VelvetBsdf *)a; - const VelvetBsdf *bsdf_b = (const VelvetBsdf *)b; - - return (isequal_float3(bsdf_a->N, bsdf_b->N)) && (bsdf_a->sigma == bsdf_b->sigma); -} - ccl_device float3 bsdf_ashikhmin_velvet_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, @@ -175,5 +168,3 @@ ccl_device int bsdf_ashikhmin_velvet_sample(const ShaderClosure *sc, } CCL_NAMESPACE_END - -#endif /* __BSDF_ASHIKHMIN_VELVET_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_diffuse.h b/intern/cycles/kernel/closure/bsdf_diffuse.h index ea604ed0311..1555aa30304 100644 --- a/intern/cycles/kernel/closure/bsdf_diffuse.h +++ b/intern/cycles/kernel/closure/bsdf_diffuse.h @@ -30,8 +30,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __BSDF_DIFFUSE_H__ -#define __BSDF_DIFFUSE_H__ +#pragma once CCL_NAMESPACE_BEGIN @@ -49,14 +48,6 @@ ccl_device int bsdf_diffuse_setup(DiffuseBsdf *bsdf) return SD_BSDF | SD_BSDF_HAS_EVAL; } -ccl_device bool bsdf_diffuse_merge(const ShaderClosure *a, const ShaderClosure *b) -{ - const DiffuseBsdf *bsdf_a = (const DiffuseBsdf *)a; - const DiffuseBsdf *bsdf_b = (const DiffuseBsdf *)b; - - return (isequal_float3(bsdf_a->N, bsdf_b->N)); -} - ccl_device float3 bsdf_diffuse_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, @@ -174,5 +165,3 @@ ccl_device int bsdf_translucent_sample(const ShaderClosure *sc, } CCL_NAMESPACE_END - -#endif /* __BSDF_DIFFUSE_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h index aa62c1c7ceb..b06dd196b9e 100644 --- a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h +++ b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h @@ -30,8 +30,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __BSDF_DIFFUSE_RAMP_H__ -#define __BSDF_DIFFUSE_RAMP_H__ +#pragma once CCL_NAMESPACE_BEGIN @@ -125,5 +124,3 @@ ccl_device int bsdf_diffuse_ramp_sample(const ShaderClosure *sc, #endif /* __OSL__ */ CCL_NAMESPACE_END - -#endif /* __BSDF_DIFFUSE_RAMP_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_hair.h b/intern/cycles/kernel/closure/bsdf_hair.h index 7ca9424b815..f56f78aa1f0 100644 --- a/intern/cycles/kernel/closure/bsdf_hair.h +++ b/intern/cycles/kernel/closure/bsdf_hair.h @@ -30,8 +30,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __BSDF_HAIR_H__ -#define __BSDF_HAIR_H__ +#pragma once CCL_NAMESPACE_BEGIN @@ -62,15 +61,6 @@ ccl_device int bsdf_hair_transmission_setup(HairBsdf *bsdf) return SD_BSDF | SD_BSDF_HAS_EVAL; } -ccl_device bool bsdf_hair_merge(const ShaderClosure *a, const ShaderClosure *b) -{ - const HairBsdf *bsdf_a = (const HairBsdf *)a; - const HairBsdf *bsdf_b = (const HairBsdf *)b; - - return (isequal_float3(bsdf_a->T, bsdf_b->T)) && (bsdf_a->roughness1 == bsdf_b->roughness1) && - (bsdf_a->roughness2 == bsdf_b->roughness2) && (bsdf_a->offset == bsdf_b->offset); -} - ccl_device float3 bsdf_hair_reflection_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, @@ -309,5 +299,3 @@ ccl_device int bsdf_hair_transmission_sample(const ShaderClosure *sc, } CCL_NAMESPACE_END - -#endif /* __BSDF_HAIR_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_hair_principled.h b/intern/cycles/kernel/closure/bsdf_hair_principled.h index f12661b3095..bfe56e5ab0e 100644 --- a/intern/cycles/kernel/closure/bsdf_hair_principled.h +++ b/intern/cycles/kernel/closure/bsdf_hair_principled.h @@ -14,15 +14,14 @@ * limitations under the License. */ +#pragma once + #ifdef __KERNEL_CPU__ # include <fenv.h> #endif #include "kernel/kernel_color.h" -#ifndef __BSDF_HAIR_PRINCIPLED_H__ -# define __BSDF_HAIR_PRINCIPLED_H__ - CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct PrincipledHairExtra { @@ -181,12 +180,12 @@ ccl_device_inline float longitudinal_scattering( } /* Combine the three values using their luminances. */ -ccl_device_inline float4 combine_with_energy(KernelGlobals *kg, float3 c) +ccl_device_inline float4 combine_with_energy(const KernelGlobals *kg, float3 c) { return make_float4(c.x, c.y, c.z, linear_rgb_to_gray(kg, c)); } -# ifdef __HAIR__ +#ifdef __HAIR__ /* Set up the hair closure. 
*/ ccl_device int bsdf_principled_hair_setup(ShaderData *sd, PrincipledHairBSDF *bsdf) { @@ -226,10 +225,10 @@ ccl_device int bsdf_principled_hair_setup(ShaderData *sd, PrincipledHairBSDF *bs return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_NEEDS_LCG; } -# endif /* __HAIR__ */ +#endif /* __HAIR__ */ /* Given the Fresnel term and transmittance, generate the attenuation terms for each bounce. */ -ccl_device_inline void hair_attenuation(KernelGlobals *kg, float f, float3 T, float4 *Ap) +ccl_device_inline void hair_attenuation(const KernelGlobals *kg, float f, float3 T, float4 *Ap) { /* Primary specular (R). */ Ap[0] = make_float4(f, f, f, f); @@ -278,7 +277,7 @@ ccl_device_inline void hair_alpha_angles(float sin_theta_i, } /* Evaluation function for our shader. */ -ccl_device float3 bsdf_principled_hair_eval(KernelGlobals *kg, +ccl_device float3 bsdf_principled_hair_eval(const KernelGlobals *kg, const ShaderData *sd, const ShaderClosure *sc, const float3 omega_in, @@ -356,7 +355,7 @@ ccl_device float3 bsdf_principled_hair_eval(KernelGlobals *kg, } /* Sampling function for the hair shader. */ -ccl_device int bsdf_principled_hair_sample(KernelGlobals *kg, +ccl_device int bsdf_principled_hair_sample(const KernelGlobals *kg, const ShaderClosure *sc, ShaderData *sd, float randu, @@ -473,11 +472,11 @@ ccl_device int bsdf_principled_hair_sample(KernelGlobals *kg, *omega_in = X * sin_theta_i + Y * cos_theta_i * cosf(phi_i) + Z * cos_theta_i * sinf(phi_i); -# ifdef __RAY_DIFFERENTIALS__ +#ifdef __RAY_DIFFERENTIALS__ float3 N = safe_normalize(sd->I + *omega_in); *domega_in_dx = (2 * dot(N, sd->dI.dx)) * N - sd->dI.dx; *domega_in_dy = (2 * dot(N, sd->dI.dy)) * N - sd->dI.dy; -# endif +#endif return LABEL_GLOSSY | ((p == 0) ? 
LABEL_REFLECT : LABEL_TRANSMIT); } @@ -501,7 +500,7 @@ ccl_device_inline float bsdf_principled_hair_albedo_roughness_scale( return (((((0.245f * x) + 5.574f) * x - 10.73f) * x + 2.532f) * x - 0.215f) * x + 5.969f; } -ccl_device float3 bsdf_principled_hair_albedo(ShaderClosure *sc) +ccl_device float3 bsdf_principled_hair_albedo(const ShaderClosure *sc) { PrincipledHairBSDF *bsdf = (PrincipledHairBSDF *)sc; return exp3(-sqrt(bsdf->sigma) * bsdf_principled_hair_albedo_roughness_scale(bsdf->v)); @@ -523,5 +522,3 @@ ccl_device_inline float3 bsdf_principled_hair_sigma_from_concentration(const flo } CCL_NAMESPACE_END - -#endif /* __BSDF_HAIR_PRINCIPLED_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h index af03bab39f7..227cb448b47 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet.h @@ -30,8 +30,10 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __BSDF_MICROFACET_H__ -#define __BSDF_MICROFACET_H__ +#pragma once + +#include "kernel/kernel_lookup_table.h" +#include "kernel/kernel_random.h" CCL_NAMESPACE_BEGIN @@ -53,7 +55,7 @@ static_assert(sizeof(ShaderClosure) >= sizeof(MicrofacetBsdf), "MicrofacetBsdf i /* Beckmann and GGX microfacet importance sampling. 
*/ -ccl_device_inline void microfacet_beckmann_sample_slopes(KernelGlobals *kg, +ccl_device_inline void microfacet_beckmann_sample_slopes(const KernelGlobals *kg, const float cos_theta_i, const float sin_theta_i, float randu, @@ -193,7 +195,7 @@ ccl_device_inline void microfacet_ggx_sample_slopes(const float cos_theta_i, *slope_y = S * z * safe_sqrtf(1.0f + (*slope_x) * (*slope_x)); } -ccl_device_forceinline float3 microfacet_sample_stretched(KernelGlobals *kg, +ccl_device_forceinline float3 microfacet_sample_stretched(const KernelGlobals *kg, const float3 omega_i, const float alpha_x, const float alpha_y, @@ -352,21 +354,6 @@ ccl_device int bsdf_microfacet_ggx_clearcoat_setup(MicrofacetBsdf *bsdf, const S return SD_BSDF | SD_BSDF_HAS_EVAL; } -ccl_device bool bsdf_microfacet_merge(const ShaderClosure *a, const ShaderClosure *b) -{ - const MicrofacetBsdf *bsdf_a = (const MicrofacetBsdf *)a; - const MicrofacetBsdf *bsdf_b = (const MicrofacetBsdf *)b; - - return (isequal_float3(bsdf_a->N, bsdf_b->N)) && (bsdf_a->alpha_x == bsdf_b->alpha_x) && - (bsdf_a->alpha_y == bsdf_b->alpha_y) && (isequal_float3(bsdf_a->T, bsdf_b->T)) && - (bsdf_a->ior == bsdf_b->ior) && - ((bsdf_a->extra == NULL && bsdf_b->extra == NULL) || - ((bsdf_a->extra && bsdf_b->extra) && - (isequal_float3(bsdf_a->extra->color, bsdf_b->extra->color)) && - (isequal_float3(bsdf_a->extra->cspec0, bsdf_b->extra->cspec0)) && - (bsdf_a->extra->clearcoat == bsdf_b->extra->clearcoat))); -} - ccl_device int bsdf_microfacet_ggx_refraction_setup(MicrofacetBsdf *bsdf) { bsdf->extra = NULL; @@ -558,7 +545,7 @@ ccl_device float3 bsdf_microfacet_ggx_eval_transmit(const ShaderClosure *sc, return make_float3(out, out, out); } -ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, +ccl_device int bsdf_microfacet_ggx_sample(const KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, @@ -986,7 +973,7 @@ ccl_device float3 bsdf_microfacet_beckmann_eval_transmit(const ShaderClosure *sc return make_float3(out, 
out, out); } -ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, +ccl_device int bsdf_microfacet_beckmann_sample(const KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, @@ -1175,5 +1162,3 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, } CCL_NAMESPACE_END - -#endif /* __BSDF_MICROFACET_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h index 9795c8da065..68d5071dbce 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h @@ -14,6 +14,8 @@ * limitations under the License. */ +#pragma once + CCL_NAMESPACE_BEGIN /* Most of the code is based on the supplemental implementations from @@ -466,7 +468,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_eval_reflect(const ShaderClosure *sc bsdf->extra->cspec0); } -ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, +ccl_device int bsdf_microfacet_multi_ggx_sample(const KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, @@ -628,7 +630,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_reflect(const ShaderClosu bsdf->extra->cspec0); } -ccl_device int bsdf_microfacet_multi_ggx_glass_sample(KernelGlobals *kg, +ccl_device int bsdf_microfacet_multi_ggx_glass_sample(const KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, diff --git a/intern/cycles/kernel/closure/bsdf_oren_nayar.h b/intern/cycles/kernel/closure/bsdf_oren_nayar.h index 41e5736bf49..be12d47f0ea 100644 --- a/intern/cycles/kernel/closure/bsdf_oren_nayar.h +++ b/intern/cycles/kernel/closure/bsdf_oren_nayar.h @@ -14,8 +14,7 @@ * limitations under the License. 
*/ -#ifndef __BSDF_OREN_NAYAR_H__ -#define __BSDF_OREN_NAYAR_H__ +#pragma once CCL_NAMESPACE_BEGIN @@ -61,14 +60,6 @@ ccl_device int bsdf_oren_nayar_setup(OrenNayarBsdf *bsdf) return SD_BSDF | SD_BSDF_HAS_EVAL; } -ccl_device bool bsdf_oren_nayar_merge(const ShaderClosure *a, const ShaderClosure *b) -{ - const OrenNayarBsdf *bsdf_a = (const OrenNayarBsdf *)a; - const OrenNayarBsdf *bsdf_b = (const OrenNayarBsdf *)b; - - return (isequal_float3(bsdf_a->N, bsdf_b->N)) && (bsdf_a->roughness == bsdf_b->roughness); -} - ccl_device float3 bsdf_oren_nayar_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, @@ -127,5 +118,3 @@ ccl_device int bsdf_oren_nayar_sample(const ShaderClosure *sc, } CCL_NAMESPACE_END - -#endif /* __BSDF_OREN_NAYAR_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_phong_ramp.h b/intern/cycles/kernel/closure/bsdf_phong_ramp.h index cf5484383f2..43f8cf71c59 100644 --- a/intern/cycles/kernel/closure/bsdf_phong_ramp.h +++ b/intern/cycles/kernel/closure/bsdf_phong_ramp.h @@ -30,8 +30,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __BSDF_PHONG_RAMP_H__ -#define __BSDF_PHONG_RAMP_H__ +#pragma once CCL_NAMESPACE_BEGIN @@ -153,5 +152,3 @@ ccl_device int bsdf_phong_ramp_sample(const ShaderClosure *sc, #endif /* __OSL__ */ CCL_NAMESPACE_END - -#endif /* __BSDF_PHONG_RAMP_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h index d5d012068ff..a72af519482 100644 --- a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h +++ b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h @@ -14,14 +14,15 @@ * limitations under the License. 
*/ -#ifndef __BSDF_PRINCIPLED_DIFFUSE_H__ -#define __BSDF_PRINCIPLED_DIFFUSE_H__ +#pragma once /* DISNEY PRINCIPLED DIFFUSE BRDF * * Shading model by Brent Burley (Disney): "Physically Based Shading at Disney" (2012) */ +#include "kernel/closure/bsdf_util.h" + CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct PrincipledDiffuseBsdf { @@ -61,14 +62,6 @@ ccl_device int bsdf_principled_diffuse_setup(PrincipledDiffuseBsdf *bsdf) return SD_BSDF | SD_BSDF_HAS_EVAL; } -ccl_device bool bsdf_principled_diffuse_merge(const ShaderClosure *a, const ShaderClosure *b) -{ - const PrincipledDiffuseBsdf *bsdf_a = (const PrincipledDiffuseBsdf *)a; - const PrincipledDiffuseBsdf *bsdf_b = (const PrincipledDiffuseBsdf *)b; - - return (isequal_float3(bsdf_a->N, bsdf_b->N) && bsdf_a->roughness == bsdf_b->roughness); -} - ccl_device float3 bsdf_principled_diffuse_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, @@ -136,5 +129,3 @@ ccl_device int bsdf_principled_diffuse_sample(const ShaderClosure *sc, } CCL_NAMESPACE_END - -#endif /* __BSDF_PRINCIPLED_DIFFUSE_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_principled_sheen.h b/intern/cycles/kernel/closure/bsdf_principled_sheen.h index 3707de29d73..60ce7e4eb75 100644 --- a/intern/cycles/kernel/closure/bsdf_principled_sheen.h +++ b/intern/cycles/kernel/closure/bsdf_principled_sheen.h @@ -14,14 +14,15 @@ * limitations under the License. 
*/ -#ifndef __BSDF_PRINCIPLED_SHEEN_H__ -#define __BSDF_PRINCIPLED_SHEEN_H__ +#pragma once /* DISNEY PRINCIPLED SHEEN BRDF * * Shading model by Brent Burley (Disney): "Physically Based Shading at Disney" (2012) */ +#include "kernel/closure/bsdf_util.h" + CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct PrincipledSheenBsdf { @@ -137,5 +138,3 @@ ccl_device int bsdf_principled_sheen_sample(const ShaderClosure *sc, } CCL_NAMESPACE_END - -#endif /* __BSDF_PRINCIPLED_SHEEN_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_reflection.h b/intern/cycles/kernel/closure/bsdf_reflection.h index c24ba170915..31283971d5a 100644 --- a/intern/cycles/kernel/closure/bsdf_reflection.h +++ b/intern/cycles/kernel/closure/bsdf_reflection.h @@ -30,8 +30,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __BSDF_REFLECTION_H__ -#define __BSDF_REFLECTION_H__ +#pragma once CCL_NAMESPACE_BEGIN @@ -93,5 +92,3 @@ ccl_device int bsdf_reflection_sample(const ShaderClosure *sc, } CCL_NAMESPACE_END - -#endif /* __BSDF_REFLECTION_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_refraction.h b/intern/cycles/kernel/closure/bsdf_refraction.h index d4fbe86dac0..cfedb5dfe2c 100644 --- a/intern/cycles/kernel/closure/bsdf_refraction.h +++ b/intern/cycles/kernel/closure/bsdf_refraction.h @@ -30,8 +30,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __BSDF_REFRACTION_H__ -#define __BSDF_REFRACTION_H__ +#pragma once CCL_NAMESPACE_BEGIN @@ -111,5 +110,3 @@ ccl_device int bsdf_refraction_sample(const ShaderClosure *sc, } CCL_NAMESPACE_END - -#endif /* __BSDF_REFRACTION_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_toon.h b/intern/cycles/kernel/closure/bsdf_toon.h index cc5de21ed0e..acdafe0f735 100644 --- a/intern/cycles/kernel/closure/bsdf_toon.h +++ b/intern/cycles/kernel/closure/bsdf_toon.h @@ -30,8 +30,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __BSDF_TOON_H__ -#define __BSDF_TOON_H__ +#pragma once CCL_NAMESPACE_BEGIN @@ -55,15 +54,6 @@ ccl_device int bsdf_diffuse_toon_setup(ToonBsdf *bsdf) return SD_BSDF | SD_BSDF_HAS_EVAL; } -ccl_device bool bsdf_toon_merge(const ShaderClosure *a, const ShaderClosure *b) -{ - const ToonBsdf *bsdf_a = (const ToonBsdf *)a; - const ToonBsdf *bsdf_b = (const ToonBsdf *)b; - - return (isequal_float3(bsdf_a->N, bsdf_b->N)) && (bsdf_a->size == bsdf_b->size) && - (bsdf_a->smooth == bsdf_b->smooth); -} - ccl_device float3 bsdf_toon_get_intensity(float max_angle, float smooth, float angle) { float is; @@ -248,5 +238,3 @@ ccl_device int bsdf_glossy_toon_sample(const ShaderClosure *sc, } CCL_NAMESPACE_END - -#endif /* __BSDF_TOON_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_transparent.h b/intern/cycles/kernel/closure/bsdf_transparent.h index 4e5513499e8..f1dc7efb345 100644 --- a/intern/cycles/kernel/closure/bsdf_transparent.h +++ b/intern/cycles/kernel/closure/bsdf_transparent.h @@ -30,8 +30,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __BSDF_TRANSPARENT_H__ -#define __BSDF_TRANSPARENT_H__ +#pragma once CCL_NAMESPACE_BEGIN @@ -123,5 +122,3 @@ ccl_device int bsdf_transparent_sample(const ShaderClosure *sc, } CCL_NAMESPACE_END - -#endif /* __BSDF_TRANSPARENT_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_util.h b/intern/cycles/kernel/closure/bsdf_util.h index a73dee1b045..beec5f768a1 100644 --- a/intern/cycles/kernel/closure/bsdf_util.h +++ b/intern/cycles/kernel/closure/bsdf_util.h @@ -30,8 +30,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __BSDF_UTIL_H__ -#define __BSDF_UTIL_H__ +#pragma once CCL_NAMESPACE_BEGIN @@ -150,5 +149,3 @@ interpolate_fresnel_color(float3 L, float3 H, float ior, float F0, float3 cspec0 } CCL_NAMESPACE_END - -#endif /* __BSDF_UTIL_H__ */ diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h index 562daf1286d..0f9278bba89 100644 --- a/intern/cycles/kernel/closure/bssrdf.h +++ b/intern/cycles/kernel/closure/bssrdf.h @@ -14,8 +14,7 @@ * limitations under the License. */ -#ifndef __KERNEL_BSSRDF_H__ -#define __KERNEL_BSSRDF_H__ +#pragma once CCL_NAMESPACE_BEGIN @@ -24,310 +23,71 @@ typedef ccl_addr_space struct Bssrdf { float3 radius; float3 albedo; - float sharpness; - float texture_blur; float roughness; - float channels; + float anisotropy; } Bssrdf; static_assert(sizeof(ShaderClosure) >= sizeof(Bssrdf), "Bssrdf is too large!"); -/* Planar Truncated Gaussian - * - * Note how this is different from the typical gaussian, this one integrates - * to 1 over the plane (where you get an extra 2*pi*x factor). We are lucky - * that integrating x*exp(-x) gives a nice closed form solution. 
*/ - -/* paper suggests 1/12.46 which is much too small, suspect it's *12.46 */ -#define GAUSS_TRUNCATE 12.46f - -ccl_device float bssrdf_gaussian_eval(const float radius, float r) -{ - /* integrate (2*pi*r * exp(-r*r/(2*v)))/(2*pi*v)) from 0 to Rm - * = 1 - exp(-Rm*Rm/(2*v)) */ - const float v = radius * radius * (0.25f * 0.25f); - const float Rm = sqrtf(v * GAUSS_TRUNCATE); - - if (r >= Rm) - return 0.0f; - - return expf(-r * r / (2.0f * v)) / (2.0f * M_PI_F * v); -} - -ccl_device float bssrdf_gaussian_pdf(const float radius, float r) +ccl_device float bssrdf_dipole_compute_Rd(float alpha_prime, float fourthirdA) { - /* 1.0 - expf(-Rm*Rm/(2*v)) simplified */ - const float area_truncated = 1.0f - expf(-0.5f * GAUSS_TRUNCATE); - - return bssrdf_gaussian_eval(radius, r) * (1.0f / (area_truncated)); + float s = sqrtf(3.0f * (1.0f - alpha_prime)); + return 0.5f * alpha_prime * (1.0f + expf(-fourthirdA * s)) * expf(-s); } -ccl_device void bssrdf_gaussian_sample(const float radius, float xi, float *r, float *h) +ccl_device float bssrdf_dipole_compute_alpha_prime(float rd, float fourthirdA) { - /* xi = integrate (2*pi*r * exp(-r*r/(2*v)))/(2*pi*v)) = -exp(-r^2/(2*v)) - * r = sqrt(-2*v*logf(xi)) */ - const float v = radius * radius * (0.25f * 0.25f); - const float Rm = sqrtf(v * GAUSS_TRUNCATE); - - /* 1.0 - expf(-Rm*Rm/(2*v)) simplified */ - const float area_truncated = 1.0f - expf(-0.5f * GAUSS_TRUNCATE); - - /* r(xi) */ - const float r_squared = -2.0f * v * logf(1.0f - xi * area_truncated); - *r = sqrtf(r_squared); - - /* h^2 + r^2 = Rm^2 */ - *h = safe_sqrtf(Rm * Rm - r_squared); -} - -/* Planar Cubic BSSRDF falloff - * - * This is basically (Rm - x)^3, with some factors to normalize it. For sampling - * we integrate 2*pi*x * (Rm - x)^3, which gives us a quintic equation that as - * far as I can tell has no closed form solution. So we get an iterative solution - * instead with newton-raphson. 
*/ - -ccl_device float bssrdf_cubic_eval(const float radius, const float sharpness, float r) -{ - if (sharpness == 0.0f) { - const float Rm = radius; - - if (r >= Rm) - return 0.0f; - - /* integrate (2*pi*r * 10*(R - r)^3)/(pi * R^5) from 0 to R = 1 */ - const float Rm5 = (Rm * Rm) * (Rm * Rm) * Rm; - const float f = Rm - r; - const float num = f * f * f; - - return (10.0f * num) / (Rm5 * M_PI_F); + /* Little Newton solver. */ + if (rd < 1e-4f) { + return 0.0f; + } + if (rd >= 0.995f) { + return 0.999999f; } - else { - float Rm = radius * (1.0f + sharpness); - - if (r >= Rm) - return 0.0f; - /* custom variation with extra sharpness, to match the previous code */ - const float y = 1.0f / (1.0f + sharpness); - float Rmy, ry, ryinv; + float x0 = 0.0f; + float x1 = 1.0f; + float xmid, fmid; - if (sharpness == 1.0f) { - Rmy = sqrtf(Rm); - ry = sqrtf(r); - ryinv = (ry > 0.0f) ? 1.0f / ry : 0.0f; + constexpr const int max_num_iterations = 12; + for (int i = 0; i < max_num_iterations; ++i) { + xmid = 0.5f * (x0 + x1); + fmid = bssrdf_dipole_compute_Rd(xmid, fourthirdA); + if (fmid < rd) { + x0 = xmid; } else { - Rmy = powf(Rm, y); - ry = powf(r, y); - ryinv = (r > 0.0f) ? powf(r, y - 1.0f) : 0.0f; + x1 = xmid; } - - const float Rmy5 = (Rmy * Rmy) * (Rmy * Rmy) * Rmy; - const float f = Rmy - ry; - const float num = f * (f * f) * (y * ryinv); - - return (10.0f * num) / (Rmy5 * M_PI_F); - } -} - -ccl_device float bssrdf_cubic_pdf(const float radius, const float sharpness, float r) -{ - return bssrdf_cubic_eval(radius, sharpness, r); -} - -/* solve 10x^2 - 20x^3 + 15x^4 - 4x^5 - xi == 0 */ -ccl_device_forceinline float bssrdf_cubic_quintic_root_find(float xi) -{ - /* newton-raphson iteration, usually succeeds in 2-4 iterations, except - * outside 0.02 ... 
0.98 where it can go up to 10, so overall performance - * should not be too bad */ - const float tolerance = 1e-6f; - const int max_iteration_count = 10; - float x = 0.25f; - int i; - - for (i = 0; i < max_iteration_count; i++) { - float x2 = x * x; - float x3 = x2 * x; - float nx = (1.0f - x); - - float f = 10.0f * x2 - 20.0f * x3 + 15.0f * x2 * x2 - 4.0f * x2 * x3 - xi; - float f_ = 20.0f * (x * nx) * (nx * nx); - - if (fabsf(f) < tolerance || f_ == 0.0f) - break; - - x = saturate(x - f / f_); } - return x; + return xmid; } -ccl_device void bssrdf_cubic_sample( - const float radius, const float sharpness, float xi, float *r, float *h) +ccl_device void bssrdf_setup_radius(Bssrdf *bssrdf, const ClosureType type, const float eta) { - float Rm = radius; - float r_ = bssrdf_cubic_quintic_root_find(xi); - - if (sharpness != 0.0f) { - r_ = powf(r_, 1.0f + sharpness); - Rm *= (1.0f + sharpness); - } - - r_ *= Rm; - *r = r_; - - /* h^2 + r^2 = Rm^2 */ - *h = safe_sqrtf(Rm * Rm - r_ * r_); -} - -/* Approximate Reflectance Profiles - * http://graphics.pixar.com/library/ApproxBSSRDF/paper.pdf - */ - -/* This is a bit arbitrary, just need big enough radius so it matches - * the mean free length, but still not too big so sampling is still - * effective. Might need some further tweaks. - */ -#define BURLEY_TRUNCATE 16.0f -#define BURLEY_TRUNCATE_CDF 0.9963790093708328f // cdf(BURLEY_TRUNCATE) - -ccl_device_inline float bssrdf_burley_fitting(float A) -{ - /* Diffuse surface transmission, equation (6). */ - return 1.9f - A + 3.5f * (A - 0.8f) * (A - 0.8f); -} - -/* Scale mean free path length so it gives similar looking result - * to Cubic and Gaussian models. - */ -ccl_device_inline float3 bssrdf_burley_compatible_mfp(float3 r) -{ - return 0.25f * M_1_PI_F * r; -} - -ccl_device void bssrdf_burley_setup(Bssrdf *bssrdf) -{ - /* Mean free path length. */ - const float3 l = bssrdf_burley_compatible_mfp(bssrdf->radius); - /* Surface albedo. 
*/ - const float3 A = bssrdf->albedo; - const float3 s = make_float3( - bssrdf_burley_fitting(A.x), bssrdf_burley_fitting(A.y), bssrdf_burley_fitting(A.z)); - - bssrdf->radius = l / s; -} - -ccl_device float bssrdf_burley_eval(const float d, float r) -{ - const float Rm = BURLEY_TRUNCATE * d; - - if (r >= Rm) - return 0.0f; - - /* Burley reflectance profile, equation (3). - * - * NOTES: - * - Surface albedo is already included into sc->weight, no need to - * multiply by this term here. - * - This is normalized diffuse model, so the equation is multiplied - * by 2*pi, which also matches cdf(). - */ - float exp_r_3_d = expf(-r / (3.0f * d)); - float exp_r_d = exp_r_3_d * exp_r_3_d * exp_r_3_d; - return (exp_r_d + exp_r_3_d) / (4.0f * d); -} - -ccl_device float bssrdf_burley_pdf(const float d, float r) -{ - return bssrdf_burley_eval(d, r) * (1.0f / BURLEY_TRUNCATE_CDF); -} - -/* Find the radius for desired CDF value. - * Returns scaled radius, meaning the result is to be scaled up by d. - * Since there's no closed form solution we do Newton-Raphson method to find it. - */ -ccl_device_forceinline float bssrdf_burley_root_find(float xi) -{ - const float tolerance = 1e-6f; - const int max_iteration_count = 10; - /* Do initial guess based on manual curve fitting, this allows us to reduce - * number of iterations to maximum 4 across the [0..1] range. We keep maximum - * number of iteration higher just to be sure we didn't miss root in some - * corner case. - */ - float r; - if (xi <= 0.9f) { - r = expf(xi * xi * 2.4f) - 1.0f; + if (type == CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID) { + /* Scale mean free path length so it gives similar looking result to older + * Cubic, Gaussian and Burley models. */ + bssrdf->radius *= 0.25f * M_1_PI_F; } else { - /* TODO(sergey): Some nicer curve fit is possible here. */ - r = 15.0f; - } - /* Solve against scaled radius. 
*/ - for (int i = 0; i < max_iteration_count; i++) { - float exp_r_3 = expf(-r / 3.0f); - float exp_r = exp_r_3 * exp_r_3 * exp_r_3; - float f = 1.0f - 0.25f * exp_r - 0.75f * exp_r_3 - xi; - float f_ = 0.25f * exp_r + 0.25f * exp_r_3; + /* Adjust radius based on IOR and albedo. */ + const float inv_eta = 1.0f / eta; + const float F_dr = inv_eta * (-1.440f * inv_eta + 0.710f) + 0.668f + 0.0636f * eta; + const float fourthirdA = (4.0f / 3.0f) * (1.0f + F_dr) / + (1.0f - F_dr); /* From Jensen's Fdr ratio formula. */ - if (fabsf(f) < tolerance || f_ == 0.0f) { - break; - } + const float3 alpha_prime = make_float3( + bssrdf_dipole_compute_alpha_prime(bssrdf->albedo.x, fourthirdA), + bssrdf_dipole_compute_alpha_prime(bssrdf->albedo.y, fourthirdA), + bssrdf_dipole_compute_alpha_prime(bssrdf->albedo.z, fourthirdA)); - r = r - f / f_; - if (r < 0.0f) { - r = 0.0f; - } + bssrdf->radius *= sqrt(3.0f * (one_float3() - alpha_prime)); } - return r; } -ccl_device void bssrdf_burley_sample(const float d, float xi, float *r, float *h) -{ - const float Rm = BURLEY_TRUNCATE * d; - const float r_ = bssrdf_burley_root_find(xi * BURLEY_TRUNCATE_CDF) * d; - - *r = r_; - - /* h^2 + r^2 = Rm^2 */ - *h = safe_sqrtf(Rm * Rm - r_ * r_); -} - -/* None BSSRDF falloff - * - * Samples distributed over disk with no falloff, for reference. */ - -ccl_device float bssrdf_none_eval(const float radius, float r) -{ - const float Rm = radius; - return (r < Rm) ? 
1.0f : 0.0f; -} - -ccl_device float bssrdf_none_pdf(const float radius, float r) -{ - /* integrate (2*pi*r)/(pi*Rm*Rm) from 0 to Rm = 1 */ - const float Rm = radius; - const float area = (M_PI_F * Rm * Rm); - - return bssrdf_none_eval(radius, r) / area; -} - -ccl_device void bssrdf_none_sample(const float radius, float xi, float *r, float *h) -{ - /* xi = integrate (2*pi*r)/(pi*Rm*Rm) = r^2/Rm^2 - * r = sqrt(xi)*Rm */ - const float Rm = radius; - const float r_ = sqrtf(xi) * Rm; - - *r = r_; - - /* h^2 + r^2 = Rm^2 */ - *h = safe_sqrtf(Rm * Rm - r_ * r_); -} - -/* Generic */ +/* Setup */ ccl_device_inline Bssrdf *bssrdf_alloc(ShaderData *sd, float3 weight) { @@ -342,7 +102,7 @@ ccl_device_inline Bssrdf *bssrdf_alloc(ShaderData *sd, float3 weight) return (sample_weight >= CLOSURE_WEIGHT_CUTOFF) ? bssrdf : NULL; } -ccl_device int bssrdf_setup(ShaderData *sd, Bssrdf *bssrdf, ClosureType type) +ccl_device int bssrdf_setup(ShaderData *sd, Bssrdf *bssrdf, ClosureType type, const float ior) { int flag = 0; int bssrdf_channels = 3; @@ -371,7 +131,7 @@ ccl_device int bssrdf_setup(ShaderData *sd, Bssrdf *bssrdf, ClosureType type) if (bssrdf_channels < 3) { /* Add diffuse BSDF if any radius too small. */ #ifdef __PRINCIPLED__ - if (type == CLOSURE_BSSRDF_PRINCIPLED_ID || type == CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID) { + if (bssrdf->roughness != FLT_MAX) { float roughness = bssrdf->roughness; float3 N = bssrdf->N; @@ -401,16 +161,9 @@ ccl_device int bssrdf_setup(ShaderData *sd, Bssrdf *bssrdf, ClosureType type) /* Setup BSSRDF if radius is large enough. 
*/ if (bssrdf_channels > 0) { bssrdf->type = type; - bssrdf->channels = bssrdf_channels; - bssrdf->sample_weight = fabsf(average(bssrdf->weight)) * bssrdf->channels; - bssrdf->texture_blur = saturate(bssrdf->texture_blur); - bssrdf->sharpness = saturate(bssrdf->sharpness); + bssrdf->sample_weight = fabsf(average(bssrdf->weight)) * bssrdf_channels; - if (type == CLOSURE_BSSRDF_BURLEY_ID || type == CLOSURE_BSSRDF_PRINCIPLED_ID || - type == CLOSURE_BSSRDF_RANDOM_WALK_ID || - type == CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID) { - bssrdf_burley_setup(bssrdf); - } + bssrdf_setup_radius(bssrdf, type, ior); flag |= SD_BSSRDF; } @@ -422,77 +175,4 @@ ccl_device int bssrdf_setup(ShaderData *sd, Bssrdf *bssrdf, ClosureType type) return flag; } -ccl_device void bssrdf_sample(const ShaderClosure *sc, float xi, float *r, float *h) -{ - const Bssrdf *bssrdf = (const Bssrdf *)sc; - float radius; - - /* Sample color channel and reuse random number. Only a subset of channels - * may be used if their radius was too small to handle as BSSRDF. */ - xi *= bssrdf->channels; - - if (xi < 1.0f) { - radius = (bssrdf->radius.x > 0.0f) ? bssrdf->radius.x : - (bssrdf->radius.y > 0.0f) ? bssrdf->radius.y : - bssrdf->radius.z; - } - else if (xi < 2.0f) { - xi -= 1.0f; - radius = (bssrdf->radius.x > 0.0f && bssrdf->radius.y > 0.0f) ? bssrdf->radius.y : - bssrdf->radius.z; - } - else { - xi -= 2.0f; - radius = bssrdf->radius.z; - } - - /* Sample BSSRDF. 
*/ - if (bssrdf->type == CLOSURE_BSSRDF_CUBIC_ID) { - bssrdf_cubic_sample(radius, bssrdf->sharpness, xi, r, h); - } - else if (bssrdf->type == CLOSURE_BSSRDF_GAUSSIAN_ID) { - bssrdf_gaussian_sample(radius, xi, r, h); - } - else { /* if (bssrdf->type == CLOSURE_BSSRDF_BURLEY_ID || - * bssrdf->type == CLOSURE_BSSRDF_PRINCIPLED_ID) */ - bssrdf_burley_sample(radius, xi, r, h); - } -} - -ccl_device float bssrdf_channel_pdf(const Bssrdf *bssrdf, float radius, float r) -{ - if (radius == 0.0f) { - return 0.0f; - } - else if (bssrdf->type == CLOSURE_BSSRDF_CUBIC_ID) { - return bssrdf_cubic_pdf(radius, bssrdf->sharpness, r); - } - else if (bssrdf->type == CLOSURE_BSSRDF_GAUSSIAN_ID) { - return bssrdf_gaussian_pdf(radius, r); - } - else { /* if (bssrdf->type == CLOSURE_BSSRDF_BURLEY_ID || - * bssrdf->type == CLOSURE_BSSRDF_PRINCIPLED_ID)*/ - return bssrdf_burley_pdf(radius, r); - } -} - -ccl_device_forceinline float3 bssrdf_eval(const ShaderClosure *sc, float r) -{ - const Bssrdf *bssrdf = (const Bssrdf *)sc; - - return make_float3(bssrdf_channel_pdf(bssrdf, bssrdf->radius.x, r), - bssrdf_channel_pdf(bssrdf, bssrdf->radius.y, r), - bssrdf_channel_pdf(bssrdf, bssrdf->radius.z, r)); -} - -ccl_device_forceinline float bssrdf_pdf(const ShaderClosure *sc, float r) -{ - const Bssrdf *bssrdf = (const Bssrdf *)sc; - float3 pdf = bssrdf_eval(sc, r); - - return (pdf.x + pdf.y + pdf.z) / bssrdf->channels; -} - CCL_NAMESPACE_END - -#endif /* __KERNEL_BSSRDF_H__ */ diff --git a/intern/cycles/kernel/closure/emissive.h b/intern/cycles/kernel/closure/emissive.h index 911382e6865..a2519d97618 100644 --- a/intern/cycles/kernel/closure/emissive.h +++ b/intern/cycles/kernel/closure/emissive.h @@ -30,6 +30,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#pragma once + CCL_NAMESPACE_BEGIN /* BACKGROUND CLOSURE */ diff --git a/intern/cycles/kernel/closure/volume.h b/intern/cycles/kernel/closure/volume.h index 1430f712701..69959a3f21b 100644 --- a/intern/cycles/kernel/closure/volume.h +++ b/intern/cycles/kernel/closure/volume.h @@ -14,8 +14,7 @@ * limitations under the License. */ -#ifndef __VOLUME_H__ -#define __VOLUME_H__ +#pragma once CCL_NAMESPACE_BEGIN @@ -62,21 +61,12 @@ ccl_device int volume_henyey_greenstein_setup(HenyeyGreensteinVolume *volume) return SD_SCATTER; } -ccl_device bool volume_henyey_greenstein_merge(const ShaderClosure *a, const ShaderClosure *b) -{ - const HenyeyGreensteinVolume *volume_a = (const HenyeyGreensteinVolume *)a; - const HenyeyGreensteinVolume *volume_b = (const HenyeyGreensteinVolume *)b; - - return (volume_a->g == volume_b->g); -} - -ccl_device float3 volume_henyey_greenstein_eval_phase(const ShaderClosure *sc, +ccl_device float3 volume_henyey_greenstein_eval_phase(const ShaderVolumeClosure *svc, const float3 I, float3 omega_in, float *pdf) { - const HenyeyGreensteinVolume *volume = (const HenyeyGreensteinVolume *)sc; - float g = volume->g; + float g = svc->g; /* note that I points towards the viewer */ if (fabsf(g) < 1e-3f) { @@ -122,7 +112,7 @@ henyey_greenstrein_sample(float3 D, float g, float randu, float randv, float *pd return dir; } -ccl_device int volume_henyey_greenstein_sample(const ShaderClosure *sc, +ccl_device int volume_henyey_greenstein_sample(const ShaderVolumeClosure *svc, float3 I, float3 dIdx, float3 dIdy, @@ -134,8 +124,7 @@ ccl_device int volume_henyey_greenstein_sample(const ShaderClosure *sc, float3 *domega_in_dy, float *pdf) { - const HenyeyGreensteinVolume *volume = (const HenyeyGreensteinVolume *)sc; - float g = volume->g; + float g = svc->g; /* note that I points towards the viewer and so is used negated */ *omega_in = henyey_greenstrein_sample(-I, g, randu, randv, pdf); @@ -153,17 +142,15 @@ ccl_device int volume_henyey_greenstein_sample(const 
ShaderClosure *sc, /* VOLUME CLOSURE */ ccl_device float3 volume_phase_eval(const ShaderData *sd, - const ShaderClosure *sc, + const ShaderVolumeClosure *svc, float3 omega_in, float *pdf) { - kernel_assert(sc->type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID); - - return volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf); + return volume_henyey_greenstein_eval_phase(svc, sd->I, omega_in, pdf); } ccl_device int volume_phase_sample(const ShaderData *sd, - const ShaderClosure *sc, + const ShaderVolumeClosure *svc, float randu, float randv, float3 *eval, @@ -171,31 +158,65 @@ ccl_device int volume_phase_sample(const ShaderData *sd, differential3 *domega_in, float *pdf) { - int label; - - switch (sc->type) { - case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: - label = volume_henyey_greenstein_sample(sc, - sd->I, - sd->dI.dx, - sd->dI.dy, - randu, - randv, - eval, - omega_in, - &domega_in->dx, - &domega_in->dy, - pdf); - break; - default: - *eval = make_float3(0.0f, 0.0f, 0.0f); - label = LABEL_NONE; - break; + return volume_henyey_greenstein_sample(svc, + sd->I, + sd->dI.dx, + sd->dI.dy, + randu, + randv, + eval, + omega_in, + &domega_in->dx, + &domega_in->dy, + pdf); +} + +/* Volume sampling utilities. */ + +/* todo: this value could be tweaked or turned into a probability to avoid + * unnecessary work in volumes and subsurface scattering. */ +#define VOLUME_THROUGHPUT_EPSILON 1e-6f + +ccl_device float3 volume_color_transmittance(float3 sigma, float t) +{ + return exp3(-sigma * t); +} + +ccl_device float volume_channel_get(float3 value, int channel) +{ + return (channel == 0) ? value.x : ((channel == 1) ? value.y : value.z); +} + +ccl_device int volume_sample_channel(float3 albedo, float3 throughput, float rand, float3 *pdf) +{ + /* Sample color channel proportional to throughput and single scattering + * albedo, to significantly reduce noise with many bounce, following: + * + * "Practical and Controllable Subsurface Scattering for Production Path + * Tracing". 
Matt Jen-Yuan Chiang, Peter Kutz, Brent Burley. SIGGRAPH 2016. */ + float3 weights = fabs(throughput * albedo); + float sum_weights = weights.x + weights.y + weights.z; + float3 weights_pdf; + + if (sum_weights > 0.0f) { + weights_pdf = weights / sum_weights; } + else { + weights_pdf = make_float3(1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f); + } + + *pdf = weights_pdf; - return label; + /* OpenCL does not support -> on float3, so don't use pdf->x. */ + if (rand < weights_pdf.x) { + return 0; + } + else if (rand < weights_pdf.x + weights_pdf.y) { + return 1; + } + else { + return 2; + } } CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/device/cpu/compat.h index 88f6a264a5a..bfd936c7bbd 100644 --- a/intern/cycles/kernel/kernel_compat_cpu.h +++ b/intern/cycles/kernel/device/cpu/compat.h @@ -14,8 +14,7 @@ * limitations under the License. */ -#ifndef __KERNEL_COMPAT_CPU_H__ -#define __KERNEL_COMPAT_CPU_H__ +#pragma once #define __KERNEL_CPU__ @@ -27,14 +26,6 @@ # pragma GCC diagnostic ignored "-Wuninitialized" #endif -/* Selective nodes compilation. */ -#ifndef __NODES_MAX_GROUP__ -# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX -#endif -#ifndef __NODES_FEATURES__ -# define __NODES_FEATURES__ NODE_FEATURE_ALL -#endif - #include "util/util_half.h" #include "util/util_math.h" #include "util/util_simd.h" @@ -43,15 +34,6 @@ #define ccl_addr_space -#define ccl_local_id(d) 0 -#define ccl_global_id(d) (kg->global_id[d]) - -#define ccl_local_size(d) 1 -#define ccl_global_size(d) (kg->global_size[d]) - -#define ccl_group_id(d) ccl_global_id(d) -#define ccl_num_groups(d) ccl_global_size(d) - /* On x86_64, versions of glibc < 2.16 have an issue where expf is * much slower than the double version. This was fixed in glibc 2.16. */ @@ -72,37 +54,11 @@ CCL_NAMESPACE_BEGIN * simple arrays and after inlining fetch hopefully revert to being a simple * pointer lookup. 
*/ template<typename T> struct texture { - ccl_always_inline const T &fetch(int index) + ccl_always_inline const T &fetch(int index) const { kernel_assert(index >= 0 && index < width); return data[index]; } -#if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__) - /* Reads 256 bytes but indexes in blocks of 128 bytes to maintain - * compatibility with existing indices and data structures. - */ - ccl_always_inline avxf fetch_avxf(const int index) - { - kernel_assert(index >= 0 && (index + 1) < width); - ssef *ssef_data = (ssef *)data; - ssef *ssef_node_data = &ssef_data[index]; - return _mm256_loadu_ps((float *)ssef_node_data); - } -#endif - -#ifdef __KERNEL_SSE2__ - ccl_always_inline ssef fetch_ssef(int index) - { - kernel_assert(index >= 0 && index < width); - return ((ssef *)data)[index]; - } - - ccl_always_inline ssei fetch_ssei(int index) - { - kernel_assert(index >= 0 && index < width); - return ((ssei *)data)[index]; - } -#endif T *data; int width; @@ -110,15 +66,6 @@ template<typename T> struct texture { /* Macros to handle different memory storage on different devices */ -#define kernel_tex_fetch(tex, index) (kg->tex.fetch(index)) -#define kernel_tex_fetch_avxf(tex, index) (kg->tex.fetch_avxf(index)) -#define kernel_tex_fetch_ssef(tex, index) (kg->tex.fetch_ssef(index)) -#define kernel_tex_fetch_ssei(tex, index) (kg->tex.fetch_ssei(index)) -#define kernel_tex_lookup(tex, t, offset, size) (kg->tex.lookup(t, offset, size)) -#define kernel_tex_array(tex) (kg->tex.data) - -#define kernel_data (kg->__data) - #ifdef __KERNEL_SSE2__ typedef vector3<sseb> sse3b; typedef vector3<ssef> sse3f; @@ -152,5 +99,3 @@ typedef vector3<avxf> avx3f; #endif CCL_NAMESPACE_END - -#endif /* __KERNEL_COMPAT_CPU_H__ */ diff --git a/intern/cycles/kernel/device/cpu/globals.h b/intern/cycles/kernel/device/cpu/globals.h new file mode 100644 index 00000000000..98b036e269d --- /dev/null +++ b/intern/cycles/kernel/device/cpu/globals.h @@ -0,0 +1,61 @@ +/* + * Copyright 2011-2013 Blender 
Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Constant Globals */ + +#pragma once + +#include "kernel/kernel_profiling.h" +#include "kernel/kernel_types.h" + +CCL_NAMESPACE_BEGIN + +/* On the CPU, we pass along the struct KernelGlobals to nearly everywhere in + * the kernel, to access constant data. These are all stored as "textures", but + * these are really just standard arrays. We can't use actually globals because + * multiple renders may be running inside the same process. */ + +#ifdef __OSL__ +struct OSLGlobals; +struct OSLThreadData; +struct OSLShadingSystem; +#endif + +typedef struct KernelGlobals { +#define KERNEL_TEX(type, name) texture<type> name; +#include "kernel/kernel_textures.h" + + KernelData __data; + +#ifdef __OSL__ + /* On the CPU, we also have the OSL globals here. Most data structures are shared + * with SVM, the difference is in the shaders and object/mesh attributes. 
*/ + OSLGlobals *osl; + OSLShadingSystem *osl_ss; + OSLThreadData *osl_tdata; +#endif + + /* **** Run-time data **** */ + + ProfilingState profiler; +} KernelGlobals; + +/* Abstraction macros */ +#define kernel_tex_fetch(tex, index) (kg->tex.fetch(index)) +#define kernel_tex_array(tex) (kg->tex.data) +#define kernel_data (kg->__data) + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h b/intern/cycles/kernel/device/cpu/image.h index 59b96c86c50..57e81ab186d 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h +++ b/intern/cycles/kernel/device/cpu/image.h @@ -14,8 +14,7 @@ * limitations under the License. */ -#ifndef __KERNEL_CPU_IMAGE_H__ -#define __KERNEL_CPU_IMAGE_H__ +#pragma once #ifdef WITH_NANOVDB # define NANOVDB_USE_INTRINSICS @@ -584,7 +583,7 @@ template<typename T> struct NanoVDBInterpolator { #undef SET_CUBIC_SPLINE_WEIGHTS -ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y) +ccl_device float4 kernel_tex_image_interp(const KernelGlobals *kg, int id, float x, float y) { const TextureInfo &info = kernel_tex_fetch(__texture_info, id); @@ -612,7 +611,7 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl } } -ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, +ccl_device float4 kernel_tex_image_interp_3d(const KernelGlobals *kg, int id, float3 P, InterpolationType interp) @@ -656,5 +655,3 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, } /* Namespace. 
*/ CCL_NAMESPACE_END - -#endif // __KERNEL_CPU_IMAGE_H__ diff --git a/intern/cycles/kernel/kernels/cpu/kernel.cpp b/intern/cycles/kernel/device/cpu/kernel.cpp index 8040bfb7b33..ac1cdf5fffe 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel.cpp +++ b/intern/cycles/kernel/device/cpu/kernel.cpp @@ -56,9 +56,9 @@ /* do nothing */ #endif -#include "kernel/kernel.h" +#include "kernel/device/cpu/kernel.h" #define KERNEL_ARCH cpu -#include "kernel/kernels/cpu/kernel_cpu_impl.h" +#include "kernel/device/cpu/kernel_arch_impl.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/device/cpu/kernel.h index b907c6a2bac..ae2a841835a 100644 --- a/intern/cycles/kernel/kernel.h +++ b/intern/cycles/kernel/device/cpu/kernel.h @@ -14,50 +14,49 @@ * limitations under the License. */ -#ifndef __KERNEL_H__ -#define __KERNEL_H__ +#pragma once /* CPU Kernel Interface */ -#include "kernel/kernel_types.h" #include "util/util_types.h" +#include "kernel/kernel_types.h" + CCL_NAMESPACE_BEGIN #define KERNEL_NAME_JOIN(x, y, z) x##_##y##_##z #define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name) #define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name) +struct IntegratorStateCPU; struct KernelGlobals; struct KernelData; KernelGlobals *kernel_globals_create(); void kernel_globals_free(KernelGlobals *kg); -void *kernel_osl_memory(KernelGlobals *kg); -bool kernel_osl_use(KernelGlobals *kg); +void *kernel_osl_memory(const KernelGlobals *kg); +bool kernel_osl_use(const KernelGlobals *kg); void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t size); void kernel_global_memory_copy(KernelGlobals *kg, const char *name, void *mem, size_t size); #define KERNEL_ARCH cpu -#include "kernel/kernels/cpu/kernel_cpu.h" +#include "kernel/device/cpu/kernel_arch.h" #define KERNEL_ARCH cpu_sse2 -#include "kernel/kernels/cpu/kernel_cpu.h" +#include "kernel/device/cpu/kernel_arch.h" #define KERNEL_ARCH cpu_sse3 -#include 
"kernel/kernels/cpu/kernel_cpu.h" +#include "kernel/device/cpu/kernel_arch.h" #define KERNEL_ARCH cpu_sse41 -#include "kernel/kernels/cpu/kernel_cpu.h" +#include "kernel/device/cpu/kernel_arch.h" #define KERNEL_ARCH cpu_avx -#include "kernel/kernels/cpu/kernel_cpu.h" +#include "kernel/device/cpu/kernel_arch.h" #define KERNEL_ARCH cpu_avx2 -#include "kernel/kernels/cpu/kernel_cpu.h" +#include "kernel/device/cpu/kernel_arch.h" CCL_NAMESPACE_END - -#endif /* __KERNEL_H__ */ diff --git a/intern/cycles/kernel/device/cpu/kernel_arch.h b/intern/cycles/kernel/device/cpu/kernel_arch.h new file mode 100644 index 00000000000..81f328c710b --- /dev/null +++ b/intern/cycles/kernel/device/cpu/kernel_arch.h @@ -0,0 +1,113 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Templated common declaration part of all CPU kernels. */ + +/* -------------------------------------------------------------------- + * Integrator. 
+ */ + +#define KERNEL_INTEGRATOR_FUNCTION(name) \ + void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \ + IntegratorStateCPU *state) + +#define KERNEL_INTEGRATOR_SHADE_FUNCTION(name) \ + void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \ + IntegratorStateCPU *state, \ + ccl_global float *render_buffer) + +#define KERNEL_INTEGRATOR_INIT_FUNCTION(name) \ + bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \ + IntegratorStateCPU *state, \ + KernelWorkTile *tile, \ + ccl_global float *render_buffer) + +KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_camera); +KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_bake); +KERNEL_INTEGRATOR_FUNCTION(intersect_closest); +KERNEL_INTEGRATOR_FUNCTION(intersect_shadow); +KERNEL_INTEGRATOR_FUNCTION(intersect_subsurface); +KERNEL_INTEGRATOR_FUNCTION(intersect_volume_stack); +KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_background); +KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_light); +KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_shadow); +KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_surface); +KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_volume); +KERNEL_INTEGRATOR_SHADE_FUNCTION(megakernel); + +#undef KERNEL_INTEGRATOR_FUNCTION +#undef KERNEL_INTEGRATOR_INIT_FUNCTION +#undef KERNEL_INTEGRATOR_SHADE_FUNCTION + +/* -------------------------------------------------------------------- + * Shader evaluation. + */ + +void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobals *kg, + const KernelShaderEvalInput *input, + float4 *output, + const int offset); +void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg, + const KernelShaderEvalInput *input, + float4 *output, + const int offset); + +/* -------------------------------------------------------------------- + * Adaptive sampling. 
+ */ + +bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)( + const KernelGlobals *kg, + ccl_global float *render_buffer, + int x, + int y, + float threshold, + bool reset, + int offset, + int stride); + +void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobals *kg, + ccl_global float *render_buffer, + int y, + int start_x, + int width, + int offset, + int stride); +void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobals *kg, + ccl_global float *render_buffer, + int x, + int start_y, + int height, + int offset, + int stride); + +/* -------------------------------------------------------------------- + * Cryptomatte. + */ + +void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobals *kg, + ccl_global float *render_buffer, + int pixel_index); + +/* -------------------------------------------------------------------- + * Bake. + */ +/* TODO(sergey): Needs to be re-implemented. Or not? Brecht did it already :) */ + +void KERNEL_FUNCTION_FULL_NAME(bake)( + const KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride); + +#undef KERNEL_ARCH diff --git a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h new file mode 100644 index 00000000000..1432abfd330 --- /dev/null +++ b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h @@ -0,0 +1,235 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Templated common implementation part of all CPU kernels. + * + * The idea is that particular .cpp files sets needed optimization flags and + * simply includes this file without worry of copying actual implementation over. + */ + +#pragma once + +// clang-format off +#include "kernel/device/cpu/compat.h" + +#ifndef KERNEL_STUB +# include "kernel/device/cpu/globals.h" +# include "kernel/device/cpu/image.h" + +# include "kernel/integrator/integrator_state.h" +# include "kernel/integrator/integrator_state_flow.h" +# include "kernel/integrator/integrator_state_util.h" + +# include "kernel/integrator/integrator_init_from_camera.h" +# include "kernel/integrator/integrator_init_from_bake.h" +# include "kernel/integrator/integrator_intersect_closest.h" +# include "kernel/integrator/integrator_intersect_shadow.h" +# include "kernel/integrator/integrator_intersect_subsurface.h" +# include "kernel/integrator/integrator_intersect_volume_stack.h" +# include "kernel/integrator/integrator_shade_background.h" +# include "kernel/integrator/integrator_shade_light.h" +# include "kernel/integrator/integrator_shade_shadow.h" +# include "kernel/integrator/integrator_shade_surface.h" +# include "kernel/integrator/integrator_shade_volume.h" +# include "kernel/integrator/integrator_megakernel.h" + +# include "kernel/kernel_film.h" +# include "kernel/kernel_adaptive_sampling.h" +# include "kernel/kernel_bake.h" +# include "kernel/kernel_id_passes.h" + +#else +# define STUB_ASSERT(arch, name) \ + assert(!(#name " kernel stub for architecture " #arch " was called!")) +#endif /* KERNEL_STUB */ +// clang-format on + +CCL_NAMESPACE_BEGIN + +/* -------------------------------------------------------------------- + * Integrator. + */ + +#ifdef KERNEL_STUB +# define KERNEL_INVOKE(name, ...) (STUB_ASSERT(KERNEL_ARCH, name), 0) +#else +# define KERNEL_INVOKE(name, ...) 
integrator_##name(__VA_ARGS__) +#endif + +#define DEFINE_INTEGRATOR_KERNEL(name) \ + void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *kg, \ + IntegratorStateCPU *state) \ + { \ + KERNEL_INVOKE(name, kg, state); \ + } + +#define DEFINE_INTEGRATOR_SHADE_KERNEL(name) \ + void KERNEL_FUNCTION_FULL_NAME(integrator_##name)( \ + const KernelGlobals *kg, IntegratorStateCPU *state, ccl_global float *render_buffer) \ + { \ + KERNEL_INVOKE(name, kg, state, render_buffer); \ + } + +/* TODO: Either use something like get_work_pixel(), or simplify tile which is passed here, so + * that it does not contain unused fields. */ +#define DEFINE_INTEGRATOR_INIT_KERNEL(name) \ + bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *kg, \ + IntegratorStateCPU *state, \ + KernelWorkTile *tile, \ + ccl_global float *render_buffer) \ + { \ + return KERNEL_INVOKE( \ + name, kg, state, tile, render_buffer, tile->x, tile->y, tile->start_sample); \ + } + +DEFINE_INTEGRATOR_INIT_KERNEL(init_from_camera) +DEFINE_INTEGRATOR_INIT_KERNEL(init_from_bake) +DEFINE_INTEGRATOR_KERNEL(intersect_closest) +DEFINE_INTEGRATOR_KERNEL(intersect_shadow) +DEFINE_INTEGRATOR_KERNEL(intersect_subsurface) +DEFINE_INTEGRATOR_KERNEL(intersect_volume_stack) +DEFINE_INTEGRATOR_SHADE_KERNEL(shade_background) +DEFINE_INTEGRATOR_SHADE_KERNEL(shade_light) +DEFINE_INTEGRATOR_SHADE_KERNEL(shade_shadow) +DEFINE_INTEGRATOR_SHADE_KERNEL(shade_surface) +DEFINE_INTEGRATOR_SHADE_KERNEL(shade_volume) +DEFINE_INTEGRATOR_SHADE_KERNEL(megakernel) + +/* -------------------------------------------------------------------- + * Shader evaluation. 
+ */ + +void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg, + const KernelShaderEvalInput *input, + float4 *output, + const int offset) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, shader_eval_displace); +#else + kernel_displace_evaluate(kg, input, output, offset); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobals *kg, + const KernelShaderEvalInput *input, + float4 *output, + const int offset) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, shader_eval_background); +#else + kernel_background_evaluate(kg, input, output, offset); +#endif +} + +/* -------------------------------------------------------------------- + * Adaptive sampling. + */ + +bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)( + const KernelGlobals *kg, + ccl_global float *render_buffer, + int x, + int y, + float threshold, + bool reset, + int offset, + int stride) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_convergence_check); + return false; +#else + return kernel_adaptive_sampling_convergence_check( + kg, render_buffer, x, y, threshold, reset, offset, stride); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobals *kg, + ccl_global float *render_buffer, + int y, + int start_x, + int width, + int offset, + int stride) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_filter_x); +#else + kernel_adaptive_sampling_filter_x(kg, render_buffer, y, start_x, width, offset, stride); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobals *kg, + ccl_global float *render_buffer, + int x, + int start_y, + int height, + int offset, + int stride) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_filter_y); +#else + kernel_adaptive_sampling_filter_y(kg, render_buffer, x, start_y, height, offset, stride); +#endif +} + +/* -------------------------------------------------------------------- + 
* Cryptomatte. + */ + +void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobals *kg, + ccl_global float *render_buffer, + int pixel_index) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, cryptomatte_postprocess); +#else + kernel_cryptomatte_post(kg, render_buffer, pixel_index); +#endif +} + +/* -------------------------------------------------------------------- + * Bake. + */ +/* TODO(sergey): Needs to be re-implemented. Or not? Brecht did it already :) */ + +void KERNEL_FUNCTION_FULL_NAME(bake)( + const KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride) +{ +#if 0 +# ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, bake); +# else +# ifdef __BAKING__ + kernel_bake_evaluate(kg, buffer, sample, x, y, offset, stride); +# endif +# endif /* KERNEL_STUB */ +#endif +} + +#undef KERNEL_INVOKE +#undef DEFINE_INTEGRATOR_KERNEL +#undef DEFINE_INTEGRATOR_SHADE_KERNEL +#undef DEFINE_INTEGRATOR_INIT_KERNEL + +#undef KERNEL_STUB +#undef STUB_ASSERT +#undef KERNEL_ARCH + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp b/intern/cycles/kernel/device/cpu/kernel_avx.cpp index 5f6b6800363..220768036ab 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp +++ b/intern/cycles/kernel/device/cpu/kernel_avx.cpp @@ -34,6 +34,6 @@ # endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ -#include "kernel/kernel.h" +#include "kernel/device/cpu/kernel.h" #define KERNEL_ARCH cpu_avx -#include "kernel/kernels/cpu/kernel_cpu_impl.h" +#include "kernel/device/cpu/kernel_arch_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp b/intern/cycles/kernel/device/cpu/kernel_avx2.cpp index 97e8fc25140..90c05113cbe 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp +++ b/intern/cycles/kernel/device/cpu/kernel_avx2.cpp @@ -35,6 +35,6 @@ # endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ -#include "kernel/kernel.h" +#include "kernel/device/cpu/kernel.h" #define KERNEL_ARCH cpu_avx2 -#include 
"kernel/kernels/cpu/kernel_cpu_impl.h" +#include "kernel/device/cpu/kernel_arch_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp b/intern/cycles/kernel/device/cpu/kernel_sse2.cpp index 26d7fd4de48..fb85ef5b0d0 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp +++ b/intern/cycles/kernel/device/cpu/kernel_sse2.cpp @@ -29,6 +29,6 @@ # endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ -#include "kernel/kernel.h" +#include "kernel/device/cpu/kernel.h" #define KERNEL_ARCH cpu_sse2 -#include "kernel/kernels/cpu/kernel_cpu_impl.h" +#include "kernel/device/cpu/kernel_arch_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp b/intern/cycles/kernel/device/cpu/kernel_sse3.cpp index 3f259aa4480..87baf04258a 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp +++ b/intern/cycles/kernel/device/cpu/kernel_sse3.cpp @@ -31,6 +31,6 @@ # endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ -#include "kernel/kernel.h" +#include "kernel/device/cpu/kernel.h" #define KERNEL_ARCH cpu_sse3 -#include "kernel/kernels/cpu/kernel_cpu_impl.h" +#include "kernel/device/cpu/kernel_arch_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp b/intern/cycles/kernel/device/cpu/kernel_sse41.cpp index 68bae8c07c6..bb421d58815 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp +++ b/intern/cycles/kernel/device/cpu/kernel_sse41.cpp @@ -32,6 +32,6 @@ # endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ -#include "kernel/kernel.h" +#include "kernel/device/cpu/kernel.h" #define KERNEL_ARCH cpu_sse41 -#include "kernel/kernels/cpu/kernel_cpu_impl.h" +#include "kernel/device/cpu/kernel_arch_impl.h" diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/device/cuda/compat.h index ea3b78b7cef..665da43e1a1 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/device/cuda/compat.h @@ -14,20 +14,15 @@ * limitations under the License. 
*/ -#ifndef __KERNEL_COMPAT_CUDA_H__ -#define __KERNEL_COMPAT_CUDA_H__ +#pragma once #define __KERNEL_GPU__ #define __KERNEL_CUDA__ #define CCL_NAMESPACE_BEGIN #define CCL_NAMESPACE_END -/* Selective nodes compilation. */ -#ifndef __NODES_MAX_GROUP__ -# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX -#endif -#ifndef __NODES_FEATURES__ -# define __NODES_FEATURES__ NODE_FEATURE_ALL +#ifndef ATTR_FALLTHROUGH +# define ATTR_FALLTHROUGH #endif /* Manual definitions so we can compile without CUDA toolkit. */ @@ -38,8 +33,6 @@ typedef unsigned long long uint64_t; #else # include <stdint.h> #endif -typedef unsigned short half; -typedef unsigned long long CUtexObject; #ifdef CYCLES_CUBIN_CC # define FLT_MIN 1.175494350822287507969e-38f @@ -47,14 +40,7 @@ typedef unsigned long long CUtexObject; # define FLT_EPSILON 1.192092896e-07F #endif -__device__ half __float2half(const float f) -{ - half val; - asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f)); - return val; -} - -/* Qualifier wrappers for different names on different devices */ +/* Qualifiers */ #define ccl_device __device__ __inline__ #if __CUDA_ARCH__ < 500 @@ -68,104 +54,61 @@ __device__ half __float2half(const float f) #define ccl_device_noinline_cpu ccl_device #define ccl_global #define ccl_static_constant __constant__ +#define ccl_device_constant __constant__ __device__ #define ccl_constant const -#define ccl_local __shared__ -#define ccl_local_param +#define ccl_gpu_shared __shared__ #define ccl_private #define ccl_may_alias #define ccl_addr_space #define ccl_restrict __restrict__ #define ccl_loop_no_unroll -/* TODO(sergey): In theory we might use references with CUDA, however - * performance impact yet to be investigated. 
- */ -#define ccl_ref #define ccl_align(n) __align__(n) #define ccl_optional_struct_init -#define ATTR_FALLTHROUGH - -#define CCL_MAX_LOCAL_SIZE (CUDA_THREADS_BLOCK_WIDTH * CUDA_THREADS_BLOCK_WIDTH) - /* No assert supported for CUDA */ #define kernel_assert(cond) -/* Types */ +/* GPU thread, block, grid size and index */ -#include "util/util_half.h" -#include "util/util_types.h" +#define ccl_gpu_thread_idx_x (threadIdx.x) +#define ccl_gpu_block_dim_x (blockDim.x) +#define ccl_gpu_block_idx_x (blockIdx.x) +#define ccl_gpu_grid_dim_x (gridDim.x) +#define ccl_gpu_warp_size (warpSize) -/* Work item functions */ +#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x) +#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x) -ccl_device_inline uint ccl_local_id(uint d) -{ - switch (d) { - case 0: - return threadIdx.x; - case 1: - return threadIdx.y; - case 2: - return threadIdx.z; - default: - return 0; - } -} +/* GPU warp synchronizaton */ -#define ccl_global_id(d) (ccl_group_id(d) * ccl_local_size(d) + ccl_local_id(d)) +#define ccl_gpu_syncthreads() __syncthreads() +#define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate) +#define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down_sync(mask, var, detla) +#define ccl_gpu_popc(x) __popc(x) -ccl_device_inline uint ccl_local_size(uint d) -{ - switch (d) { - case 0: - return blockDim.x; - case 1: - return blockDim.y; - case 2: - return blockDim.z; - default: - return 0; - } -} +/* GPU texture objects */ -#define ccl_global_size(d) (ccl_num_groups(d) * ccl_local_size(d)) +typedef unsigned long long CUtexObject; +typedef CUtexObject ccl_gpu_tex_object; -ccl_device_inline uint ccl_group_id(uint d) +template<typename T> +ccl_device_forceinline T ccl_gpu_tex_object_read_2D(const ccl_gpu_tex_object texobj, + const float x, + const float y) { - switch (d) { - case 0: - return blockIdx.x; - case 1: - return blockIdx.y; - case 2: - return blockIdx.z; - 
default: - return 0; - } + return tex2D<T>(texobj, x, y); } -ccl_device_inline uint ccl_num_groups(uint d) +template<typename T> +ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object texobj, + const float x, + const float y, + const float z) { - switch (d) { - case 0: - return gridDim.x; - case 1: - return gridDim.y; - case 2: - return gridDim.z; - default: - return 0; - } + return tex3D<T>(texobj, x, y, z); } -/* Textures */ - -/* Use arrays for regular data. */ -#define kernel_tex_fetch(t, index) t[(index)] -#define kernel_tex_array(t) (t) - -#define kernel_data __data - /* Use fast math functions */ #define cosf(x) __cosf(((float)(x))) @@ -175,4 +118,18 @@ ccl_device_inline uint ccl_num_groups(uint d) #define logf(x) __logf(((float)(x))) #define expf(x) __expf(((float)(x))) -#endif /* __KERNEL_COMPAT_CUDA_H__ */ +/* Half */ + +typedef unsigned short half; + +__device__ half __float2half(const float f) +{ + half val; + asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f)); + return val; +} + +/* Types */ + +#include "util/util_half.h" +#include "util/util_types.h" diff --git a/intern/cycles/kernel/device/cuda/config.h b/intern/cycles/kernel/device/cuda/config.h new file mode 100644 index 00000000000..46196dcdb51 --- /dev/null +++ b/intern/cycles/kernel/device/cuda/config.h @@ -0,0 +1,114 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* Device data taken from CUDA occupancy calculator. + * + * Terminology + * - CUDA GPUs have multiple streaming multiprocessors + * - Each multiprocessor executes multiple thread blocks + * - Each thread block contains a number of threads, also known as the block size + * - Multiprocessors have a fixed number of registers, and the amount of registers + * used by each threads limits the number of threads per block. + */ + +/* 3.0 and 3.5 */ +#if __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350 +# define GPU_MULTIPRESSOR_MAX_REGISTERS 65536 +# define GPU_MULTIPROCESSOR_MAX_BLOCKS 16 +# define GPU_BLOCK_MAX_THREADS 1024 +# define GPU_THREAD_MAX_REGISTERS 63 + +/* tunable parameters */ +# define GPU_KERNEL_BLOCK_NUM_THREADS 256 +# define GPU_KERNEL_MAX_REGISTERS 63 + +/* 3.2 */ +#elif __CUDA_ARCH__ == 320 +# define GPU_MULTIPRESSOR_MAX_REGISTERS 32768 +# define GPU_MULTIPROCESSOR_MAX_BLOCKS 16 +# define GPU_BLOCK_MAX_THREADS 1024 +# define GPU_THREAD_MAX_REGISTERS 63 + +/* tunable parameters */ +# define GPU_KERNEL_BLOCK_NUM_THREADS 256 +# define GPU_KERNEL_MAX_REGISTERS 63 + +/* 3.7 */ +#elif __CUDA_ARCH__ == 370 +# define GPU_MULTIPRESSOR_MAX_REGISTERS 65536 +# define GPU_MULTIPROCESSOR_MAX_BLOCKS 16 +# define GPU_BLOCK_MAX_THREADS 1024 +# define GPU_THREAD_MAX_REGISTERS 255 + +/* tunable parameters */ +# define GPU_KERNEL_BLOCK_NUM_THREADS 256 +# define GPU_KERNEL_MAX_REGISTERS 63 + +/* 5.x, 6.x */ +#elif __CUDA_ARCH__ <= 699 +# define GPU_MULTIPRESSOR_MAX_REGISTERS 65536 +# define GPU_MULTIPROCESSOR_MAX_BLOCKS 32 +# define GPU_BLOCK_MAX_THREADS 1024 +# define GPU_THREAD_MAX_REGISTERS 255 + +/* tunable parameters */ +# define GPU_KERNEL_BLOCK_NUM_THREADS 256 +/* CUDA 9.0 seems to cause slowdowns on high-end Pascal cards unless we increase the number of + * registers */ +# if __CUDACC_VER_MAJOR__ >= 9 && __CUDA_ARCH__ >= 600 +# define GPU_KERNEL_MAX_REGISTERS 64 +# else +# define GPU_KERNEL_MAX_REGISTERS 48 +# endif + +/* 7.x, 8.x */ +#elif __CUDA_ARCH__ <= 899 
+# define GPU_MULTIPRESSOR_MAX_REGISTERS 65536 +# define GPU_MULTIPROCESSOR_MAX_BLOCKS 32 +# define GPU_BLOCK_MAX_THREADS 1024 +# define GPU_THREAD_MAX_REGISTERS 255 + +/* tunable parameters */ +# define GPU_KERNEL_BLOCK_NUM_THREADS 512 +# define GPU_KERNEL_MAX_REGISTERS 96 + +/* unknown architecture */ +#else +# error "Unknown or unsupported CUDA architecture, can't determine launch bounds" +#endif + +/* Compute number of threads per block and minimum blocks per multiprocessor + * given the maximum number of registers per thread. */ + +#define ccl_gpu_kernel(block_num_threads, thread_num_registers) \ + extern "C" __global__ void __launch_bounds__(block_num_threads, \ + GPU_MULTIPRESSOR_MAX_REGISTERS / \ + (block_num_threads * thread_num_registers)) + +/* sanity checks */ + +#if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS +# error "Maximum number of threads per block exceeded" +#endif + +#if GPU_MULTIPRESSOR_MAX_REGISTERS / (GPU_KERNEL_BLOCK_NUM_THREADS * GPU_KERNEL_MAX_REGISTERS) > \ + GPU_MULTIPROCESSOR_MAX_BLOCKS +# error "Maximum number of blocks per multiprocessor exceeded" +#endif + +#if GPU_KERNEL_MAX_REGISTERS > GPU_THREAD_MAX_REGISTERS +# error "Maximum number of registers per thread exceeded" +#endif diff --git a/intern/cycles/kernel/device/cuda/globals.h b/intern/cycles/kernel/device/cuda/globals.h new file mode 100644 index 00000000000..169047175f5 --- /dev/null +++ b/intern/cycles/kernel/device/cuda/globals.h @@ -0,0 +1,48 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Constant Globals */ + +#pragma once + +#include "kernel/kernel_profiling.h" +#include "kernel/kernel_types.h" + +#include "kernel/integrator/integrator_state.h" + +CCL_NAMESPACE_BEGIN + +/* Not actually used, just a NULL pointer that gets passed everywhere, which we + * hope gets optimized out by the compiler. */ +struct KernelGlobals { + int unused[1]; +}; + +/* Global scene data and textures */ +__constant__ KernelData __data; +#define KERNEL_TEX(type, name) const __constant__ __device__ type *name; +#include "kernel/kernel_textures.h" + +/* Integrator state */ +__constant__ IntegratorStateGPU __integrator_state; + +/* Abstraction macros */ +#define kernel_data __data +#define kernel_tex_fetch(t, index) t[(index)] +#define kernel_tex_array(t) (t) +#define kernel_integrator_state __integrator_state + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl b/intern/cycles/kernel/device/cuda/kernel.cu index 84938b889e5..e26fe243642 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl +++ b/intern/cycles/kernel/device/cuda/kernel.cu @@ -1,5 +1,5 @@ /* - * Copyright 2011-2017 Blender Foundation + * Copyright 2011-2013 Blender Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,11 +14,15 @@ * limitations under the License. 
*/ -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_indirect_subsurface.h" +/* CUDA kernel entry points */ -#define KERNEL_NAME indirect_subsurface -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME +#ifdef __CUDA_ARCH__ +# include "kernel/device/cuda/compat.h" +# include "kernel/device/cuda/config.h" +# include "kernel/device/cuda/globals.h" + +# include "kernel/device/gpu/image.h" +# include "kernel/device/gpu/kernel.h" + +#endif diff --git a/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h b/intern/cycles/kernel/device/gpu/image.h index 132653fa7ca..b015c78a8f5 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h +++ b/intern/cycles/kernel/device/gpu/image.h @@ -14,6 +14,10 @@ * limitations under the License. */ +#pragma once + +CCL_NAMESPACE_BEGIN + #ifdef WITH_NANOVDB # define NDEBUG /* Disable "assert" in device code */ # define NANOVDB_USE_INTRINSICS @@ -61,9 +65,9 @@ ccl_device float cubic_h1(float a) /* Fast bicubic texture lookup using 4 bilinear lookups, adapted from CUDA samples. 
*/ template<typename T> -ccl_device T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, float y) +ccl_device_noinline T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, float y) { - CUtexObject tex = (CUtexObject)info.data; + ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data; x = (x * info.width) - 0.5f; y = (y * info.height) - 0.5f; @@ -81,15 +85,18 @@ ccl_device T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, f float y0 = (py + cubic_h0(fy) + 0.5f) / info.height; float y1 = (py + cubic_h1(fy) + 0.5f) / info.height; - return cubic_g0(fy) * (g0x * tex2D<T>(tex, x0, y0) + g1x * tex2D<T>(tex, x1, y0)) + - cubic_g1(fy) * (g0x * tex2D<T>(tex, x0, y1) + g1x * tex2D<T>(tex, x1, y1)); + return cubic_g0(fy) * (g0x * ccl_gpu_tex_object_read_2D<T>(tex, x0, y0) + + g1x * ccl_gpu_tex_object_read_2D<T>(tex, x1, y0)) + + cubic_g1(fy) * (g0x * ccl_gpu_tex_object_read_2D<T>(tex, x0, y1) + + g1x * ccl_gpu_tex_object_read_2D<T>(tex, x1, y1)); } /* Fast tricubic texture lookup using 8 trilinear lookups. 
*/ template<typename T> -ccl_device T kernel_tex_image_interp_tricubic(const TextureInfo &info, float x, float y, float z) +ccl_device_noinline T +kernel_tex_image_interp_tricubic(const TextureInfo &info, float x, float y, float z) { - CUtexObject tex = (CUtexObject)info.data; + ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data; x = (x * info.width) - 0.5f; y = (y * info.height) - 0.5f; @@ -117,10 +124,14 @@ ccl_device T kernel_tex_image_interp_tricubic(const TextureInfo &info, float x, float z0 = (pz + cubic_h0(fz) + 0.5f) / info.depth; float z1 = (pz + cubic_h1(fz) + 0.5f) / info.depth; - return g0z * (g0y * (g0x * tex3D<T>(tex, x0, y0, z0) + g1x * tex3D<T>(tex, x1, y0, z0)) + - g1y * (g0x * tex3D<T>(tex, x0, y1, z0) + g1x * tex3D<T>(tex, x1, y1, z0))) + - g1z * (g0y * (g0x * tex3D<T>(tex, x0, y0, z1) + g1x * tex3D<T>(tex, x1, y0, z1)) + - g1y * (g0x * tex3D<T>(tex, x0, y1, z1) + g1x * tex3D<T>(tex, x1, y1, z1))); + return g0z * (g0y * (g0x * ccl_gpu_tex_object_read_3D<T>(tex, x0, y0, z0) + + g1x * ccl_gpu_tex_object_read_3D<T>(tex, x1, y0, z0)) + + g1y * (g0x * ccl_gpu_tex_object_read_3D<T>(tex, x0, y1, z0) + + g1x * ccl_gpu_tex_object_read_3D<T>(tex, x1, y1, z0))) + + g1z * (g0y * (g0x * ccl_gpu_tex_object_read_3D<T>(tex, x0, y0, z1) + + g1x * ccl_gpu_tex_object_read_3D<T>(tex, x1, y0, z1)) + + g1y * (g0x * ccl_gpu_tex_object_read_3D<T>(tex, x0, y1, z1) + + g1x * ccl_gpu_tex_object_read_3D<T>(tex, x1, y1, z1))); } #ifdef WITH_NANOVDB @@ -157,7 +168,7 @@ ccl_device T kernel_tex_image_interp_tricubic_nanovdb(S &s, float x, float y, fl } template<typename T> -ccl_device_inline T kernel_tex_image_interp_nanovdb( +ccl_device_noinline T kernel_tex_image_interp_nanovdb( const TextureInfo &info, float x, float y, float z, uint interpolation) { using namespace nanovdb; @@ -178,7 +189,7 @@ ccl_device_inline T kernel_tex_image_interp_nanovdb( } #endif -ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y) +ccl_device float4 
kernel_tex_image_interp(const KernelGlobals *kg, int id, float x, float y) { const TextureInfo &info = kernel_tex_fetch(__texture_info, id); @@ -190,8 +201,8 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl return kernel_tex_image_interp_bicubic<float4>(info, x, y); } else { - CUtexObject tex = (CUtexObject)info.data; - return tex2D<float4>(tex, x, y); + ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data; + return ccl_gpu_tex_object_read_2D<float4>(tex, x, y); } } /* float, byte and half */ @@ -202,15 +213,15 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl f = kernel_tex_image_interp_bicubic<float>(info, x, y); } else { - CUtexObject tex = (CUtexObject)info.data; - f = tex2D<float>(tex, x, y); + ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data; + f = ccl_gpu_tex_object_read_2D<float>(tex, x, y); } return make_float4(f, f, f, 1.0f); } } -ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, +ccl_device float4 kernel_tex_image_interp_3d(const KernelGlobals *kg, int id, float3 P, InterpolationType interp) @@ -245,8 +256,8 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, return kernel_tex_image_interp_tricubic<float4>(info, x, y, z); } else { - CUtexObject tex = (CUtexObject)info.data; - return tex3D<float4>(tex, x, y, z); + ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data; + return ccl_gpu_tex_object_read_3D<float4>(tex, x, y, z); } } else { @@ -256,10 +267,12 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, f = kernel_tex_image_interp_tricubic<float>(info, x, y, z); } else { - CUtexObject tex = (CUtexObject)info.data; - f = tex3D<float>(tex, x, y, z); + ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data; + f = ccl_gpu_tex_object_read_3D<float>(tex, x, y, z); } return make_float4(f, f, f, 1.0f); } } + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h new file mode 
100644 index 00000000000..7b79c0aedfa --- /dev/null +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -0,0 +1,843 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Common GPU kernels. */ + +#include "kernel/device/gpu/parallel_active_index.h" +#include "kernel/device/gpu/parallel_prefix_sum.h" +#include "kernel/device/gpu/parallel_sorted_index.h" + +#include "kernel/integrator/integrator_state.h" +#include "kernel/integrator/integrator_state_flow.h" +#include "kernel/integrator/integrator_state_util.h" + +#include "kernel/integrator/integrator_init_from_bake.h" +#include "kernel/integrator/integrator_init_from_camera.h" +#include "kernel/integrator/integrator_intersect_closest.h" +#include "kernel/integrator/integrator_intersect_shadow.h" +#include "kernel/integrator/integrator_intersect_subsurface.h" +#include "kernel/integrator/integrator_intersect_volume_stack.h" +#include "kernel/integrator/integrator_shade_background.h" +#include "kernel/integrator/integrator_shade_light.h" +#include "kernel/integrator/integrator_shade_shadow.h" +#include "kernel/integrator/integrator_shade_surface.h" +#include "kernel/integrator/integrator_shade_volume.h" + +#include "kernel/kernel_adaptive_sampling.h" +#include "kernel/kernel_bake.h" +#include "kernel/kernel_film.h" +#include "kernel/kernel_work_stealing.h" + +/* -------------------------------------------------------------------- + * Integrator. 
+ */ + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_reset(int num_states) +{ + const int state = ccl_gpu_global_id_x(); + + if (state < num_states) { + INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0; + INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0; + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_init_from_camera(KernelWorkTile *tiles, + const int num_tiles, + float *render_buffer, + const int max_tile_work_size) +{ + const int work_index = ccl_gpu_global_id_x(); + + if (work_index >= max_tile_work_size * num_tiles) { + return; + } + + const int tile_index = work_index / max_tile_work_size; + const int tile_work_index = work_index - tile_index * max_tile_work_size; + + const KernelWorkTile *tile = &tiles[tile_index]; + + if (tile_work_index >= tile->work_size) { + return; + } + + const int state = tile->path_index_offset + tile_work_index; + + uint x, y, sample; + get_work_pixel(tile, tile_work_index, &x, &y, &sample); + + integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample); +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_init_from_bake(KernelWorkTile *tiles, + const int num_tiles, + float *render_buffer, + const int max_tile_work_size) +{ + const int work_index = ccl_gpu_global_id_x(); + + if (work_index >= max_tile_work_size * num_tiles) { + return; + } + + const int tile_index = work_index / max_tile_work_size; + const int tile_work_index = work_index - tile_index * max_tile_work_size; + + const KernelWorkTile *tile = &tiles[tile_index]; + + if (tile_work_index >= tile->work_size) { + return; + } + + const int state = tile->path_index_offset + tile_work_index; + + uint x, y, sample; + get_work_pixel(tile, tile_work_index, &x, &y, &sample); + + integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample); +} + 
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_intersect_closest(const int *path_index_array, const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? path_index_array[global_index] : global_index; + integrator_intersect_closest(NULL, state); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_intersect_shadow(const int *path_index_array, const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? path_index_array[global_index] : global_index; + integrator_intersect_shadow(NULL, state); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_intersect_subsurface(const int *path_index_array, const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? path_index_array[global_index] : global_index; + integrator_intersect_subsurface(NULL, state); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_intersect_volume_stack(const int *path_index_array, const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? path_index_array[global_index] : global_index; + integrator_intersect_volume_stack(NULL, state); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_shade_background(const int *path_index_array, + float *render_buffer, + const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? 
path_index_array[global_index] : global_index; + integrator_shade_background(NULL, state, render_buffer); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_shade_light(const int *path_index_array, + float *render_buffer, + const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? path_index_array[global_index] : global_index; + integrator_shade_light(NULL, state, render_buffer); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_shade_shadow(const int *path_index_array, + float *render_buffer, + const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? path_index_array[global_index] : global_index; + integrator_shade_shadow(NULL, state, render_buffer); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_shade_surface(const int *path_index_array, + float *render_buffer, + const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? path_index_array[global_index] : global_index; + integrator_shade_surface(NULL, state, render_buffer); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_shade_surface_raytrace(const int *path_index_array, + float *render_buffer, + const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? 
path_index_array[global_index] : global_index; + integrator_shade_surface_raytrace(NULL, state, render_buffer); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_shade_volume(const int *path_index_array, + float *render_buffer, + const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? path_index_array[global_index] : global_index; + integrator_shade_volume(NULL, state, render_buffer); + } +} + +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_queued_paths_array(int num_states, + int *indices, + int *num_indices, + int kernel) +{ + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( + num_states, indices, num_indices, [kernel](const int state) { + return (INTEGRATOR_STATE(path, queued_kernel) == kernel); + }); +} + +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_queued_shadow_paths_array(int num_states, + int *indices, + int *num_indices, + int kernel) +{ + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( + num_states, indices, num_indices, [kernel](const int state) { + return (INTEGRATOR_STATE(shadow_path, queued_kernel) == kernel); + }); +} + +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_active_paths_array(int num_states, int *indices, int *num_indices) +{ + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( + num_states, indices, num_indices, [](const int state) { + return (INTEGRATOR_STATE(path, queued_kernel) != 0) || + (INTEGRATOR_STATE(shadow_path, queued_kernel) != 0); + }); +} + +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_terminated_paths_array(int 
num_states, + int *indices, + int *num_indices, + int indices_offset) +{ + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( + num_states, indices + indices_offset, num_indices, [](const int state) { + return (INTEGRATOR_STATE(path, queued_kernel) == 0) && + (INTEGRATOR_STATE(shadow_path, queued_kernel) == 0); + }); +} + +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_sorted_paths_array( + int num_states, int *indices, int *num_indices, int *key_prefix_sum, int kernel) +{ + gpu_parallel_sorted_index_array<GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE>( + num_states, indices, num_indices, key_prefix_sum, [kernel](const int state) { + return (INTEGRATOR_STATE(path, queued_kernel) == kernel) ? + INTEGRATOR_STATE(path, shader_sort_key) : + GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY; + }); +} + +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_compact_paths_array(int num_states, + int *indices, + int *num_indices, + int num_active_paths) +{ + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( + num_states, indices, num_indices, [num_active_paths](const int state) { + return (state >= num_active_paths) && + ((INTEGRATOR_STATE(path, queued_kernel) != 0) || + (INTEGRATOR_STATE(shadow_path, queued_kernel) != 0)); + }); +} + +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_compact_states(const int *active_terminated_states, + const int active_states_offset, + const int terminated_states_offset, + const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int from_state = active_terminated_states[active_states_offset + global_index]; + const int to_state = active_terminated_states[terminated_states_offset + global_index]; + + integrator_state_move(to_state, from_state); 
+ } +} + +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE) + kernel_gpu_prefix_sum(int *values, int num_values) +{ + gpu_parallel_prefix_sum<GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE>(values, num_values); +} + +/* -------------------------------------------------------------------- + * Adaptive sampling. + */ + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_adaptive_sampling_convergence_check(float *render_buffer, + int sx, + int sy, + int sw, + int sh, + float threshold, + bool reset, + int offset, + int stride, + uint *num_active_pixels) +{ + const int work_index = ccl_gpu_global_id_x(); + const int y = work_index / sw; + const int x = work_index - y * sw; + + bool converged = true; + + if (x < sw && y < sh) { + converged = kernel_adaptive_sampling_convergence_check( + nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride); + } + + /* NOTE: All threads specified in the mask must execute the intrinsic. 
*/ + const uint num_active_pixels_mask = ccl_gpu_ballot(!converged); + const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size; + if (lane_id == 0) { + atomic_fetch_and_add_uint32(num_active_pixels, __popc(num_active_pixels_mask)); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_adaptive_sampling_filter_x( + float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride) +{ + const int y = ccl_gpu_global_id_x(); + + if (y < sh) { + kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_adaptive_sampling_filter_y( + float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride) +{ + const int x = ccl_gpu_global_id_x(); + + if (x < sw) { + kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride); + } +} + +/* -------------------------------------------------------------------- + * Cryptomatte. + */ + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_cryptomatte_postprocess(float *render_buffer, int num_pixels) +{ + const int pixel_index = ccl_gpu_global_id_x(); + + if (pixel_index < num_pixels) { + kernel_cryptomatte_post(nullptr, render_buffer, pixel_index); + } +} + +/* -------------------------------------------------------------------- + * Film. + */ + +/* Common implementation for float destination. 
*/ +template<typename Processor> +ccl_device_inline void kernel_gpu_film_convert_common(const KernelFilmConvert *kfilm_convert, + float *pixels, + float *render_buffer, + int num_pixels, + int width, + int offset, + int stride, + int dst_offset, + int dst_stride, + const Processor &processor) +{ + const int render_pixel_index = ccl_gpu_global_id_x(); + if (render_pixel_index >= num_pixels) { + return; + } + + const uint64_t render_buffer_offset = (uint64_t)render_pixel_index * kfilm_convert->pass_stride; + ccl_global const float *buffer = render_buffer + render_buffer_offset; + ccl_global float *pixel = pixels + + (render_pixel_index + dst_offset) * kfilm_convert->pixel_stride; + + processor(kfilm_convert, buffer, pixel); +} + +/* Common implementation for half4 destination and 4-channel input pass. */ +template<typename Processor> +ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgba( + const KernelFilmConvert *kfilm_convert, + uchar4 *rgba, + float *render_buffer, + int num_pixels, + int width, + int offset, + int stride, + int rgba_offset, + int rgba_stride, + const Processor &processor) +{ + const int render_pixel_index = ccl_gpu_global_id_x(); + if (render_pixel_index >= num_pixels) { + return; + } + + const uint64_t render_buffer_offset = (uint64_t)render_pixel_index * kfilm_convert->pass_stride; + ccl_global const float *buffer = render_buffer + render_buffer_offset; + + float pixel[4]; + processor(kfilm_convert, buffer, pixel); + + film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel); + + const int x = render_pixel_index % width; + const int y = render_pixel_index / width; + + ccl_global half4 *out = ((ccl_global half4 *)rgba) + rgba_offset + y * rgba_stride + x; + float4_store_half((ccl_global half *)out, make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); +} + +/* Common implementation for half4 destination and 3-channel input pass. 
*/ +template<typename Processor> +ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgb( + const KernelFilmConvert *kfilm_convert, + uchar4 *rgba, + float *render_buffer, + int num_pixels, + int width, + int offset, + int stride, + int rgba_offset, + int rgba_stride, + const Processor &processor) +{ + kernel_gpu_film_convert_half_rgba_common_rgba( + kfilm_convert, + rgba, + render_buffer, + num_pixels, + width, + offset, + stride, + rgba_offset, + rgba_stride, + [&processor](const KernelFilmConvert *kfilm_convert, + ccl_global const float *buffer, + float *pixel_rgba) { + processor(kfilm_convert, buffer, pixel_rgba); + pixel_rgba[3] = 1.0f; + }); +} + +/* Common implementation for half4 destination and single channel input pass. */ +template<typename Processor> +ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_value( + const KernelFilmConvert *kfilm_convert, + uchar4 *rgba, + float *render_buffer, + int num_pixels, + int width, + int offset, + int stride, + int rgba_offset, + int rgba_stride, + const Processor &processor) +{ + kernel_gpu_film_convert_half_rgba_common_rgba( + kfilm_convert, + rgba, + render_buffer, + num_pixels, + width, + offset, + stride, + rgba_offset, + rgba_stride, + [&processor](const KernelFilmConvert *kfilm_convert, + ccl_global const float *buffer, + float *pixel_rgba) { + float value; + processor(kfilm_convert, buffer, &value); + + pixel_rgba[0] = value; + pixel_rgba[1] = value; + pixel_rgba[2] = value; + pixel_rgba[3] = 1.0f; + }); +} + +#define KERNEL_FILM_CONVERT_PROC(name) \ + ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) name + +#define KERNEL_FILM_CONVERT_DEFINE(variant, channels) \ + KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant) \ + (const KernelFilmConvert kfilm_convert, \ + float *pixels, \ + float *render_buffer, \ + int num_pixels, \ + int width, \ + int offset, \ + int stride, \ + int rgba_offset, \ + int rgba_stride) \ + { \ + 
kernel_gpu_film_convert_common(&kfilm_convert, \ + pixels, \ + render_buffer, \ + num_pixels, \ + width, \ + offset, \ + stride, \ + rgba_offset, \ + rgba_stride, \ + film_get_pass_pixel_##variant); \ + } \ + KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant##_half_rgba) \ + (const KernelFilmConvert kfilm_convert, \ + uchar4 *rgba, \ + float *render_buffer, \ + int num_pixels, \ + int width, \ + int offset, \ + int stride, \ + int rgba_offset, \ + int rgba_stride) \ + { \ + kernel_gpu_film_convert_half_rgba_common_##channels(&kfilm_convert, \ + rgba, \ + render_buffer, \ + num_pixels, \ + width, \ + offset, \ + stride, \ + rgba_offset, \ + rgba_stride, \ + film_get_pass_pixel_##variant); \ + } + +KERNEL_FILM_CONVERT_DEFINE(depth, value) +KERNEL_FILM_CONVERT_DEFINE(mist, value) +KERNEL_FILM_CONVERT_DEFINE(sample_count, value) +KERNEL_FILM_CONVERT_DEFINE(float, value) + +KERNEL_FILM_CONVERT_DEFINE(light_path, rgb) +KERNEL_FILM_CONVERT_DEFINE(float3, rgb) + +KERNEL_FILM_CONVERT_DEFINE(motion, rgba) +KERNEL_FILM_CONVERT_DEFINE(cryptomatte, rgba) +KERNEL_FILM_CONVERT_DEFINE(shadow_catcher, rgba) +KERNEL_FILM_CONVERT_DEFINE(shadow_catcher_matte_with_shadow, rgba) +KERNEL_FILM_CONVERT_DEFINE(combined, rgba) +KERNEL_FILM_CONVERT_DEFINE(float4, rgba) + +#undef KERNEL_FILM_CONVERT_DEFINE +#undef KERNEL_FILM_CONVERT_HALF_RGBA_DEFINE +#undef KERNEL_FILM_CONVERT_PROC + +/* -------------------------------------------------------------------- + * Shader evaluation. 
+ */ + +/* Displacement */ + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_shader_eval_displace(KernelShaderEvalInput *input, + float4 *output, + const int offset, + const int work_size) +{ + int i = ccl_gpu_global_id_x(); + if (i < work_size) { + kernel_displace_evaluate(NULL, input, output, offset + i); + } +} + +/* Background Shader Evaluation */ + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_shader_eval_background(KernelShaderEvalInput *input, + float4 *output, + const int offset, + const int work_size) +{ + int i = ccl_gpu_global_id_x(); + if (i < work_size) { + kernel_background_evaluate(NULL, input, output, offset + i); + } +} + +/* -------------------------------------------------------------------- + * Denoising. + */ + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_filter_color_preprocess(float *render_buffer, + int full_x, + int full_y, + int width, + int height, + int offset, + int stride, + int pass_stride, + int pass_denoised) +{ + const int work_index = ccl_gpu_global_id_x(); + const int y = work_index / width; + const int x = work_index - y * width; + + if (x >= width || y >= height) { + return; + } + + const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride; + float *buffer = render_buffer + render_pixel_index * pass_stride; + + float *color_out = buffer + pass_denoised; + color_out[0] = clamp(color_out[0], 0.0f, 10000.0f); + color_out[1] = clamp(color_out[1], 0.0f, 10000.0f); + color_out[2] = clamp(color_out[2], 0.0f, 10000.0f); +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_filter_guiding_preprocess(float *guiding_buffer, + int guiding_pass_stride, + int guiding_pass_albedo, + int guiding_pass_normal, + const float *render_buffer, + int render_offset, + int render_stride, + int render_pass_stride, + int render_pass_sample_count, + int render_pass_denoising_albedo, 
+ int render_pass_denoising_normal, + int full_x, + int full_y, + int width, + int height, + int num_samples) +{ + const int work_index = ccl_gpu_global_id_x(); + const int y = work_index / width; + const int x = work_index - y * width; + + if (x >= width || y >= height) { + return; + } + + const uint64_t guiding_pixel_index = x + y * width; + float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride; + + const uint64_t render_pixel_index = render_offset + (x + full_x) + (y + full_y) * render_stride; + const float *buffer = render_buffer + render_pixel_index * render_pass_stride; + + float pixel_scale; + if (render_pass_sample_count == PASS_UNUSED) { + pixel_scale = 1.0f / num_samples; + } + else { + pixel_scale = 1.0f / __float_as_uint(buffer[render_pass_sample_count]); + } + + /* Albedo pass. */ + if (guiding_pass_albedo != PASS_UNUSED) { + kernel_assert(render_pass_denoising_albedo != PASS_UNUSED); + + const float *aledo_in = buffer + render_pass_denoising_albedo; + float *albedo_out = guiding_pixel + guiding_pass_albedo; + + albedo_out[0] = aledo_in[0] * pixel_scale; + albedo_out[1] = aledo_in[1] * pixel_scale; + albedo_out[2] = aledo_in[2] * pixel_scale; + } + + /* Normal pass. 
*/ + if (render_pass_denoising_normal != PASS_UNUSED) { + kernel_assert(render_pass_denoising_normal != PASS_UNUSED); + + const float *normal_in = buffer + render_pass_denoising_normal; + float *normal_out = guiding_pixel + guiding_pass_normal; + + normal_out[0] = normal_in[0] * pixel_scale; + normal_out[1] = normal_in[1] * pixel_scale; + normal_out[2] = normal_in[2] * pixel_scale; + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_filter_guiding_set_fake_albedo(float *guiding_buffer, + int guiding_pass_stride, + int guiding_pass_albedo, + int width, + int height) +{ + kernel_assert(guiding_pass_albedo != PASS_UNUSED); + + const int work_index = ccl_gpu_global_id_x(); + const int y = work_index / width; + const int x = work_index - y * width; + + if (x >= width || y >= height) { + return; + } + + const uint64_t guiding_pixel_index = x + y * width; + float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride; + + float *albedo_out = guiding_pixel + guiding_pass_albedo; + + albedo_out[0] = 0.5f; + albedo_out[1] = 0.5f; + albedo_out[2] = 0.5f; +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_filter_color_postprocess(float *render_buffer, + int full_x, + int full_y, + int width, + int height, + int offset, + int stride, + int pass_stride, + int num_samples, + int pass_noisy, + int pass_denoised, + int pass_sample_count, + int num_components, + bool use_compositing) +{ + const int work_index = ccl_gpu_global_id_x(); + const int y = work_index / width; + const int x = work_index - y * width; + + if (x >= width || y >= height) { + return; + } + + const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride; + float *buffer = render_buffer + render_pixel_index * pass_stride; + + float pixel_scale; + if (pass_sample_count == PASS_UNUSED) { + pixel_scale = num_samples; + } + else { + pixel_scale = __float_as_uint(buffer[pass_sample_count]); + } + + 
float *denoised_pixel = buffer + pass_denoised; + + denoised_pixel[0] *= pixel_scale; + denoised_pixel[1] *= pixel_scale; + denoised_pixel[2] *= pixel_scale; + + if (num_components == 3) { + /* Pass without alpha channel. */ + } + else if (!use_compositing) { + /* Currently compositing passes are either 3-component (derived by dividing light passes) + * or do not have transparency (shadow catcher). Implicitly rely on this logic, as it + * simplifies logic and avoids extra memory allocation. */ + const float *noisy_pixel = buffer + pass_noisy; + denoised_pixel[3] = noisy_pixel[3]; + } + else { + /* Assigning to zero since this is a default alpha value for 3-component passes, and it + * is an opaque pixel for 4 component passes. */ + + denoised_pixel[3] = 0; + } +} + +/* -------------------------------------------------------------------- + * Shadow catcher. + */ + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_shadow_catcher_count_possible_splits(int num_states, + uint *num_possible_splits) +{ + const int state = ccl_gpu_global_id_x(); + + bool can_split = false; + + if (state < num_states) { + can_split = kernel_shadow_catcher_path_can_split(nullptr, state); + } + + /* NOTE: All threads specified in the mask must execute the intrinsic. */ + const uint can_split_mask = ccl_gpu_ballot(can_split); + const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size; + if (lane_id == 0) { + atomic_fetch_and_add_uint32(num_possible_splits, __popc(can_split_mask)); + } +} diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h new file mode 100644 index 00000000000..85500bf4d07 --- /dev/null +++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h @@ -0,0 +1,83 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +CCL_NAMESPACE_BEGIN + +/* Given an array of states, build an array of indices for which the states + * are active. + * + * Shared memory requirement is sizeof(int) * (number_of_warps + 1) */ + +#include "util/util_atomic.h" + +#define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512 + +template<uint blocksize, typename IsActiveOp> +__device__ void gpu_parallel_active_index_array(const uint num_states, + int *indices, + int *num_indices, + IsActiveOp is_active_op) +{ + extern ccl_gpu_shared int warp_offset[]; + + const uint thread_index = ccl_gpu_thread_idx_x; + const uint thread_warp = thread_index % ccl_gpu_warp_size; + + const uint warp_index = thread_index / ccl_gpu_warp_size; + const uint num_warps = blocksize / ccl_gpu_warp_size; + + /* Test if state corresponding to this thread is active. */ + const uint state_index = ccl_gpu_block_idx_x * blocksize + thread_index; + const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0; + + /* For each thread within a warp compute how many other active states precede it. */ + const uint thread_mask = 0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp); + const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) & thread_mask); + + /* Last thread in warp stores number of active states for each warp. 
*/ + if (thread_warp == ccl_gpu_warp_size - 1) { + warp_offset[warp_index] = thread_offset + is_active; + } + + ccl_gpu_syncthreads(); + + /* Last thread in block converts per-warp sizes to offsets, increments global size of + * index array and gets offset to write to. */ + if (thread_index == blocksize - 1) { + /* TODO: parallelize this. */ + int offset = 0; + for (int i = 0; i < num_warps; i++) { + int num_active = warp_offset[i]; + warp_offset[i] = offset; + offset += num_active; + } + + const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active; + warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active); + } + + ccl_gpu_syncthreads(); + + /* Write to index array. */ + if (is_active) { + const uint block_offset = warp_offset[num_warps]; + indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index; + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h new file mode 100644 index 00000000000..f609520b8b4 --- /dev/null +++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h @@ -0,0 +1,46 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +CCL_NAMESPACE_BEGIN + +/* Parallel prefix sum. + * + * TODO: actually make this work in parallel. 
+ * + * This is used for an array the size of the number of shaders in the scene + * which is not usually huge, so might not be a significant bottleneck. */ + +#include "util/util_atomic.h" + +#define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512 + +template<uint blocksize> __device__ void gpu_parallel_prefix_sum(int *values, const int num_values) +{ + if (!(ccl_gpu_block_idx_x == 0 && ccl_gpu_thread_idx_x == 0)) { + return; + } + + int offset = 0; + for (int i = 0; i < num_values; i++) { + const int new_offset = offset + values[i]; + values[i] = offset; + offset = new_offset; + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/gpu/parallel_reduce.h b/intern/cycles/kernel/device/gpu/parallel_reduce.h new file mode 100644 index 00000000000..65b1990dbb8 --- /dev/null +++ b/intern/cycles/kernel/device/gpu/parallel_reduce.h @@ -0,0 +1,83 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +CCL_NAMESPACE_BEGIN + +/* Parallel sum of array input_data with size n into output_sum. + * + * Adapted from "Optimizing Parallel Reduction in GPU", Mark Harris. + * + * This version adds multiple elements per thread sequentially. This reduces + * the overall cost of the algorithm while keeping the work complexity O(n) and + * the step complexity O(log n). 
(Brent's Theorem optimization) */ + +#define GPU_PARALLEL_SUM_DEFAULT_BLOCK_SIZE 512 + +template<uint blocksize, typename InputT, typename OutputT, typename ConvertOp> +__device__ void gpu_parallel_sum( + const InputT *input_data, const uint n, OutputT *output_sum, OutputT zero, ConvertOp convert) +{ + extern ccl_gpu_shared OutputT shared_data[]; + + const uint tid = ccl_gpu_thread_idx_x; + const uint gridsize = blocksize * ccl_gpu_grid_dim_x(); + + OutputT sum = zero; + for (uint i = ccl_gpu_block_idx_x * blocksize + tid; i < n; i += gridsize) { + sum += convert(input_data[i]); + } + shared_data[tid] = sum; + + ccl_gpu_syncthreads(); + + if (blocksize >= 512 && tid < 256) { + shared_data[tid] = sum = sum + shared_data[tid + 256]; + } + + ccl_gpu_syncthreads(); + + if (blocksize >= 256 && tid < 128) { + shared_data[tid] = sum = sum + shared_data[tid + 128]; + } + + ccl_gpu_syncthreads(); + + if (blocksize >= 128 && tid < 64) { + shared_data[tid] = sum = sum + shared_data[tid + 64]; + } + + ccl_gpu_syncthreads(); + + if (blocksize >= 64 && tid < 32) { + shared_data[tid] = sum = sum + shared_data[tid + 32]; + } + + ccl_gpu_syncthreads(); + + if (tid < 32) { + for (int offset = ccl_gpu_warp_size / 2; offset > 0; offset /= 2) { + sum += ccl_shfl_down_sync(0xFFFFFFFF, sum, offset); + } + } + + if (tid == 0) { + output_sum[ccl_gpu_block_idx_x] = sum; + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h new file mode 100644 index 00000000000..99b35468517 --- /dev/null +++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h @@ -0,0 +1,49 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +CCL_NAMESPACE_BEGIN + +/* Given an array of states, build an array of indices for which the states + * are active and sorted by a given key. The prefix sum of the number of active + * states per key must have already been computed. + * + * TODO: there may be ways to optimize this to avoid this many atomic ops? */ + +#include "util/util_atomic.h" + +#define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512 +#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0) + +template<uint blocksize, typename GetKeyOp> +__device__ void gpu_parallel_sorted_index_array(const uint num_states, + int *indices, + int *num_indices, + int *key_prefix_sum, + GetKeyOp get_key_op) +{ + const uint state_index = ccl_gpu_block_idx_x * blocksize + ccl_gpu_thread_idx_x; + const int key = (state_index < num_states) ? get_key_op(state_index) : + GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY; + + if (key != GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY) { + const uint index = atomic_fetch_and_add_uint32(&key_prefix_sum[key], 1); + indices[index] = state_index; + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_compat_optix.h b/intern/cycles/kernel/device/optix/compat.h index 064c99ca100..4e255a135c6 100644 --- a/intern/cycles/kernel/kernel_compat_optix.h +++ b/intern/cycles/kernel/device/optix/compat.h @@ -15,14 +15,13 @@ * limitations under the License. 
*/ -#ifndef __KERNEL_COMPAT_OPTIX_H__ -#define __KERNEL_COMPAT_OPTIX_H__ +#pragma once #define OPTIX_DONT_INCLUDE_CUDA #include <optix.h> #define __KERNEL_GPU__ -#define __KERNEL_CUDA__ // OptiX kernels are implicitly CUDA kernels too +#define __KERNEL_CUDA__ /* OptiX kernels are implicitly CUDA kernels too */ #define __KERNEL_OPTIX__ #define CCL_NAMESPACE_BEGIN #define CCL_NAMESPACE_END @@ -31,14 +30,14 @@ # define ATTR_FALLTHROUGH #endif +/* Manual definitions so we can compile without CUDA toolkit. */ + #ifdef __CUDACC_RTC__ typedef unsigned int uint32_t; typedef unsigned long long uint64_t; #else # include <stdint.h> #endif -typedef unsigned short half; -typedef unsigned long long CUtexObject; #ifdef CYCLES_CUBIN_CC # define FLT_MIN 1.175494350822287507969e-38f @@ -46,21 +45,6 @@ typedef unsigned long long CUtexObject; # define FLT_EPSILON 1.192092896e-07F #endif -__device__ half __float2half(const float f) -{ - half val; - asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f)); - return val; -} - -/* Selective nodes compilation. 
*/ -#ifndef __NODES_MAX_GROUP__ -# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX -#endif -#ifndef __NODES_FEATURES__ -# define __NODES_FEATURES__ NODE_FEATURE_ALL -#endif - #define ccl_device \ __device__ __forceinline__ // Function calls are bad for OptiX performance, so inline everything #define ccl_device_inline ccl_device @@ -69,29 +53,75 @@ __device__ half __float2half(const float f) #define ccl_device_noinline_cpu ccl_device #define ccl_global #define ccl_static_constant __constant__ +#define ccl_device_constant __constant__ __device__ #define ccl_constant const -#define ccl_local -#define ccl_local_param +#define ccl_gpu_shared __shared__ #define ccl_private #define ccl_may_alias #define ccl_addr_space -#define ccl_loop_no_unroll #define ccl_restrict __restrict__ -#define ccl_ref +#define ccl_loop_no_unroll #define ccl_align(n) __align__(n) -// Zero initialize structs to help the compiler figure out scoping +/* Zero initialize structs to help the compiler figure out scoping */ #define ccl_optional_struct_init = {} -#define kernel_data __params.data // See kernel_globals.h -#define kernel_tex_array(t) __params.t -#define kernel_tex_fetch(t, index) __params.t[(index)] +/* No assert supported for CUDA */ #define kernel_assert(cond) +/* GPU thread, block, grid size and index */ + +#define ccl_gpu_thread_idx_x (threadIdx.x) +#define ccl_gpu_block_dim_x (blockDim.x) +#define ccl_gpu_block_idx_x (blockIdx.x) +#define ccl_gpu_grid_dim_x (gridDim.x) +#define ccl_gpu_warp_size (warpSize) + +#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x) +#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x) + +/* GPU warp synchronization */ + +#define ccl_gpu_syncthreads() __syncthreads() +#define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate) +#define ccl_gpu_shfl_down_sync(mask, var, delta) __shfl_down_sync(mask, var, delta) +#define ccl_gpu_popc(x) __popc(x) + +/* GPU texture objects */ + 
+typedef unsigned long long CUtexObject; +typedef CUtexObject ccl_gpu_tex_object; + +template<typename T> +ccl_device_forceinline T ccl_gpu_tex_object_read_2D(const ccl_gpu_tex_object texobj, + const float x, + const float y) +{ + return tex2D<T>(texobj, x, y); +} + +template<typename T> +ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object texobj, + const float x, + const float y, + const float z) +{ + return tex3D<T>(texobj, x, y, z); +} + +/* Half */ + +typedef unsigned short half; + +__device__ half __float2half(const float f) +{ + half val; + asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f)); + return val; +} + /* Types */ #include "util/util_half.h" #include "util/util_types.h" - -#endif /* __KERNEL_COMPAT_OPTIX_H__ */ diff --git a/intern/cycles/kernel/device/optix/globals.h b/intern/cycles/kernel/device/optix/globals.h new file mode 100644 index 00000000000..7d898ed5d91 --- /dev/null +++ b/intern/cycles/kernel/device/optix/globals.h @@ -0,0 +1,59 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Constant Globals */ + +#pragma once + +#include "kernel/kernel_profiling.h" +#include "kernel/kernel_types.h" + +#include "kernel/integrator/integrator_state.h" + +CCL_NAMESPACE_BEGIN + +/* Not actually used, just a NULL pointer that gets passed everywhere, which we + * hope gets optimized out by the compiler. 
*/ +struct KernelGlobals { + int unused[1]; +}; + +/* Launch parameters */ +struct KernelParamsOptiX { + /* Kernel arguments */ + const int *path_index_array; + float *render_buffer; + + /* Global scene data and textures */ + KernelData data; +#define KERNEL_TEX(type, name) const type *name; +#include "kernel/kernel_textures.h" + + /* Integrator state */ + IntegratorStateGPU __integrator_state; +}; + +#ifdef __NVCC__ +extern "C" static __constant__ KernelParamsOptiX __params; +#endif + +/* Abstraction macros */ +#define kernel_data __params.data +#define kernel_tex_array(t) __params.t +#define kernel_tex_fetch(t, index) __params.t[(index)] +#define kernel_integrator_state __params.__integrator_state + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/optix/kernel_optix.cu b/intern/cycles/kernel/device/optix/kernel.cu index 7f609eab474..c1e36febfc0 100644 --- a/intern/cycles/kernel/kernels/optix/kernel_optix.cu +++ b/intern/cycles/kernel/device/optix/kernel.cu @@ -16,14 +16,20 @@ */ // clang-format off -#include "kernel/kernel_compat_optix.h" -#include "util/util_atomic.h" -#include "kernel/kernel_types.h" -#include "kernel/kernel_globals.h" -#include "../cuda/kernel_cuda_image.h" // Texture lookup uses normal CUDA intrinsics - -#include "kernel/kernel_path.h" -#include "kernel/kernel_bake.h" +#include "kernel/device/optix/compat.h" +#include "kernel/device/optix/globals.h" + +#include "kernel/device/gpu/image.h" // Texture lookup uses normal CUDA intrinsics + +#include "kernel/integrator/integrator_state.h" +#include "kernel/integrator/integrator_state_flow.h" +#include "kernel/integrator/integrator_state_util.h" + +#include "kernel/integrator/integrator_intersect_closest.h" +#include "kernel/integrator/integrator_intersect_shadow.h" +#include "kernel/integrator/integrator_intersect_subsurface.h" +#include "kernel/integrator/integrator_intersect_volume_stack.h" + // clang-format on template<typename T> ccl_device_forceinline T *get_payload_ptr_0() @@ 
-53,52 +59,36 @@ template<bool always = false> ccl_device_forceinline uint get_object_id() return OBJECT_NONE; } -extern "C" __global__ void __raygen__kernel_optix_path_trace() +extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_closest() { - KernelGlobals kg; // Allocate stack storage for common data - - const uint3 launch_index = optixGetLaunchIndex(); - // Keep threads for same pixel together to improve occupancy of warps - uint pixel_offset = launch_index.x / __params.tile.num_samples; - uint sample_offset = launch_index.x % __params.tile.num_samples; - - kernel_path_trace(&kg, - __params.tile.buffer, - __params.tile.start_sample + sample_offset, - __params.tile.x + pixel_offset, - __params.tile.y + launch_index.y, - __params.tile.offset, - __params.tile.stride); + const int global_index = optixGetLaunchIndex().x; + const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] : + global_index; + integrator_intersect_closest(nullptr, path_index); } -#ifdef __BAKING__ -extern "C" __global__ void __raygen__kernel_optix_bake() +extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_shadow() { - KernelGlobals kg; - const ShaderParams &p = __params.shader; - kernel_bake_evaluate(&kg, - p.input, - p.output, - (ShaderEvalType)p.type, - p.filter, - p.sx + optixGetLaunchIndex().x, - p.offset, - p.sample); + const int global_index = optixGetLaunchIndex().x; + const int path_index = (__params.path_index_array) ? 
__params.path_index_array[global_index] : + global_index; + integrator_intersect_shadow(nullptr, path_index); } -#endif -extern "C" __global__ void __raygen__kernel_optix_displace() +extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_subsurface() { - KernelGlobals kg; - const ShaderParams &p = __params.shader; - kernel_displace_evaluate(&kg, p.input, p.output, p.sx + optixGetLaunchIndex().x); + const int global_index = optixGetLaunchIndex().x; + const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] : + global_index; + integrator_intersect_subsurface(nullptr, path_index); } -extern "C" __global__ void __raygen__kernel_optix_background() +extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_volume_stack() { - KernelGlobals kg; - const ShaderParams &p = __params.shader; - kernel_background_evaluate(&kg, p.input, p.output, p.sx + optixGetLaunchIndex().x); + const int global_index = optixGetLaunchIndex().x; + const int path_index = (__params.path_index_array) ? 
__params.path_index_array[global_index] : + global_index; + integrator_intersect_volume_stack(nullptr, path_index); } extern "C" __global__ void __miss__kernel_optix_miss() @@ -179,54 +169,91 @@ extern "C" __global__ void __anyhit__kernel_optix_local_hit() extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit() { #ifdef __SHADOW_RECORD_ALL__ + bool ignore_intersection = false; + const uint prim = optixGetPrimitiveIndex(); # ifdef __VISIBILITY_FLAG__ const uint visibility = optixGetPayload_4(); if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0) { - return optixIgnoreIntersection(); + ignore_intersection = true; } # endif - // Offset into array with num_hits - Intersection *const isect = get_payload_ptr_0<Intersection>() + optixGetPayload_2(); - isect->t = optixGetRayTmax(); - isect->prim = prim; - isect->object = get_object_id(); - isect->type = kernel_tex_fetch(__prim_type, prim); - + float u = 0.0f, v = 0.0f; if (optixIsTriangleHit()) { const float2 barycentrics = optixGetTriangleBarycentrics(); - isect->u = 1.0f - barycentrics.y - barycentrics.x; - isect->v = barycentrics.x; + u = 1.0f - barycentrics.y - barycentrics.x; + v = barycentrics.x; } # ifdef __HAIR__ else { - const float u = __uint_as_float(optixGetAttribute_0()); - isect->u = u; - isect->v = __uint_as_float(optixGetAttribute_1()); + u = __uint_as_float(optixGetAttribute_0()); + v = __uint_as_float(optixGetAttribute_1()); // Filter out curve endcaps if (u == 0.0f || u == 1.0f) { - return optixIgnoreIntersection(); + ignore_intersection = true; } } # endif + int num_hits = optixGetPayload_2(); + int record_index = num_hits; + const int max_hits = optixGetPayload_3(); + + if (!ignore_intersection) { + optixSetPayload_2(num_hits + 1); + } + + Intersection *const isect_array = get_payload_ptr_0<Intersection>(); + # ifdef __TRANSPARENT_SHADOWS__ - // Detect if this surface has a shader with transparent shadows - if (!shader_transparent_shadow(NULL, isect) || optixGetPayload_2() >= 
optixGetPayload_3()) { + if (num_hits >= max_hits) { + /* If maximum number of hits reached, find a hit to replace. */ + const int num_recorded_hits = min(max_hits, num_hits); + float max_recorded_t = isect_array[0].t; + int max_recorded_hit = 0; + + for (int i = 1; i < num_recorded_hits; i++) { + if (isect_array[i].t > max_recorded_t) { + max_recorded_t = isect_array[i].t; + max_recorded_hit = i; + } + } + + if (optixGetRayTmax() >= max_recorded_t) { + /* Accept hit, so that OptiX won't consider any more hits beyond the distance of the current + * hit anymore. */ + return; + } + + record_index = max_recorded_hit; + } # endif - // This is an opaque hit or the hit limit has been reached, abort traversal - optixSetPayload_5(true); - return optixTerminateRay(); + + if (!ignore_intersection) { + Intersection *const isect = isect_array + record_index; + isect->u = u; + isect->v = v; + isect->t = optixGetRayTmax(); + isect->prim = prim; + isect->object = get_object_id(); + isect->type = kernel_tex_fetch(__prim_type, prim); + +# ifdef __TRANSPARENT_SHADOWS__ + // Detect if this surface has a shader with transparent shadows + if (!shader_transparent_shadow(NULL, isect) || max_hits == 0) { +# endif + // If no transparent shadows, all light is blocked and we can stop immediately + optixSetPayload_5(true); + return optixTerminateRay(); # ifdef __TRANSPARENT_SHADOWS__ + } +# endif } - optixSetPayload_2(optixGetPayload_2() + 1); // num_hits++ - // Continue tracing optixIgnoreIntersection(); -# endif #endif } @@ -300,7 +327,7 @@ ccl_device_inline void optix_intersection_curve(const uint prim, const uint type if (isect.t != FLT_MAX) isect.t *= len; - if (curve_intersect(NULL, &isect, P, dir, visibility, object, prim, time, type)) { + if (curve_intersect(NULL, &isect, P, dir, isect.t, visibility, object, prim, time, type)) { optixReportIntersection(isect.t / len, type & PRIMITIVE_ALL, __float_as_int(isect.u), // Attribute_0 @@ -317,11 +344,4 @@ extern "C" __global__ void 
__intersection__curve_ribbon() optix_intersection_curve(prim, type); } } - -extern "C" __global__ void __intersection__curve_all() -{ - const uint prim = optixGetPrimitiveIndex(); - const uint type = kernel_tex_fetch(__prim_type, prim); - optix_intersection_curve(prim, type); -} #endif diff --git a/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu b/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu new file mode 100644 index 00000000000..bf787e29eaa --- /dev/null +++ b/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu @@ -0,0 +1,29 @@ +/* + * Copyright 2021, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Copy of the regular kernels with additional shader ray-tracing kernel that takes + * much longer to compile. This is only loaded when needed by the scene. */ + +#include "kernel/device/optix/kernel.cu" +#include "kernel/integrator/integrator_shade_surface.h" + +extern "C" __global__ void __raygen__kernel_optix_integrator_shade_surface_raytrace() +{ + const int global_index = optixGetLaunchIndex().x; + const int path_index = (__params.path_index_array) ? 
__params.path_index_array[global_index] : + global_index; + integrator_shade_surface_raytrace(nullptr, path_index, __params.render_buffer); +} diff --git a/intern/cycles/kernel/filter/filter.h b/intern/cycles/kernel/filter/filter.h deleted file mode 100644 index b067e53a8bf..00000000000 --- a/intern/cycles/kernel/filter/filter.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __FILTER_H__ -#define __FILTER_H__ - -/* CPU Filter Kernel Interface */ - -#include "util/util_types.h" - -#include "kernel/filter/filter_defines.h" - -CCL_NAMESPACE_BEGIN - -#define KERNEL_NAME_JOIN(x, y, z) x##_##y##_##z -#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name) -#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name) - -#define KERNEL_ARCH cpu -#include "kernel/kernels/cpu/filter_cpu.h" - -#define KERNEL_ARCH cpu_sse2 -#include "kernel/kernels/cpu/filter_cpu.h" - -#define KERNEL_ARCH cpu_sse3 -#include "kernel/kernels/cpu/filter_cpu.h" - -#define KERNEL_ARCH cpu_sse41 -#include "kernel/kernels/cpu/filter_cpu.h" - -#define KERNEL_ARCH cpu_avx -#include "kernel/kernels/cpu/filter_cpu.h" - -#define KERNEL_ARCH cpu_avx2 -#include "kernel/kernels/cpu/filter_cpu.h" - -CCL_NAMESPACE_END - -#endif /* __FILTER_H__ */ diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h deleted file mode 100644 
index 1c0ac5e2cb7..00000000000 --- a/intern/cycles/kernel/filter/filter_defines.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __FILTER_DEFINES_H__ -#define __FILTER_DEFINES_H__ - -#define DENOISE_FEATURES 11 -#define TRANSFORM_SIZE (DENOISE_FEATURES * DENOISE_FEATURES) -#define XTWX_SIZE (((DENOISE_FEATURES + 1) * (DENOISE_FEATURES + 2)) / 2) -#define XTWY_SIZE (DENOISE_FEATURES + 1) - -#define DENOISE_MAX_FRAMES 16 - -typedef struct TileInfo { - int offsets[9]; - int strides[9]; - int x[4]; - int y[4]; - int from_render; - int frames[DENOISE_MAX_FRAMES]; - int num_frames; - /* TODO(lukas): CUDA doesn't have uint64_t... 
*/ -#ifdef __KERNEL_OPENCL__ - ccl_global float *buffers[9]; -#else - long long int buffers[9]; -#endif -} TileInfo; - -#ifdef __KERNEL_OPENCL__ -# define CCL_FILTER_TILE_INFO \ - ccl_global TileInfo *tile_info, ccl_global float *tile_buffer_1, \ - ccl_global float *tile_buffer_2, ccl_global float *tile_buffer_3, \ - ccl_global float *tile_buffer_4, ccl_global float *tile_buffer_5, \ - ccl_global float *tile_buffer_6, ccl_global float *tile_buffer_7, \ - ccl_global float *tile_buffer_8, ccl_global float *tile_buffer_9 -# define CCL_FILTER_TILE_INFO_ARG \ - tile_info, tile_buffer_1, tile_buffer_2, tile_buffer_3, tile_buffer_4, tile_buffer_5, \ - tile_buffer_6, tile_buffer_7, tile_buffer_8, tile_buffer_9 -# define ccl_get_tile_buffer(id) \ - (id == 0 ? tile_buffer_1 : \ - id == 1 ? tile_buffer_2 : \ - id == 2 ? tile_buffer_3 : \ - id == 3 ? tile_buffer_4 : \ - id == 4 ? tile_buffer_5 : \ - id == 5 ? tile_buffer_6 : \ - id == 6 ? tile_buffer_7 : \ - id == 7 ? tile_buffer_8 : \ - tile_buffer_9) -#else -# ifdef __KERNEL_CUDA__ -# define CCL_FILTER_TILE_INFO ccl_global TileInfo *tile_info -# else -# define CCL_FILTER_TILE_INFO TileInfo *tile_info -# endif -# define ccl_get_tile_buffer(id) (tile_info->buffers[id]) -#endif - -#endif /* __FILTER_DEFINES_H__*/ diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h deleted file mode 100644 index 8a2af957146..00000000000 --- a/intern/cycles/kernel/filter/filter_features.h +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -#define ccl_get_feature(buffer, pass) (buffer)[(pass)*pass_stride] - -/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).+ * pixel_buffer always - * points to the current pixel in the first pass. Repeat the loop for every secondary frame if - * there are any. */ -#define FOR_PIXEL_WINDOW \ - for (int frame = 0; frame < tile_info->num_frames; frame++) { \ - pixel.z = tile_info->frames[frame]; \ - pixel_buffer = buffer + (low.y - rect.y) * buffer_w + (low.x - rect.x) + \ - frame * frame_stride; \ - for (pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ - for (pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) { - -#define END_FOR_PIXEL_WINDOW \ - } \ - pixel_buffer += buffer_w - (high.x - low.x); \ - } \ - } - -ccl_device_inline void filter_get_features(int3 pixel, - const ccl_global float *ccl_restrict buffer, - float *features, - bool use_time, - const float *ccl_restrict mean, - int pass_stride) -{ - features[0] = pixel.x; - features[1] = pixel.y; - features[2] = fabsf(ccl_get_feature(buffer, 0)); - features[3] = ccl_get_feature(buffer, 1); - features[4] = ccl_get_feature(buffer, 2); - features[5] = ccl_get_feature(buffer, 3); - features[6] = ccl_get_feature(buffer, 4); - features[7] = ccl_get_feature(buffer, 5); - features[8] = ccl_get_feature(buffer, 6); - features[9] = ccl_get_feature(buffer, 7); - if (use_time) { - features[10] = pixel.z; - } - if (mean) { - for (int i = 0; i < (use_time ? 
11 : 10); i++) { - features[i] -= mean[i]; - } - } -} - -ccl_device_inline void filter_get_feature_scales(int3 pixel, - const ccl_global float *ccl_restrict buffer, - float *scales, - bool use_time, - const float *ccl_restrict mean, - int pass_stride) -{ - scales[0] = fabsf(pixel.x - mean[0]); - scales[1] = fabsf(pixel.y - mean[1]); - scales[2] = fabsf(fabsf(ccl_get_feature(buffer, 0)) - mean[2]); - scales[3] = len_squared(make_float3(ccl_get_feature(buffer, 1) - mean[3], - ccl_get_feature(buffer, 2) - mean[4], - ccl_get_feature(buffer, 3) - mean[5])); - scales[4] = fabsf(ccl_get_feature(buffer, 4) - mean[6]); - scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7], - ccl_get_feature(buffer, 6) - mean[8], - ccl_get_feature(buffer, 7) - mean[9])); - if (use_time) { - scales[6] = fabsf(pixel.z - mean[10]); - } -} - -ccl_device_inline void filter_calculate_scale(float *scale, bool use_time) -{ - scale[0] = 1.0f / max(scale[0], 0.01f); - scale[1] = 1.0f / max(scale[1], 0.01f); - scale[2] = 1.0f / max(scale[2], 0.01f); - if (use_time) { - scale[10] = 1.0f / max(scale[6], 0.01f); - } - scale[6] = 1.0f / max(scale[4], 0.01f); - scale[7] = scale[8] = scale[9] = 1.0f / max(sqrtf(scale[5]), 0.01f); - scale[3] = scale[4] = scale[5] = 1.0f / max(sqrtf(scale[3]), 0.01f); -} - -ccl_device_inline float3 filter_get_color(const ccl_global float *ccl_restrict buffer, - int pass_stride) -{ - return make_float3( - ccl_get_feature(buffer, 8), ccl_get_feature(buffer, 9), ccl_get_feature(buffer, 10)); -} - -ccl_device_inline void design_row_add(float *design_row, - int rank, - const ccl_global float *ccl_restrict transform, - int stride, - int row, - float feature, - int transform_row_stride) -{ - for (int i = 0; i < rank; i++) { - design_row[1 + i] += transform[(row * transform_row_stride + i) * stride] * feature; - } -} - -/* Fill the design row. 
*/ -ccl_device_inline void filter_get_design_row_transform( - int3 p_pixel, - const ccl_global float *ccl_restrict p_buffer, - int3 q_pixel, - const ccl_global float *ccl_restrict q_buffer, - int pass_stride, - int rank, - float *design_row, - const ccl_global float *ccl_restrict transform, - int stride, - bool use_time) -{ - int num_features = use_time ? 11 : 10; - - design_row[0] = 1.0f; - math_vector_zero(design_row + 1, rank); - -#define DESIGN_ROW_ADD(I, F) \ - design_row_add(design_row, rank, transform, stride, I, F, num_features); - DESIGN_ROW_ADD(0, q_pixel.x - p_pixel.x); - DESIGN_ROW_ADD(1, q_pixel.y - p_pixel.y); - DESIGN_ROW_ADD(2, fabsf(ccl_get_feature(q_buffer, 0)) - fabsf(ccl_get_feature(p_buffer, 0))); - DESIGN_ROW_ADD(3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1)); - DESIGN_ROW_ADD(4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2)); - DESIGN_ROW_ADD(5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3)); - DESIGN_ROW_ADD(6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4)); - DESIGN_ROW_ADD(7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5)); - DESIGN_ROW_ADD(8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6)); - DESIGN_ROW_ADD(9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7)); - if (use_time) { - DESIGN_ROW_ADD(10, q_pixel.z - p_pixel.z) - } -#undef DESIGN_ROW_ADD -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h deleted file mode 100644 index 59d4ace2bef..00000000000 --- a/intern/cycles/kernel/filter/filter_features_sse.h +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -#define ccl_get_feature_sse(pass) load_float4(buffer + (pass)*pass_stride) - -/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time. - * pixel_buffer always points to the first of the 4 current pixel in the first pass. - * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set - * for all pixels within the window. Repeat the loop for every secondary frame if there are any. */ -#define FOR_PIXEL_WINDOW_SSE \ - for (int frame = 0; frame < tile_info->num_frames; frame++) { \ - pixel.z = tile_info->frames[frame]; \ - pixel_buffer = buffer + (low.y - rect.y) * buffer_w + (low.x - rect.x) + \ - frame * frame_stride; \ - float4 t4 = make_float4(pixel.z); \ - for (pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ - float4 y4 = make_float4(pixel.y); \ - for (pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \ - float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \ - int4 active_pixels = x4 < make_float4(high.x); - -#define END_FOR_PIXEL_WINDOW_SSE \ - } \ - pixel_buffer += buffer_w - (high.x - low.x); \ - } \ - } - -ccl_device_inline void filter_get_features_sse(float4 x, - float4 y, - float4 t, - int4 active_pixels, - const float *ccl_restrict buffer, - float4 *features, - bool use_time, - const float4 *ccl_restrict mean, - int pass_stride) -{ - int num_features = use_time ? 
11 : 10; - - features[0] = x; - features[1] = y; - features[2] = fabs(ccl_get_feature_sse(0)); - features[3] = ccl_get_feature_sse(1); - features[4] = ccl_get_feature_sse(2); - features[5] = ccl_get_feature_sse(3); - features[6] = ccl_get_feature_sse(4); - features[7] = ccl_get_feature_sse(5); - features[8] = ccl_get_feature_sse(6); - features[9] = ccl_get_feature_sse(7); - if (use_time) { - features[10] = t; - } - - if (mean) { - for (int i = 0; i < num_features; i++) { - features[i] = features[i] - mean[i]; - } - } - for (int i = 0; i < num_features; i++) { - features[i] = mask(active_pixels, features[i]); - } -} - -ccl_device_inline void filter_get_feature_scales_sse(float4 x, - float4 y, - float4 t, - int4 active_pixels, - const float *ccl_restrict buffer, - float4 *scales, - bool use_time, - const float4 *ccl_restrict mean, - int pass_stride) -{ - scales[0] = fabs(x - mean[0]); - scales[1] = fabs(y - mean[1]); - scales[2] = fabs(fabs(ccl_get_feature_sse(0)) - mean[2]); - scales[3] = sqr(ccl_get_feature_sse(1) - mean[3]) + sqr(ccl_get_feature_sse(2) - mean[4]) + - sqr(ccl_get_feature_sse(3) - mean[5]); - scales[4] = fabs(ccl_get_feature_sse(4) - mean[6]); - scales[5] = sqr(ccl_get_feature_sse(5) - mean[7]) + sqr(ccl_get_feature_sse(6) - mean[8]) + - sqr(ccl_get_feature_sse(7) - mean[9]); - if (use_time) { - scales[6] = fabs(t - mean[10]); - } - - for (int i = 0; i < (use_time ? 
7 : 6); i++) - scales[i] = mask(active_pixels, scales[i]); -} - -ccl_device_inline void filter_calculate_scale_sse(float4 *scale, bool use_time) -{ - scale[0] = rcp(max(reduce_max(scale[0]), make_float4(0.01f))); - scale[1] = rcp(max(reduce_max(scale[1]), make_float4(0.01f))); - scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f))); - if (use_time) { - scale[10] = rcp(max(reduce_max(scale[6]), make_float4(0.01f))); - } - scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f))); - scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f))); - scale[3] = scale[4] = scale[5] = rcp(max(reduce_max(sqrt(scale[3])), make_float4(0.01f))); -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_kernel.h b/intern/cycles/kernel/filter/filter_kernel.h deleted file mode 100644 index 2ef03dc0a02..00000000000 --- a/intern/cycles/kernel/filter/filter_kernel.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "util/util_color.h" -#include "util/util_math.h" -#include "util/util_math_fast.h" -#include "util/util_texture.h" - -#include "util/util_atomic.h" -#include "util/util_math_matrix.h" - -#include "kernel/filter/filter_defines.h" - -#include "kernel/filter/filter_features.h" -#ifdef __KERNEL_SSE3__ -# include "kernel/filter/filter_features_sse.h" -#endif - -#include "kernel/filter/filter_prefilter.h" - -#ifdef __KERNEL_GPU__ -# include "kernel/filter/filter_transform_gpu.h" -#else -# ifdef __KERNEL_SSE3__ -# include "kernel/filter/filter_transform_sse.h" -# else -# include "kernel/filter/filter_transform.h" -# endif -#endif - -#include "kernel/filter/filter_reconstruction.h" - -#ifdef __KERNEL_CPU__ -# include "kernel/filter/filter_nlm_cpu.h" -#else -# include "kernel/filter/filter_nlm_gpu.h" -#endif diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h deleted file mode 100644 index 24200c29203..00000000000 --- a/intern/cycles/kernel/filter/filter_nlm_cpu.h +++ /dev/null @@ -1,254 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -CCL_NAMESPACE_BEGIN - -#define load4_a(buf, ofs) (*((float4 *)((buf) + (ofs)))) -#define load4_u(buf, ofs) load_float4((buf) + (ofs)) - -ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, - int dy, - const float *ccl_restrict weight_image, - const float *ccl_restrict variance_image, - const float *ccl_restrict scale_image, - float *difference_image, - int4 rect, - int stride, - int channel_offset, - int frame_offset, - float a, - float k_2) -{ - /* Strides need to be aligned to 16 bytes. */ - kernel_assert((stride % 4) == 0 && (channel_offset % 4) == 0); - - int aligned_lowx = rect.x & (~3); - const int numChannels = (channel_offset > 0) ? 3 : 1; - const float4 channel_fac = make_float4(1.0f / numChannels); - - for (int y = rect.y; y < rect.w; y++) { - int idx_p = y * stride + aligned_lowx; - int idx_q = (y + dy) * stride + aligned_lowx + dx + frame_offset; - for (int x = aligned_lowx; x < rect.z; x += 4, idx_p += 4, idx_q += 4) { - float4 diff = make_float4(0.0f); - float4 scale_fac; - if (scale_image) { - scale_fac = clamp(load4_a(scale_image, idx_p) / load4_u(scale_image, idx_q), - make_float4(0.25f), - make_float4(4.0f)); - } - else { - scale_fac = make_float4(1.0f); - } - for (int c = 0, chan_ofs = 0; c < numChannels; c++, chan_ofs += channel_offset) { - /* idx_p is guaranteed to be aligned, but idx_q isn't. 
*/ - float4 color_p = load4_a(weight_image, idx_p + chan_ofs); - float4 color_q = scale_fac * load4_u(weight_image, idx_q + chan_ofs); - float4 cdiff = color_p - color_q; - float4 var_p = load4_a(variance_image, idx_p + chan_ofs); - float4 var_q = sqr(scale_fac) * load4_u(variance_image, idx_q + chan_ofs); - diff += (cdiff * cdiff - a * (var_p + min(var_p, var_q))) / - (make_float4(1e-8f) + k_2 * (var_p + var_q)); - } - load4_a(difference_image, idx_p) = diff * channel_fac; - } - } -} - -ccl_device_inline void kernel_filter_nlm_blur( - const float *ccl_restrict difference_image, float *out_image, int4 rect, int stride, int f) -{ - int aligned_lowx = round_down(rect.x, 4); - for (int y = rect.y; y < rect.w; y++) { - const int low = max(rect.y, y - f); - const int high = min(rect.w, y + f + 1); - for (int x = aligned_lowx; x < rect.z; x += 4) { - load4_a(out_image, y * stride + x) = make_float4(0.0f); - } - for (int y1 = low; y1 < high; y1++) { - for (int x = aligned_lowx; x < rect.z; x += 4) { - load4_a(out_image, y * stride + x) += load4_a(difference_image, y1 * stride + x); - } - } - float fac = 1.0f / (high - low); - for (int x = aligned_lowx; x < rect.z; x += 4) { - load4_a(out_image, y * stride + x) *= fac; - } - } -} - -ccl_device_inline void nlm_blur_horizontal( - const float *ccl_restrict difference_image, float *out_image, int4 rect, int stride, int f) -{ - int aligned_lowx = round_down(rect.x, 4); - for (int y = rect.y; y < rect.w; y++) { - for (int x = aligned_lowx; x < rect.z; x += 4) { - load4_a(out_image, y * stride + x) = make_float4(0.0f); - } - } - - for (int dx = -f; dx <= f; dx++) { - aligned_lowx = round_down(rect.x - min(0, dx), 4); - int highx = rect.z - max(0, dx); - int4 lowx4 = make_int4(rect.x - min(0, dx)); - int4 highx4 = make_int4(rect.z - max(0, dx)); - for (int y = rect.y; y < rect.w; y++) { - for (int x = aligned_lowx; x < highx; x += 4) { - int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3); - int4 active = (x4 >= lowx4) & (x4 < highx4); 
- - float4 diff = load4_u(difference_image, y * stride + x + dx); - load4_a(out_image, y * stride + x) += mask(active, diff); - } - } - } - - aligned_lowx = round_down(rect.x, 4); - for (int y = rect.y; y < rect.w; y++) { - for (int x = aligned_lowx; x < rect.z; x += 4) { - float4 x4 = make_float4(x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); - float4 low = max(make_float4(rect.x), x4 - make_float4(f)); - float4 high = min(make_float4(rect.z), x4 + make_float4(f + 1)); - load4_a(out_image, y * stride + x) *= rcp(high - low); - } - } -} - -ccl_device_inline void kernel_filter_nlm_calc_weight( - const float *ccl_restrict difference_image, float *out_image, int4 rect, int stride, int f) -{ - nlm_blur_horizontal(difference_image, out_image, rect, stride, f); - - int aligned_lowx = round_down(rect.x, 4); - for (int y = rect.y; y < rect.w; y++) { - for (int x = aligned_lowx; x < rect.z; x += 4) { - load4_a(out_image, y * stride + x) = fast_expf4( - -max(load4_a(out_image, y * stride + x), make_float4(0.0f))); - } - } -} - -ccl_device_inline void kernel_filter_nlm_update_output(int dx, - int dy, - const float *ccl_restrict difference_image, - const float *ccl_restrict image, - float *temp_image, - float *out_image, - float *accum_image, - int4 rect, - int channel_offset, - int stride, - int f) -{ - nlm_blur_horizontal(difference_image, temp_image, rect, stride, f); - - int aligned_lowx = round_down(rect.x, 4); - for (int y = rect.y; y < rect.w; y++) { - for (int x = aligned_lowx; x < rect.z; x += 4) { - int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3); - int4 active = (x4 >= make_int4(rect.x)) & (x4 < make_int4(rect.z)); - - int idx_p = y * stride + x, idx_q = (y + dy) * stride + (x + dx); - - float4 weight = load4_a(temp_image, idx_p); - load4_a(accum_image, idx_p) += mask(active, weight); - - float4 val = load4_u(image, idx_q); - if (channel_offset) { - val += load4_u(image, idx_q + channel_offset); - val += load4_u(image, idx_q + 2 * channel_offset); - val *= 1.0f / 3.0f; - 
} - - load4_a(out_image, idx_p) += mask(active, weight * val); - } - } -} - -ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, - int dy, - int t, - const float *ccl_restrict - difference_image, - const float *ccl_restrict buffer, - float *transform, - int *rank, - float *XtWX, - float3 *XtWY, - int4 rect, - int4 filter_window, - int stride, - int f, - int pass_stride, - int frame_offset, - bool use_time) -{ - int4 clip_area = rect_clip(rect, filter_window); - /* fy and fy are in filter-window-relative coordinates, - * while x and y are in feature-window-relative coordinates. */ - for (int y = clip_area.y; y < clip_area.w; y++) { - for (int x = clip_area.x; x < clip_area.z; x++) { - const int low = max(rect.x, x - f); - const int high = min(rect.z, x + f + 1); - float sum = 0.0f; - for (int x1 = low; x1 < high; x1++) { - sum += difference_image[y * stride + x1]; - } - float weight = sum * (1.0f / (high - low)); - - int storage_ofs = coord_to_local_index(filter_window, x, y); - float *l_transform = transform + storage_ofs * TRANSFORM_SIZE; - float *l_XtWX = XtWX + storage_ofs * XTWX_SIZE; - float3 *l_XtWY = XtWY + storage_ofs * XTWY_SIZE; - int *l_rank = rank + storage_ofs; - - kernel_filter_construct_gramian(x, - y, - 1, - dx, - dy, - t, - stride, - pass_stride, - frame_offset, - use_time, - buffer, - l_transform, - l_rank, - weight, - l_XtWX, - l_XtWY, - 0); - } - } -} - -ccl_device_inline void kernel_filter_nlm_normalize(float *out_image, - const float *ccl_restrict accum_image, - int4 rect, - int w) -{ - for (int y = rect.y; y < rect.w; y++) { - for (int x = rect.x; x < rect.z; x++) { - out_image[y * w + x] /= accum_image[y * w + x]; - } - } -} - -#undef load4_a -#undef load4_u - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h deleted file mode 100644 index 650c743f34f..00000000000 --- a/intern/cycles/kernel/filter/filter_nlm_gpu.h +++ /dev/null @@ -1,255 +0,0 @@ -/* - * 
Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -/* Determines pixel coordinates and offset for the current thread. - * Returns whether the thread should do any work. - * - * All coordinates are relative to the denoising buffer! - * - * Window is the rect that should be processed. - * co is filled with (x, y, dx, dy). - */ -ccl_device_inline bool get_nlm_coords_window( - int w, int h, int r, int stride, int4 *rect, int4 *co, int *ofs, int4 window) -{ - /* Determine the pixel offset that this thread should apply. */ - int s = 2 * r + 1; - int si = ccl_global_id(1); - int sx = si % s; - int sy = si / s; - if (sy >= s) { - return false; - } - - /* Pixels still need to lie inside the denoising buffer after applying the offset, - * so determine the area for which this is the case. */ - int dx = sx - r; - int dy = sy - r; - - *rect = make_int4(max(0, -dx), max(0, -dy), w - max(0, dx), h - max(0, dy)); - - /* Find the intersection of the area that we want to process (window) and the area - * that can be processed (rect) to get the final area for this offset. */ - int4 clip_area = rect_clip(window, *rect); - - /* If the radius is larger than one of the sides of the window, - * there will be shifts for which there is no usable pixel at all. */ - if (!rect_is_valid(clip_area)) { - return false; - } - - /* Map the linear thread index to pixels inside the clip area. 
*/ - int x, y; - if (!local_index_to_coord(clip_area, ccl_global_id(0), &x, &y)) { - return false; - } - - *co = make_int4(x, y, dx, dy); - - *ofs = (sy * s + sx) * stride; - - return true; -} - -ccl_device_inline bool get_nlm_coords( - int w, int h, int r, int stride, int4 *rect, int4 *co, int *ofs) -{ - return get_nlm_coords_window(w, h, r, stride, rect, co, ofs, make_int4(0, 0, w, h)); -} - -ccl_device_inline void kernel_filter_nlm_calc_difference( - int x, - int y, - int dx, - int dy, - const ccl_global float *ccl_restrict weight_image, - const ccl_global float *ccl_restrict variance_image, - const ccl_global float *ccl_restrict scale_image, - ccl_global float *difference_image, - int4 rect, - int stride, - int channel_offset, - int frame_offset, - float a, - float k_2) -{ - int idx_p = y * stride + x, idx_q = (y + dy) * stride + (x + dx) + frame_offset; - int numChannels = channel_offset ? 3 : 1; - - float diff = 0.0f; - float scale_fac = 1.0f; - if (scale_image) { - scale_fac = clamp(scale_image[idx_p] / scale_image[idx_q], 0.25f, 4.0f); - } - - for (int c = 0; c < numChannels; c++, idx_p += channel_offset, idx_q += channel_offset) { - float cdiff = weight_image[idx_p] - scale_fac * weight_image[idx_q]; - float pvar = variance_image[idx_p]; - float qvar = sqr(scale_fac) * variance_image[idx_q]; - diff += (cdiff * cdiff - a * (pvar + min(pvar, qvar))) / (1e-8f + k_2 * (pvar + qvar)); - } - if (numChannels > 1) { - diff *= 1.0f / numChannels; - } - difference_image[y * stride + x] = diff; -} - -ccl_device_inline void kernel_filter_nlm_blur(int x, - int y, - const ccl_global float *ccl_restrict - difference_image, - ccl_global float *out_image, - int4 rect, - int stride, - int f) -{ - float sum = 0.0f; - const int low = max(rect.y, y - f); - const int high = min(rect.w, y + f + 1); - for (int y1 = low; y1 < high; y1++) { - sum += difference_image[y1 * stride + x]; - } - sum *= 1.0f / (high - low); - out_image[y * stride + x] = sum; -} - -ccl_device_inline void 
kernel_filter_nlm_calc_weight(int x, - int y, - const ccl_global float *ccl_restrict - difference_image, - ccl_global float *out_image, - int4 rect, - int stride, - int f) -{ - float sum = 0.0f; - const int low = max(rect.x, x - f); - const int high = min(rect.z, x + f + 1); - for (int x1 = low; x1 < high; x1++) { - sum += difference_image[y * stride + x1]; - } - sum *= 1.0f / (high - low); - out_image[y * stride + x] = fast_expf(-max(sum, 0.0f)); -} - -ccl_device_inline void kernel_filter_nlm_update_output(int x, - int y, - int dx, - int dy, - const ccl_global float *ccl_restrict - difference_image, - const ccl_global float *ccl_restrict image, - ccl_global float *out_image, - ccl_global float *accum_image, - int4 rect, - int channel_offset, - int stride, - int f) -{ - float sum = 0.0f; - const int low = max(rect.x, x - f); - const int high = min(rect.z, x + f + 1); - for (int x1 = low; x1 < high; x1++) { - sum += difference_image[y * stride + x1]; - } - sum *= 1.0f / (high - low); - - int idx_p = y * stride + x, idx_q = (y + dy) * stride + (x + dx); - if (out_image) { - atomic_add_and_fetch_float(accum_image + idx_p, sum); - - float val = image[idx_q]; - if (channel_offset) { - val += image[idx_q + channel_offset]; - val += image[idx_q + 2 * channel_offset]; - val *= 1.0f / 3.0f; - } - atomic_add_and_fetch_float(out_image + idx_p, sum * val); - } - else { - accum_image[idx_p] = sum; - } -} - -ccl_device_inline void kernel_filter_nlm_construct_gramian( - int x, - int y, - int dx, - int dy, - int t, - const ccl_global float *ccl_restrict difference_image, - const ccl_global float *ccl_restrict buffer, - const ccl_global float *ccl_restrict transform, - ccl_global int *rank, - ccl_global float *XtWX, - ccl_global float3 *XtWY, - int4 rect, - int4 filter_window, - int stride, - int f, - int pass_stride, - int frame_offset, - bool use_time, - int localIdx) -{ - const int low = max(rect.x, x - f); - const int high = min(rect.z, x + f + 1); - float sum = 0.0f; - for 
(int x1 = low; x1 < high; x1++) { - sum += difference_image[y * stride + x1]; - } - float weight = sum * (1.0f / (high - low)); - - /* Reconstruction data is only stored for pixels inside the filter window, - * so compute the pixels's index in there. */ - int storage_ofs = coord_to_local_index(filter_window, x, y); - transform += storage_ofs; - rank += storage_ofs; - XtWX += storage_ofs; - XtWY += storage_ofs; - - kernel_filter_construct_gramian(x, - y, - rect_size(filter_window), - dx, - dy, - t, - stride, - pass_stride, - frame_offset, - use_time, - buffer, - transform, - rank, - weight, - XtWX, - XtWY, - localIdx); -} - -ccl_device_inline void kernel_filter_nlm_normalize(int x, - int y, - ccl_global float *out_image, - const ccl_global float *ccl_restrict - accum_image, - int stride) -{ - out_image[y * stride + x] /= accum_image[y * stride + x]; -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h deleted file mode 100644 index 97cecba190e..00000000000 --- a/intern/cycles/kernel/filter/filter_prefilter.h +++ /dev/null @@ -1,303 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -/** - * First step of the shadow prefiltering, performs the shadow division and stores all data - * in a nice and easy rectangular array that can be passed to the NLM filter. 
- * - * Calculates: - * \param unfiltered: Contains the two half images of the shadow feature pass - * \param sampleVariance: The sample-based variance calculated in the kernel. - * Note: This calculation is biased in general, - * and especially here since the variance of the ratio can only be approximated. - * \param sampleVarianceV: Variance of the sample variance estimation, quite noisy - * (since it's essentially the buffer variance of the two variance halves) - * \param bufferVariance: The buffer-based variance of the shadow feature. - * Unbiased, but quite noisy. - */ -ccl_device void kernel_filter_divide_shadow(int sample, - CCL_FILTER_TILE_INFO, - int x, - int y, - ccl_global float *unfilteredA, - ccl_global float *unfilteredB, - ccl_global float *sampleVariance, - ccl_global float *sampleVarianceV, - ccl_global float *bufferVariance, - int4 rect, - int buffer_pass_stride, - int buffer_denoising_offset) -{ - int xtile = (x < tile_info->x[1]) ? 0 : ((x < tile_info->x[2]) ? 1 : 2); - int ytile = (y < tile_info->y[1]) ? 0 : ((y < tile_info->y[2]) ? 1 : 2); - int tile = ytile * 3 + xtile; - - int offset = tile_info->offsets[tile]; - int stride = tile_info->strides[tile]; - const ccl_global float *ccl_restrict center_buffer = (ccl_global float *)ccl_get_tile_buffer( - tile); - center_buffer += (y * stride + x + offset) * buffer_pass_stride; - center_buffer += buffer_denoising_offset + 14; - - int buffer_w = align_up(rect.z - rect.x, 4); - int idx = (y - rect.y) * buffer_w + (x - rect.x); - unfilteredA[idx] = center_buffer[1] / max(center_buffer[0], 1e-7f); - unfilteredB[idx] = center_buffer[4] / max(center_buffer[3], 1e-7f); - - float varA = center_buffer[2]; - float varB = center_buffer[5]; - int odd_sample = (sample + 1) / 2; - int even_sample = sample / 2; - - /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance - * update does not work efficiently with atomics in the kernel. 
*/ - varA = max(0.0f, varA - unfilteredA[idx] * unfilteredA[idx] * odd_sample); - varB = max(0.0f, varB - unfilteredB[idx] * unfilteredB[idx] * even_sample); - - varA /= max(odd_sample - 1, 1); - varB /= max(even_sample - 1, 1); - - sampleVariance[idx] = 0.5f * (varA + varB) / sample; - sampleVarianceV[idx] = 0.5f * (varA - varB) * (varA - varB) / (sample * sample); - bufferVariance[idx] = 0.5f * (unfilteredA[idx] - unfilteredB[idx]) * - (unfilteredA[idx] - unfilteredB[idx]); -} - -/* Load a regular feature from the render buffers into the denoise buffer. - * Parameters: - * - sample: The sample amount in the buffer, used to normalize the buffer. - * - m_offset, v_offset: Render Buffer Pass offsets of mean and variance of the feature. - * - x, y: Current pixel - * - mean, variance: Target denoise buffers. - * - rect: The prefilter area (lower pixels inclusive, upper pixels exclusive). - */ -ccl_device void kernel_filter_get_feature(int sample, - CCL_FILTER_TILE_INFO, - int m_offset, - int v_offset, - int x, - int y, - ccl_global float *mean, - ccl_global float *variance, - float scale, - int4 rect, - int buffer_pass_stride, - int buffer_denoising_offset) -{ - int xtile = (x < tile_info->x[1]) ? 0 : ((x < tile_info->x[2]) ? 1 : 2); - int ytile = (y < tile_info->y[1]) ? 0 : ((y < tile_info->y[2]) ? 1 : 2); - int tile = ytile * 3 + xtile; - ccl_global float *center_buffer = ((ccl_global float *)ccl_get_tile_buffer(tile)) + - (tile_info->offsets[tile] + y * tile_info->strides[tile] + x) * - buffer_pass_stride + - buffer_denoising_offset; - - int buffer_w = align_up(rect.z - rect.x, 4); - int idx = (y - rect.y) * buffer_w + (x - rect.x); - - float val = scale * center_buffer[m_offset]; - mean[idx] = val; - - if (v_offset >= 0) { - if (sample > 1) { - /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance - * update does not work efficiently with atomics in the kernel. 
*/ - variance[idx] = max( - 0.0f, (center_buffer[v_offset] - val * val * sample) / (sample * (sample - 1))); - } - else { - /* Can't compute variance with single sample, just set it very high. */ - variance[idx] = 1e10f; - } - } -} - -ccl_device void kernel_filter_write_feature(int sample, - int x, - int y, - int4 buffer_params, - ccl_global float *from, - ccl_global float *buffer, - int out_offset, - int4 rect) -{ - ccl_global float *combined_buffer = buffer + (y * buffer_params.y + x + buffer_params.x) * - buffer_params.z; - - int buffer_w = align_up(rect.z - rect.x, 4); - int idx = (y - rect.y) * buffer_w + (x - rect.x); - - combined_buffer[out_offset] = from[idx]; -} - -#define GET_COLOR(image) \ - make_float3(image[idx], image[idx + pass_stride], image[idx + 2 * pass_stride]) -#define SET_COLOR(image, color) \ - image[idx] = color.x; \ - image[idx + pass_stride] = color.y; \ - image[idx + 2 * pass_stride] = color.z - -ccl_device void kernel_filter_detect_outliers(int x, - int y, - ccl_global float *in, - ccl_global float *variance_out, - ccl_global float *depth, - ccl_global float *image_out, - int4 rect, - int pass_stride) -{ - int buffer_w = align_up(rect.z - rect.x, 4); - - ccl_global float *image_in = in; - ccl_global float *variance_in = in + 3 * pass_stride; - - int n = 0; - float values[25]; - float pixel_variance, max_variance = 0.0f; - for (int y1 = max(y - 2, rect.y); y1 < min(y + 3, rect.w); y1++) { - for (int x1 = max(x - 2, rect.x); x1 < min(x + 3, rect.z); x1++) { - int idx = (y1 - rect.y) * buffer_w + (x1 - rect.x); - float3 color = GET_COLOR(image_in); - color = max(color, make_float3(0.0f, 0.0f, 0.0f)); - float L = average(color); - - /* Find the position of L. */ - int i; - for (i = 0; i < n; i++) { - if (values[i] > L) - break; - } - /* Make space for L by shifting all following values to the right. */ - for (int j = n; j > i; j--) { - values[j] = values[j - 1]; - } - /* Insert L. 
*/ - values[i] = L; - n++; - - float3 pixel_var = GET_COLOR(variance_in); - float var = average(pixel_var); - if ((x1 == x) && (y1 == y)) { - pixel_variance = (pixel_var.x < 0.0f || pixel_var.y < 0.0f || pixel_var.z < 0.0f) ? -1.0f : - var; - } - else { - max_variance = max(max_variance, var); - } - } - } - - max_variance += 1e-4f; - - int idx = (y - rect.y) * buffer_w + (x - rect.x); - - float3 color = GET_COLOR(image_in); - float3 variance = GET_COLOR(variance_in); - color = max(color, make_float3(0.0f, 0.0f, 0.0f)); - variance = max(variance, make_float3(0.0f, 0.0f, 0.0f)); - - float L = average(color); - - float ref = 2.0f * values[(int)(n * 0.75f)]; - - /* Slightly offset values to avoid false positives in (almost) black areas. */ - max_variance += 1e-5f; - ref -= 1e-5f; - - if (L > ref) { - /* The pixel appears to be an outlier. - * However, it may just be a legitimate highlight. Therefore, it is checked how likely it is - * that the pixel should actually be at the reference value: If the reference is within the - * 3-sigma interval, the pixel is assumed to be a statistical outlier. Otherwise, it is very - * unlikely that the pixel should be darker, which indicates a legitimate highlight. - */ - - if (pixel_variance < 0.0f || pixel_variance > 9.0f * max_variance) { - depth[idx] = -depth[idx]; - color *= ref / L; - variance = make_float3(max_variance, max_variance, max_variance); - } - else { - float stddev = sqrtf(pixel_variance); - if (L - 3 * stddev < ref) { - /* The pixel is an outlier, so negate the depth value to mark it as one. - * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM - * weights. */ - depth[idx] = -depth[idx]; - float fac = ref / L; - color *= fac; - variance *= sqr(fac); - } - } - } - - /* Apply log(1+x) transform to compress highlights and avoid halos in the denoised results. 
- * Variance is transformed accordingly - the derivative of the transform is 1/(1+x), so we - * scale by the square of that (since we have variance instead of standard deviation). */ - color = color_highlight_compress(color, &variance); - - SET_COLOR(image_out, color); - SET_COLOR(variance_out, variance); -} - -#undef GET_COLOR -#undef SET_COLOR - -/* Combine A/B buffers. - * Calculates the combined mean and the buffer variance. */ -ccl_device void kernel_filter_combine_halves(int x, - int y, - ccl_global float *mean, - ccl_global float *variance, - ccl_global float *a, - ccl_global float *b, - int4 rect, - int r) -{ - int buffer_w = align_up(rect.z - rect.x, 4); - int idx = (y - rect.y) * buffer_w + (x - rect.x); - - if (mean) - mean[idx] = 0.5f * (a[idx] + b[idx]); - if (variance) { - if (r == 0) - variance[idx] = 0.25f * (a[idx] - b[idx]) * (a[idx] - b[idx]); - else { - variance[idx] = 0.0f; - float values[25]; - int numValues = 0; - for (int py = max(y - r, rect.y); py < min(y + r + 1, rect.w); py++) { - for (int px = max(x - r, rect.x); px < min(x + r + 1, rect.z); px++) { - int pidx = (py - rect.y) * buffer_w + (px - rect.x); - values[numValues++] = 0.25f * (a[pidx] - b[pidx]) * (a[pidx] - b[pidx]); - } - } - /* Insertion-sort the variances (fast enough for 25 elements). 
*/ - for (int i = 1; i < numValues; i++) { - float v = values[i]; - int j; - for (j = i - 1; j >= 0 && values[j] > v; j--) - values[j + 1] = values[j]; - values[j + 1] = v; - } - variance[idx] = values[(7 * numValues) / 8]; - } - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h deleted file mode 100644 index 17941689ad5..00000000000 --- a/intern/cycles/kernel/filter/filter_reconstruction.h +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -CCL_NAMESPACE_BEGIN - -ccl_device_inline void kernel_filter_construct_gramian(int x, - int y, - int storage_stride, - int dx, - int dy, - int t, - int buffer_stride, - int pass_stride, - int frame_offset, - bool use_time, - const ccl_global float *ccl_restrict buffer, - const ccl_global float *ccl_restrict - transform, - ccl_global int *rank, - float weight, - ccl_global float *XtWX, - ccl_global float3 *XtWY, - int localIdx) -{ - if (weight < 1e-3f) { - return; - } - - int p_offset = y * buffer_stride + x; - int q_offset = (y + dy) * buffer_stride + (x + dx) + frame_offset; - -#ifdef __KERNEL_GPU__ - const int stride = storage_stride; -#else - const int stride = 1; - (void)storage_stride; -#endif - -#ifdef __KERNEL_CUDA__ - ccl_local float shared_design_row[(DENOISE_FEATURES + 1) * CCL_MAX_LOCAL_SIZE]; - ccl_local_param float *design_row = shared_design_row + localIdx * (DENOISE_FEATURES + 1); -#else - float design_row[DENOISE_FEATURES + 1]; -#endif - - float3 q_color = filter_get_color(buffer + q_offset, pass_stride); - - /* If the pixel was flagged as an outlier during prefiltering, skip it. 
*/ - if (ccl_get_feature(buffer + q_offset, 0) < 0.0f) { - return; - } - - filter_get_design_row_transform(make_int3(x, y, t), - buffer + p_offset, - make_int3(x + dx, y + dy, t), - buffer + q_offset, - pass_stride, - *rank, - design_row, - transform, - stride, - use_time); - -#ifdef __KERNEL_GPU__ - math_trimatrix_add_gramian_strided(XtWX, (*rank) + 1, design_row, weight, stride); - math_vec3_add_strided(XtWY, (*rank) + 1, design_row, weight * q_color, stride); -#else - math_trimatrix_add_gramian(XtWX, (*rank) + 1, design_row, weight); - math_vec3_add(XtWY, (*rank) + 1, design_row, weight * q_color); -#endif -} - -ccl_device_inline void kernel_filter_finalize(int x, - int y, - ccl_global float *buffer, - ccl_global int *rank, - int storage_stride, - ccl_global float *XtWX, - ccl_global float3 *XtWY, - int4 buffer_params, - int sample) -{ -#ifdef __KERNEL_GPU__ - const int stride = storage_stride; -#else - const int stride = 1; - (void)storage_stride; -#endif - - if (XtWX[0] < 1e-3f) { - /* There is not enough information to determine a denoised result. - * As a fallback, keep the original value of the pixel. */ - return; - } - - /* The weighted average of pixel colors (essentially, the NLM-filtered image). - * In case the solution of the linear model fails due to numerical issues or - * returns nonsensical negative values, fall back to this value. */ - float3 mean_color = XtWY[0] / XtWX[0]; - - math_trimatrix_vec3_solve(XtWX, XtWY, (*rank) + 1, stride); - - float3 final_color = XtWY[0]; - if (!isfinite3_safe(final_color) || - (final_color.x < -0.01f || final_color.y < -0.01f || final_color.z < -0.01f)) { - final_color = mean_color; - } - - /* Clamp pixel value to positive values and reverse the highlight compression transform. 
*/ - final_color = color_highlight_uncompress(max(final_color, make_float3(0.0f, 0.0f, 0.0f))); - - ccl_global float *combined_buffer = buffer + (y * buffer_params.y + x + buffer_params.x) * - buffer_params.z; - if (buffer_params.w >= 0) { - final_color *= sample; - if (buffer_params.w > 0) { - final_color.x += combined_buffer[buffer_params.w + 0]; - final_color.y += combined_buffer[buffer_params.w + 1]; - final_color.z += combined_buffer[buffer_params.w + 2]; - } - } - combined_buffer[0] = final_color.x; - combined_buffer[1] = final_color.y; - combined_buffer[2] = final_color.z; -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_transform.h b/intern/cycles/kernel/filter/filter_transform.h deleted file mode 100644 index 880a661214e..00000000000 --- a/intern/cycles/kernel/filter/filter_transform.h +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer, - CCL_FILTER_TILE_INFO, - int x, - int y, - int4 rect, - int pass_stride, - int frame_stride, - bool use_time, - float *transform, - int *rank, - int radius, - float pca_threshold) -{ - int buffer_w = align_up(rect.z - rect.x, 4); - - float features[DENOISE_FEATURES]; - - const float *ccl_restrict pixel_buffer; - int3 pixel; - - int num_features = use_time ? 11 : 10; - - /* === Calculate denoising window. 
=== */ - int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius)); - int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1)); - int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames; - - /* === Shift feature passes to have mean 0. === */ - float feature_means[DENOISE_FEATURES]; - math_vector_zero(feature_means, num_features); - FOR_PIXEL_WINDOW - { - filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride); - math_vector_add(feature_means, features, num_features); - } - END_FOR_PIXEL_WINDOW - - math_vector_scale(feature_means, 1.0f / num_pixels, num_features); - - /* === Scale the shifted feature passes to a range of [-1; 1] === - * Will be baked into the transform later. */ - float feature_scale[DENOISE_FEATURES]; - math_vector_zero(feature_scale, num_features); - - FOR_PIXEL_WINDOW - { - filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride); - math_vector_max(feature_scale, features, num_features); - } - END_FOR_PIXEL_WINDOW - - filter_calculate_scale(feature_scale, use_time); - - /* === Generate the feature transformation. === - * This transformation maps the num_features-dimensional feature space to a reduced feature - * (r-feature) space which generally has fewer dimensions. - * This mainly helps to prevent over-fitting. */ - float feature_matrix[DENOISE_FEATURES * DENOISE_FEATURES]; - math_matrix_zero(feature_matrix, num_features); - FOR_PIXEL_WINDOW - { - filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride); - math_vector_mul(features, feature_scale, num_features); - math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f); - } - END_FOR_PIXEL_WINDOW - - math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1); - *rank = 0; - /* Prevent over-fitting when a small window is used. 
*/ - int max_rank = min(num_features, num_pixels / 3); - if (pca_threshold < 0.0f) { - float threshold_energy = 0.0f; - for (int i = 0; i < num_features; i++) { - threshold_energy += feature_matrix[i * num_features + i]; - } - threshold_energy *= 1.0f - (-pca_threshold); - - float reduced_energy = 0.0f; - for (int i = 0; i < max_rank; i++, (*rank)++) { - if (i >= 2 && reduced_energy >= threshold_energy) - break; - float s = feature_matrix[i * num_features + i]; - reduced_energy += s; - } - } - else { - for (int i = 0; i < max_rank; i++, (*rank)++) { - float s = feature_matrix[i * num_features + i]; - if (i >= 2 && sqrtf(s) < pca_threshold) - break; - } - } - - /* Bake the feature scaling into the transformation matrix. */ - for (int i = 0; i < (*rank); i++) { - math_vector_mul(transform + i * num_features, feature_scale, num_features); - } - math_matrix_transpose(transform, num_features, 1); -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h deleted file mode 100644 index ec258a5212a..00000000000 --- a/intern/cycles/kernel/filter/filter_transform_gpu.h +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -CCL_NAMESPACE_BEGIN - -ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_restrict buffer, - CCL_FILTER_TILE_INFO, - int x, - int y, - int4 rect, - int pass_stride, - int frame_stride, - bool use_time, - ccl_global float *transform, - ccl_global int *rank, - int radius, - float pca_threshold, - int transform_stride, - int localIdx) -{ - int buffer_w = align_up(rect.z - rect.x, 4); - -#ifdef __KERNEL_CUDA__ - ccl_local float shared_features[DENOISE_FEATURES * CCL_MAX_LOCAL_SIZE]; - ccl_local_param float *features = shared_features + localIdx * DENOISE_FEATURES; -#else - float features[DENOISE_FEATURES]; -#endif - - int num_features = use_time ? 11 : 10; - - /* === Calculate denoising window. === */ - int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius)); - int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1)); - int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames; - const ccl_global float *ccl_restrict pixel_buffer; - int3 pixel; - - /* === Shift feature passes to have mean 0. === */ - float feature_means[DENOISE_FEATURES]; - math_vector_zero(feature_means, num_features); - FOR_PIXEL_WINDOW - { - filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride); - math_vector_add(feature_means, features, num_features); - } - END_FOR_PIXEL_WINDOW - - math_vector_scale(feature_means, 1.0f / num_pixels, num_features); - - /* === Scale the shifted feature passes to a range of [-1; 1] === - * Will be baked into the transform later. */ - float feature_scale[DENOISE_FEATURES]; - math_vector_zero(feature_scale, num_features); - - FOR_PIXEL_WINDOW - { - filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride); - math_vector_max(feature_scale, features, num_features); - } - END_FOR_PIXEL_WINDOW - - filter_calculate_scale(feature_scale, use_time); - - /* === Generate the feature transformation. 
=== - * This transformation maps the num_features-dimensional feature space to a reduced feature - * (r-feature) space which generally has fewer dimensions. - * This mainly helps to prevent over-fitting. */ - float feature_matrix[DENOISE_FEATURES * DENOISE_FEATURES]; - math_matrix_zero(feature_matrix, num_features); - FOR_PIXEL_WINDOW - { - filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride); - math_vector_mul(features, feature_scale, num_features); - math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f); - } - END_FOR_PIXEL_WINDOW - - math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, transform_stride); - *rank = 0; - /* Prevent over-fitting when a small window is used. */ - int max_rank = min(num_features, num_pixels / 3); - if (pca_threshold < 0.0f) { - float threshold_energy = 0.0f; - for (int i = 0; i < num_features; i++) { - threshold_energy += feature_matrix[i * num_features + i]; - } - threshold_energy *= 1.0f - (-pca_threshold); - - float reduced_energy = 0.0f; - for (int i = 0; i < max_rank; i++, (*rank)++) { - if (i >= 2 && reduced_energy >= threshold_energy) - break; - float s = feature_matrix[i * num_features + i]; - reduced_energy += s; - } - } - else { - for (int i = 0; i < max_rank; i++, (*rank)++) { - float s = feature_matrix[i * num_features + i]; - if (i >= 2 && sqrtf(s) < pca_threshold) - break; - } - } - - math_matrix_transpose(transform, num_features, transform_stride); - - /* Bake the feature scaling into the transformation matrix. 
*/ - for (int i = 0; i < num_features; i++) { - for (int j = 0; j < (*rank); j++) { - transform[(i * num_features + j) * transform_stride] *= feature_scale[i]; - } - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h deleted file mode 100644 index 0304d990f9f..00000000000 --- a/intern/cycles/kernel/filter/filter_transform_sse.h +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer, - CCL_FILTER_TILE_INFO, - int x, - int y, - int4 rect, - int pass_stride, - int frame_stride, - bool use_time, - float *transform, - int *rank, - int radius, - float pca_threshold) -{ - int buffer_w = align_up(rect.z - rect.x, 4); - - float4 features[DENOISE_FEATURES]; - const float *ccl_restrict pixel_buffer; - int3 pixel; - - int num_features = use_time ? 11 : 10; - - /* === Calculate denoising window. === */ - int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius)); - int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1)); - int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames; - - /* === Shift feature passes to have mean 0. 
=== */ - float4 feature_means[DENOISE_FEATURES]; - math_vector_zero_sse(feature_means, num_features); - FOR_PIXEL_WINDOW_SSE - { - filter_get_features_sse( - x4, y4, t4, active_pixels, pixel_buffer, features, use_time, NULL, pass_stride); - math_vector_add_sse(feature_means, num_features, features); - } - END_FOR_PIXEL_WINDOW_SSE - - float4 pixel_scale = make_float4(1.0f / num_pixels); - for (int i = 0; i < num_features; i++) { - feature_means[i] = reduce_add(feature_means[i]) * pixel_scale; - } - - /* === Scale the shifted feature passes to a range of [-1; 1] === - * Will be baked into the transform later. */ - float4 feature_scale[DENOISE_FEATURES]; - math_vector_zero_sse(feature_scale, num_features); - FOR_PIXEL_WINDOW_SSE - { - filter_get_feature_scales_sse( - x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride); - math_vector_max_sse(feature_scale, features, num_features); - } - END_FOR_PIXEL_WINDOW_SSE - - filter_calculate_scale_sse(feature_scale, use_time); - - /* === Generate the feature transformation. === - * This transformation maps the num_features-dimensional feature space to a reduced feature - * (r-feature) space which generally has fewer dimensions. - * This mainly helps to prevent over-fitting. 
*/ - float4 feature_matrix_sse[DENOISE_FEATURES * DENOISE_FEATURES]; - math_matrix_zero_sse(feature_matrix_sse, num_features); - FOR_PIXEL_WINDOW_SSE - { - filter_get_features_sse( - x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride); - math_vector_mul_sse(features, num_features, feature_scale); - math_matrix_add_gramian_sse(feature_matrix_sse, num_features, features, make_float4(1.0f)); - } - END_FOR_PIXEL_WINDOW_SSE - - float feature_matrix[DENOISE_FEATURES * DENOISE_FEATURES]; - math_matrix_hsum(feature_matrix, num_features, feature_matrix_sse); - - math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1); - - *rank = 0; - /* Prevent over-fitting when a small window is used. */ - int max_rank = min(num_features, num_pixels / 3); - if (pca_threshold < 0.0f) { - float threshold_energy = 0.0f; - for (int i = 0; i < num_features; i++) { - threshold_energy += feature_matrix[i * num_features + i]; - } - threshold_energy *= 1.0f - (-pca_threshold); - - float reduced_energy = 0.0f; - for (int i = 0; i < max_rank; i++, (*rank)++) { - if (i >= 2 && reduced_energy >= threshold_energy) - break; - float s = feature_matrix[i * num_features + i]; - reduced_energy += s; - } - } - else { - for (int i = 0; i < max_rank; i++, (*rank)++) { - float s = feature_matrix[i * num_features + i]; - if (i >= 2 && sqrtf(s) < pca_threshold) - break; - } - } - - math_matrix_transpose(transform, num_features, 1); - - /* Bake the feature scaling into the transformation matrix. */ - for (int i = 0; i < num_features; i++) { - math_vector_scale(transform + i * num_features, feature_scale[i][0], *rank); - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/geom/geom.h b/intern/cycles/kernel/geom/geom.h index 5ff4d5f7053..4de824cc277 100644 --- a/intern/cycles/kernel/geom/geom.h +++ b/intern/cycles/kernel/geom/geom.h @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#pragma once + // clang-format off #include "kernel/geom/geom_attribute.h" #include "kernel/geom/geom_object.h" @@ -31,4 +33,5 @@ #include "kernel/geom/geom_curve_intersect.h" #include "kernel/geom/geom_volume.h" #include "kernel/geom/geom_primitive.h" +#include "kernel/geom/geom_shader_data.h" // clang-format on diff --git a/intern/cycles/kernel/geom/geom_attribute.h b/intern/cycles/kernel/geom/geom_attribute.h index b37797ac21b..9532a21fec7 100644 --- a/intern/cycles/kernel/geom/geom_attribute.h +++ b/intern/cycles/kernel/geom/geom_attribute.h @@ -14,6 +14,8 @@ * limitations under the License. */ +#pragma once + CCL_NAMESPACE_BEGIN /* Attributes @@ -25,9 +27,9 @@ CCL_NAMESPACE_BEGIN * Lookup of attributes is different between OSL and SVM, as OSL is ustring * based while for SVM we use integer ids. */ -ccl_device_inline uint subd_triangle_patch(KernelGlobals *kg, const ShaderData *sd); +ccl_device_inline uint subd_triangle_patch(const KernelGlobals *kg, const ShaderData *sd); -ccl_device_inline uint attribute_primitive_type(KernelGlobals *kg, const ShaderData *sd) +ccl_device_inline uint attribute_primitive_type(const KernelGlobals *kg, const ShaderData *sd) { if ((sd->type & PRIMITIVE_ALL_TRIANGLE) && subd_triangle_patch(kg, sd) != ~0) { return ATTR_PRIM_SUBD; @@ -46,12 +48,12 @@ ccl_device_inline AttributeDescriptor attribute_not_found() /* Find attribute based on ID */ -ccl_device_inline uint object_attribute_map_offset(KernelGlobals *kg, int object) +ccl_device_inline uint object_attribute_map_offset(const KernelGlobals *kg, int object) { return kernel_tex_fetch(__objects, object).attribute_map_offset; } -ccl_device_inline AttributeDescriptor find_attribute(KernelGlobals *kg, +ccl_device_inline AttributeDescriptor find_attribute(const KernelGlobals *kg, const ShaderData *sd, uint id) { @@ -98,7 +100,7 @@ ccl_device_inline AttributeDescriptor find_attribute(KernelGlobals *kg, /* Transform matrix attribute on meshes */ -ccl_device Transform 
primitive_attribute_matrix(KernelGlobals *kg, +ccl_device Transform primitive_attribute_matrix(const KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc) { diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h index b5a62a31ca9..a827a67ce7a 100644 --- a/intern/cycles/kernel/geom/geom_curve.h +++ b/intern/cycles/kernel/geom/geom_curve.h @@ -12,6 +12,8 @@ * limitations under the License. */ +#pragma once + CCL_NAMESPACE_BEGIN /* Curve Primitive @@ -25,8 +27,11 @@ CCL_NAMESPACE_BEGIN /* Reading attributes on various curve elements */ -ccl_device float curve_attribute_float( - KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy) +ccl_device float curve_attribute_float(const KernelGlobals *kg, + const ShaderData *sd, + const AttributeDescriptor desc, + float *dx, + float *dy) { if (desc.element & (ATTR_ELEMENT_CURVE_KEY | ATTR_ELEMENT_CURVE_KEY_MOTION)) { float4 curvedata = kernel_tex_fetch(__curves, sd->prim); @@ -64,7 +69,7 @@ ccl_device float curve_attribute_float( } } -ccl_device float2 curve_attribute_float2(KernelGlobals *kg, +ccl_device float2 curve_attribute_float2(const KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float2 *dx, @@ -110,7 +115,7 @@ ccl_device float2 curve_attribute_float2(KernelGlobals *kg, } } -ccl_device float3 curve_attribute_float3(KernelGlobals *kg, +ccl_device float3 curve_attribute_float3(const KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float3 *dx, @@ -152,7 +157,7 @@ ccl_device float3 curve_attribute_float3(KernelGlobals *kg, } } -ccl_device float4 curve_attribute_float4(KernelGlobals *kg, +ccl_device float4 curve_attribute_float4(const KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float4 *dx, @@ -196,7 +201,7 @@ ccl_device float4 curve_attribute_float4(KernelGlobals *kg, /* Curve thickness */ -ccl_device float curve_thickness(KernelGlobals *kg, ShaderData 
*sd) +ccl_device float curve_thickness(const KernelGlobals *kg, const ShaderData *sd) { float r = 0.0f; @@ -224,7 +229,7 @@ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd) /* Curve location for motion pass, linear interpolation between keys and * ignoring radius because we do the same for the motion keys */ -ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd) +ccl_device float3 curve_motion_center_location(const KernelGlobals *kg, const ShaderData *sd) { float4 curvedata = kernel_tex_fetch(__curves, sd->prim); int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type); @@ -240,7 +245,7 @@ ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd /* Curve tangent normal */ -ccl_device float3 curve_tangent_normal(KernelGlobals *kg, ShaderData *sd) +ccl_device float3 curve_tangent_normal(const KernelGlobals *kg, const ShaderData *sd) { float3 tgN = make_float3(0.0f, 0.0f, 0.0f); diff --git a/intern/cycles/kernel/geom/geom_curve_intersect.h b/intern/cycles/kernel/geom/geom_curve_intersect.h index e25bf5b4660..213f3e62ee0 100644 --- a/intern/cycles/kernel/geom/geom_curve_intersect.h +++ b/intern/cycles/kernel/geom/geom_curve_intersect.h @@ -15,6 +15,8 @@ * limitations under the License. */ +#pragma once + CCL_NAMESPACE_BEGIN /* Curve primitive intersection functions. @@ -167,6 +169,7 @@ ccl_device_inline float2 half_plane_intersect(const float3 P, const float3 N, co } ccl_device bool curve_intersect_iterative(const float3 ray_dir, + float *ray_tfar, const float dt, const float4 curve[4], float u, @@ -230,7 +233,7 @@ ccl_device bool curve_intersect_iterative(const float3 ray_dir, if (fabsf(f) < f_err && fabsf(g) < g_err) { t += dt; - if (!(0.0f <= t && t <= isect->t)) { + if (!(0.0f <= t && t <= *ray_tfar)) { return false; /* Rejects NaNs */ } if (!(u >= 0.0f && u <= 1.0f)) { @@ -247,6 +250,7 @@ ccl_device bool curve_intersect_iterative(const float3 ray_dir, } /* Record intersection. 
*/ + *ray_tfar = t; isect->t = t; isect->u = u; isect->v = 0.0f; @@ -259,6 +263,7 @@ ccl_device bool curve_intersect_iterative(const float3 ray_dir, ccl_device bool curve_intersect_recursive(const float3 ray_orig, const float3 ray_dir, + float ray_tfar, float4 curve[4], Intersection *isect) { @@ -339,7 +344,7 @@ ccl_device bool curve_intersect_recursive(const float3 ray_orig, } /* Intersect with cap-planes. */ - float2 tp = make_float2(-dt, isect->t - dt); + float2 tp = make_float2(-dt, ray_tfar - dt); tp = make_float2(max(tp.x, tc_outer.x), min(tp.y, tc_outer.y)); const float2 h0 = half_plane_intersect( float4_to_float3(P0), float4_to_float3(dP0du), ray_dir); @@ -402,19 +407,19 @@ ccl_device bool curve_intersect_recursive(const float3 ray_orig, CURVE_NUM_BEZIER_SUBDIVISIONS; if (depth >= termDepth) { found |= curve_intersect_iterative( - ray_dir, dt, curve, u_outer0, tp0.x, use_backfacing, isect); + ray_dir, &ray_tfar, dt, curve, u_outer0, tp0.x, use_backfacing, isect); } else { recurse = true; } } - if (valid1 && (tp1.x + dt <= isect->t)) { + if (valid1 && (tp1.x + dt <= ray_tfar)) { const int termDepth = unstable1 ? CURVE_NUM_BEZIER_SUBDIVISIONS_UNSTABLE : CURVE_NUM_BEZIER_SUBDIVISIONS; if (depth >= termDepth) { found |= curve_intersect_iterative( - ray_dir, dt, curve, u_outer1, tp1.y, use_backfacing, isect); + ray_dir, &ray_tfar, dt, curve, u_outer1, tp1.y, use_backfacing, isect); } else { recurse = true; @@ -542,7 +547,7 @@ ccl_device_inline float4 ribbon_to_ray_space(const float3 ray_space[3], ccl_device_inline bool ribbon_intersect(const float3 ray_org, const float3 ray_dir, - const float ray_tfar, + float ray_tfar, const int N, float4 curve[4], Intersection *isect) @@ -590,7 +595,7 @@ ccl_device_inline bool ribbon_intersect(const float3 ray_org, /* Intersect quad. 
*/ float vu, vv, vt; - bool valid0 = ribbon_intersect_quad(isect->t, lp0, lp1, up1, up0, &vu, &vv, &vt); + bool valid0 = ribbon_intersect_quad(ray_tfar, lp0, lp1, up1, up0, &vu, &vv, &vt); if (valid0) { /* ignore self intersections */ @@ -604,6 +609,7 @@ ccl_device_inline bool ribbon_intersect(const float3 ray_org, vv = 2.0f * vv - 1.0f; /* Record intersection. */ + ray_tfar = vt; isect->t = vt; isect->u = u + vu * step_size; isect->v = vv; @@ -619,10 +625,11 @@ ccl_device_inline bool ribbon_intersect(const float3 ray_org, return false; } -ccl_device_forceinline bool curve_intersect(KernelGlobals *kg, +ccl_device_forceinline bool curve_intersect(const KernelGlobals *kg, Intersection *isect, const float3 P, const float3 dir, + const float tmax, uint visibility, int object, int curveAddr, @@ -672,7 +679,7 @@ ccl_device_forceinline bool curve_intersect(KernelGlobals *kg, if (type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) { /* todo: adaptive number of subdivisions could help performance here. 
*/ const int subdivisions = kernel_data.bvh.curve_subdivisions; - if (ribbon_intersect(P, dir, isect->t, subdivisions, curve, isect)) { + if (ribbon_intersect(P, dir, tmax, subdivisions, curve, isect)) { isect->prim = curveAddr; isect->object = object; isect->type = type; @@ -682,7 +689,7 @@ ccl_device_forceinline bool curve_intersect(KernelGlobals *kg, return false; } else { - if (curve_intersect_recursive(P, dir, curve, isect)) { + if (curve_intersect_recursive(P, dir, tmax, curve, isect)) { isect->prim = curveAddr; isect->object = object; isect->type = type; @@ -693,28 +700,23 @@ ccl_device_forceinline bool curve_intersect(KernelGlobals *kg, } } -ccl_device_inline void curve_shader_setup(KernelGlobals *kg, +ccl_device_inline void curve_shader_setup(const KernelGlobals *kg, ShaderData *sd, - const Intersection *isect, - const Ray *ray) + float3 P, + float3 D, + float t, + const int isect_object, + const int isect_prim) { - float t = isect->t; - float3 P = ray->P; - float3 D = ray->D; - - if (isect->object != OBJECT_NONE) { -# ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_itfm; -# else - Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM); -# endif + if (isect_object != OBJECT_NONE) { + const Transform tfm = object_get_inverse_transform(kg, sd); P = transform_point(&tfm, P); D = transform_direction(&tfm, D * t); D = normalize_len(D, &t); } - int prim = kernel_tex_fetch(__prim_index, isect->prim); + int prim = kernel_tex_fetch(__prim_index, isect_prim); float4 v00 = kernel_tex_fetch(__curves, prim); int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type); @@ -735,23 +737,20 @@ ccl_device_inline void curve_shader_setup(KernelGlobals *kg, motion_curve_keys(kg, sd->object, sd->prim, sd->time, ka, k0, k1, kb, P_curve); } - sd->u = isect->u; - P = P + D * t; - const float4 dPdu4 = catmull_rom_basis_derivative(P_curve, isect->u); + const float4 dPdu4 = catmull_rom_basis_derivative(P_curve, sd->u); const float3 dPdu = 
float4_to_float3(dPdu4); if (sd->type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) { /* Rounded smooth normals for ribbons, to approximate thick curve shape. */ const float3 tangent = normalize(dPdu); const float3 bitangent = normalize(cross(tangent, -D)); - const float sine = isect->v; + const float sine = sd->v; const float cosine = safe_sqrtf(1.0f - sine * sine); sd->N = normalize(sine * bitangent - cosine * normalize(cross(tangent, bitangent))); sd->Ng = -D; - sd->v = isect->v; # if 0 /* This approximates the position and geometric normal of a thick curve too, @@ -765,7 +764,7 @@ ccl_device_inline void curve_shader_setup(KernelGlobals *kg, /* Thick curves, compute normal using direction from inside the curve. * This could be optimized by recording the normal in the intersection, * however for Optix this would go beyond the size of the payload. */ - const float3 P_inside = float4_to_float3(catmull_rom_basis_eval(P_curve, isect->u)); + const float3 P_inside = float4_to_float3(catmull_rom_basis_eval(P_curve, sd->u)); const float3 Ng = normalize(P - P_inside); sd->N = Ng; @@ -779,13 +778,8 @@ ccl_device_inline void curve_shader_setup(KernelGlobals *kg, sd->dPdv = cross(dPdu, sd->Ng); # endif - if (isect->object != OBJECT_NONE) { -# ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_tfm; -# else - Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM); -# endif - + if (isect_object != OBJECT_NONE) { + const Transform tfm = object_get_transform(kg, sd); P = transform_point(&tfm, P); } diff --git a/intern/cycles/kernel/geom/geom_motion_curve.h b/intern/cycles/kernel/geom/geom_motion_curve.h index 0f66f4af755..5294da03145 100644 --- a/intern/cycles/kernel/geom/geom_motion_curve.h +++ b/intern/cycles/kernel/geom/geom_motion_curve.h @@ -12,6 +12,8 @@ * limitations under the License. 
*/ +#pragma once + CCL_NAMESPACE_BEGIN /* Motion Curve Primitive @@ -25,7 +27,7 @@ CCL_NAMESPACE_BEGIN #ifdef __HAIR__ -ccl_device_inline int find_attribute_curve_motion(KernelGlobals *kg, +ccl_device_inline int find_attribute_curve_motion(const KernelGlobals *kg, int object, uint id, AttributeElement *elem) @@ -50,7 +52,7 @@ ccl_device_inline int find_attribute_curve_motion(KernelGlobals *kg, return (attr_map.y == ATTR_ELEMENT_NONE) ? (int)ATTR_STD_NOT_FOUND : (int)attr_map.z; } -ccl_device_inline void motion_curve_keys_for_step_linear(KernelGlobals *kg, +ccl_device_inline void motion_curve_keys_for_step_linear(const KernelGlobals *kg, int offset, int numkeys, int numsteps, @@ -78,7 +80,7 @@ ccl_device_inline void motion_curve_keys_for_step_linear(KernelGlobals *kg, /* return 2 curve key locations */ ccl_device_inline void motion_curve_keys_linear( - KernelGlobals *kg, int object, int prim, float time, int k0, int k1, float4 keys[2]) + const KernelGlobals *kg, int object, int prim, float time, int k0, int k1, float4 keys[2]) { /* get motion info */ int numsteps, numkeys; @@ -105,7 +107,7 @@ ccl_device_inline void motion_curve_keys_linear( keys[1] = (1.0f - t) * keys[1] + t * next_keys[1]; } -ccl_device_inline void motion_curve_keys_for_step(KernelGlobals *kg, +ccl_device_inline void motion_curve_keys_for_step(const KernelGlobals *kg, int offset, int numkeys, int numsteps, @@ -138,7 +140,7 @@ ccl_device_inline void motion_curve_keys_for_step(KernelGlobals *kg, } /* return 2 curve key locations */ -ccl_device_inline void motion_curve_keys(KernelGlobals *kg, +ccl_device_inline void motion_curve_keys(const KernelGlobals *kg, int object, int prim, float time, diff --git a/intern/cycles/kernel/geom/geom_motion_triangle.h b/intern/cycles/kernel/geom/geom_motion_triangle.h index 53d6b92dd7e..eb4a39e062b 100644 --- a/intern/cycles/kernel/geom/geom_motion_triangle.h +++ b/intern/cycles/kernel/geom/geom_motion_triangle.h @@ -25,11 +25,13 @@ * and 
ATTR_STD_MOTION_VERTEX_NORMAL mesh attributes. */ +#pragma once + CCL_NAMESPACE_BEGIN /* Time interpolation of vertex positions and normals */ -ccl_device_inline int find_attribute_motion(KernelGlobals *kg, +ccl_device_inline int find_attribute_motion(const KernelGlobals *kg, int object, uint id, AttributeElement *elem) @@ -49,7 +51,7 @@ ccl_device_inline int find_attribute_motion(KernelGlobals *kg, return (attr_map.y == ATTR_ELEMENT_NONE) ? (int)ATTR_STD_NOT_FOUND : (int)attr_map.z; } -ccl_device_inline void motion_triangle_verts_for_step(KernelGlobals *kg, +ccl_device_inline void motion_triangle_verts_for_step(const KernelGlobals *kg, uint4 tri_vindex, int offset, int numverts, @@ -76,7 +78,7 @@ ccl_device_inline void motion_triangle_verts_for_step(KernelGlobals *kg, } } -ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals *kg, +ccl_device_inline void motion_triangle_normals_for_step(const KernelGlobals *kg, uint4 tri_vindex, int offset, int numverts, @@ -104,7 +106,7 @@ ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals *kg, } ccl_device_inline void motion_triangle_vertices( - KernelGlobals *kg, int object, int prim, float time, float3 verts[3]) + const KernelGlobals *kg, int object, int prim, float time, float3 verts[3]) { /* get motion info */ int numsteps, numverts; @@ -134,7 +136,7 @@ ccl_device_inline void motion_triangle_vertices( } ccl_device_inline float3 motion_triangle_smooth_normal( - KernelGlobals *kg, float3 Ng, int object, int prim, float u, float v, float time) + const KernelGlobals *kg, float3 Ng, int object, int prim, float u, float v, float time) { /* get motion info */ int numsteps, numverts; diff --git a/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h index 859d919f0bb..ec7e4b07d76 100644 --- a/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h +++ b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h @@ -25,6 +25,8 @@ * 
and ATTR_STD_MOTION_VERTEX_NORMAL mesh attributes. */ +#pragma once + CCL_NAMESPACE_BEGIN /* Refine triangle intersection to more precise hit point. For rays that travel @@ -32,23 +34,21 @@ CCL_NAMESPACE_BEGIN * a closer distance. */ -ccl_device_inline float3 motion_triangle_refine( - KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, float3 verts[3]) +ccl_device_inline float3 motion_triangle_refine(const KernelGlobals *kg, + ShaderData *sd, + float3 P, + float3 D, + float t, + const int isect_object, + const int isect_prim, + float3 verts[3]) { - float3 P = ray->P; - float3 D = ray->D; - float t = isect->t; - #ifdef __INTERSECTION_REFINE__ - if (isect->object != OBJECT_NONE) { + if (isect_object != OBJECT_NONE) { if (UNLIKELY(t == 0.0f)) { return P; } -# ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_itfm; -# else - Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM); -# endif + const Transform tfm = object_get_inverse_transform(kg, sd); P = transform_point(&tfm, P); D = transform_direction(&tfm, D * t); @@ -70,13 +70,8 @@ ccl_device_inline float3 motion_triangle_refine( /* Compute refined position. */ P = P + D * rt; - if (isect->object != OBJECT_NONE) { -# ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_tfm; -# else - Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM); -# endif - + if (isect_object != OBJECT_NONE) { + const Transform tfm = object_get_transform(kg, sd); P = transform_point(&tfm, P); } @@ -86,7 +81,7 @@ ccl_device_inline float3 motion_triangle_refine( #endif } -/* Same as above, except that isect->t is assumed to be in object space +/* Same as above, except that t is assumed to be in object space * for instancing. 
*/ @@ -97,27 +92,22 @@ ccl_device_noinline ccl_device_inline # endif float3 - motion_triangle_refine_local(KernelGlobals *kg, + motion_triangle_refine_local(const KernelGlobals *kg, ShaderData *sd, - const Intersection *isect, - const Ray *ray, + float3 P, + float3 D, + float t, + const int isect_object, + const int isect_prim, float3 verts[3]) { # ifdef __KERNEL_OPTIX__ - /* isect->t is always in world space with OptiX. */ - return motion_triangle_refine(kg, sd, isect, ray, verts); + /* t is always in world space with OptiX. */ + return motion_triangle_refine(kg, sd, P, D, t, isect_object, isect_prim, verts); # else - float3 P = ray->P; - float3 D = ray->D; - float t = isect->t; - # ifdef __INTERSECTION_REFINE__ - if (isect->object != OBJECT_NONE) { -# ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_itfm; -# else - Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM); -# endif + if (isect_object != OBJECT_NONE) { + const Transform tfm = object_get_inverse_transform(kg, sd); P = transform_point(&tfm, P); D = transform_direction(&tfm, D); @@ -138,13 +128,8 @@ ccl_device_inline P = P + D * rt; - if (isect->object != OBJECT_NONE) { -# ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_tfm; -# else - Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM); -# endif - + if (isect_object != OBJECT_NONE) { + const Transform tfm = object_get_transform(kg, sd); P = transform_point(&tfm, P); } @@ -160,10 +145,11 @@ ccl_device_inline * time and do a ray intersection with the resulting triangle. 
*/ -ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg, +ccl_device_inline bool motion_triangle_intersect(const KernelGlobals *kg, Intersection *isect, float3 P, float3 dir, + float tmax, float time, uint visibility, int object, @@ -179,7 +165,7 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg, float t, u, v; if (ray_triangle_intersect(P, dir, - isect->t, + tmax, #if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) (ssef *)verts, #else @@ -215,7 +201,7 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg, * Returns whether traversal should be stopped. */ #ifdef __BVH_LOCAL__ -ccl_device_inline bool motion_triangle_intersect_local(KernelGlobals *kg, +ccl_device_inline bool motion_triangle_intersect_local(const KernelGlobals *kg, LocalIntersection *local_isect, float3 P, float3 dir, diff --git a/intern/cycles/kernel/geom/geom_motion_triangle_shader.h b/intern/cycles/kernel/geom/geom_motion_triangle_shader.h index 7a91f8041f7..85c4f0ca522 100644 --- a/intern/cycles/kernel/geom/geom_motion_triangle_shader.h +++ b/intern/cycles/kernel/geom/geom_motion_triangle_shader.h @@ -25,6 +25,8 @@ * and ATTR_STD_MOTION_VERTEX_NORMAL mesh attributes. */ +#pragma once + CCL_NAMESPACE_BEGIN /* Setup of motion triangle specific parts of ShaderData, moved into this one @@ -32,8 +34,14 @@ CCL_NAMESPACE_BEGIN * normals */ /* return 3 triangle vertex normals */ -ccl_device_noinline void motion_triangle_shader_setup( - KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, bool is_local) +ccl_device_noinline void motion_triangle_shader_setup(const KernelGlobals *kg, + ShaderData *sd, + const float3 P, + const float3 D, + const float ray_t, + const int isect_object, + const int isect_prim, + bool is_local) { /* Get shader. */ sd->shader = kernel_tex_fetch(__tri_shader, sd->prim); @@ -63,12 +71,12 @@ ccl_device_noinline void motion_triangle_shader_setup( /* Compute refined position. 
*/ #ifdef __BVH_LOCAL__ if (is_local) { - sd->P = motion_triangle_refine_local(kg, sd, isect, ray, verts); + sd->P = motion_triangle_refine_local(kg, sd, P, D, ray_t, isect_object, isect_prim, verts); } else #endif /* __BVH_LOCAL__*/ { - sd->P = motion_triangle_refine(kg, sd, isect, ray, verts); + sd->P = motion_triangle_refine(kg, sd, P, D, ray_t, isect_object, isect_prim, verts); } /* Compute face normal. */ float3 Ng; diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h index fe73335a335..7d6ad7b4fe3 100644 --- a/intern/cycles/kernel/geom/geom_object.h +++ b/intern/cycles/kernel/geom/geom_object.h @@ -22,6 +22,8 @@ * directly primitives in the BVH with world space locations applied, and the object * ID is looked up afterwards. */ +#pragma once + CCL_NAMESPACE_BEGIN /* Object attributes, for now a fixed size and contents */ @@ -35,7 +37,7 @@ enum ObjectVectorTransform { OBJECT_PASS_MOTION_PRE = 0, OBJECT_PASS_MOTION_POST /* Object to world space transformation */ -ccl_device_inline Transform object_fetch_transform(KernelGlobals *kg, +ccl_device_inline Transform object_fetch_transform(const KernelGlobals *kg, int object, enum ObjectTransform type) { @@ -49,7 +51,7 @@ ccl_device_inline Transform object_fetch_transform(KernelGlobals *kg, /* Lamp to world space transformation */ -ccl_device_inline Transform lamp_fetch_transform(KernelGlobals *kg, int lamp, bool inverse) +ccl_device_inline Transform lamp_fetch_transform(const KernelGlobals *kg, int lamp, bool inverse) { if (inverse) { return kernel_tex_fetch(__lights, lamp).itfm; @@ -61,7 +63,7 @@ ccl_device_inline Transform lamp_fetch_transform(KernelGlobals *kg, int lamp, bo /* Object to world space transformation for motion vectors */ -ccl_device_inline Transform object_fetch_motion_pass_transform(KernelGlobals *kg, +ccl_device_inline Transform object_fetch_motion_pass_transform(const KernelGlobals *kg, int object, enum ObjectVectorTransform type) { @@ -72,7 +74,7 @@ 
ccl_device_inline Transform object_fetch_motion_pass_transform(KernelGlobals *kg /* Motion blurred object transformations */ #ifdef __OBJECT_MOTION__ -ccl_device_inline Transform object_fetch_transform_motion(KernelGlobals *kg, +ccl_device_inline Transform object_fetch_transform_motion(const KernelGlobals *kg, int object, float time) { @@ -86,7 +88,7 @@ ccl_device_inline Transform object_fetch_transform_motion(KernelGlobals *kg, return tfm; } -ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg, +ccl_device_inline Transform object_fetch_transform_motion_test(const KernelGlobals *kg, int object, float time, Transform *itfm) @@ -111,45 +113,79 @@ ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg } #endif +/* Get transform matrix for shading point. */ + +ccl_device_inline Transform object_get_transform(const KernelGlobals *kg, const ShaderData *sd) +{ +#ifdef __OBJECT_MOTION__ + return (sd->object_flag & SD_OBJECT_MOTION) ? + sd->ob_tfm_motion : + object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); +#else + return object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); +#endif +} + +ccl_device_inline Transform object_get_inverse_transform(const KernelGlobals *kg, + const ShaderData *sd) +{ +#ifdef __OBJECT_MOTION__ + return (sd->object_flag & SD_OBJECT_MOTION) ? 
+ sd->ob_itfm_motion : + object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); +#else + return object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); +#endif +} /* Transform position from object to world space */ -ccl_device_inline void object_position_transform(KernelGlobals *kg, +ccl_device_inline void object_position_transform(const KernelGlobals *kg, const ShaderData *sd, float3 *P) { #ifdef __OBJECT_MOTION__ - *P = transform_point_auto(&sd->ob_tfm, *P); -#else + if (sd->object_flag & SD_OBJECT_MOTION) { + *P = transform_point_auto(&sd->ob_tfm_motion, *P); + return; + } +#endif + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); *P = transform_point(&tfm, *P); -#endif } /* Transform position from world to object space */ -ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, +ccl_device_inline void object_inverse_position_transform(const KernelGlobals *kg, const ShaderData *sd, float3 *P) { #ifdef __OBJECT_MOTION__ - *P = transform_point_auto(&sd->ob_itfm, *P); -#else + if (sd->object_flag & SD_OBJECT_MOTION) { + *P = transform_point_auto(&sd->ob_itfm_motion, *P); + return; + } +#endif + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); *P = transform_point(&tfm, *P); -#endif } /* Transform normal from world to object space */ -ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, +ccl_device_inline void object_inverse_normal_transform(const KernelGlobals *kg, const ShaderData *sd, float3 *N) { #ifdef __OBJECT_MOTION__ - if ((sd->object != OBJECT_NONE) || (sd->type == PRIMITIVE_LAMP)) { - *N = normalize(transform_direction_transposed_auto(&sd->ob_tfm, *N)); + if (sd->object_flag & SD_OBJECT_MOTION) { + if ((sd->object != OBJECT_NONE) || (sd->type == PRIMITIVE_LAMP)) { + *N = normalize(transform_direction_transposed_auto(&sd->ob_tfm_motion, *N)); + } + return; } -#else +#endif + if (sd->object != OBJECT_NONE) { Transform tfm = 
object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); *N = normalize(transform_direction_transposed(&tfm, *N)); @@ -158,65 +194,79 @@ ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, Transform tfm = lamp_fetch_transform(kg, sd->lamp, false); *N = normalize(transform_direction_transposed(&tfm, *N)); } -#endif } /* Transform normal from object to world space */ -ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N) +ccl_device_inline void object_normal_transform(const KernelGlobals *kg, + const ShaderData *sd, + float3 *N) { #ifdef __OBJECT_MOTION__ - *N = normalize(transform_direction_transposed_auto(&sd->ob_itfm, *N)); -#else + if (sd->object_flag & SD_OBJECT_MOTION) { + *N = normalize(transform_direction_transposed_auto(&sd->ob_itfm_motion, *N)); + return; + } +#endif + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); *N = normalize(transform_direction_transposed(&tfm, *N)); -#endif } /* Transform direction vector from object to world space */ -ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D) +ccl_device_inline void object_dir_transform(const KernelGlobals *kg, + const ShaderData *sd, + float3 *D) { #ifdef __OBJECT_MOTION__ - *D = transform_direction_auto(&sd->ob_tfm, *D); -#else + if (sd->object_flag & SD_OBJECT_MOTION) { + *D = transform_direction_auto(&sd->ob_tfm_motion, *D); + return; + } +#endif + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); *D = transform_direction(&tfm, *D); -#endif } /* Transform direction vector from world to object space */ -ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, +ccl_device_inline void object_inverse_dir_transform(const KernelGlobals *kg, const ShaderData *sd, float3 *D) { #ifdef __OBJECT_MOTION__ - *D = transform_direction_auto(&sd->ob_itfm, *D); -#else - Transform tfm = object_fetch_transform(kg, sd->object, 
OBJECT_INVERSE_TRANSFORM); - *D = transform_direction(&tfm, *D); + if (sd->object_flag & SD_OBJECT_MOTION) { + *D = transform_direction_auto(&sd->ob_itfm_motion, *D); + return; + } #endif + + const Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); + *D = transform_direction(&tfm, *D); } /* Object center position */ -ccl_device_inline float3 object_location(KernelGlobals *kg, const ShaderData *sd) +ccl_device_inline float3 object_location(const KernelGlobals *kg, const ShaderData *sd) { if (sd->object == OBJECT_NONE) return make_float3(0.0f, 0.0f, 0.0f); #ifdef __OBJECT_MOTION__ - return make_float3(sd->ob_tfm.x.w, sd->ob_tfm.y.w, sd->ob_tfm.z.w); -#else + if (sd->object_flag & SD_OBJECT_MOTION) { + return make_float3(sd->ob_tfm_motion.x.w, sd->ob_tfm_motion.y.w, sd->ob_tfm_motion.z.w); + } +#endif + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); return make_float3(tfm.x.w, tfm.y.w, tfm.z.w); -#endif } /* Color of the object */ -ccl_device_inline float3 object_color(KernelGlobals *kg, int object) +ccl_device_inline float3 object_color(const KernelGlobals *kg, int object) { if (object == OBJECT_NONE) return make_float3(0.0f, 0.0f, 0.0f); @@ -227,7 +277,7 @@ ccl_device_inline float3 object_color(KernelGlobals *kg, int object) /* Pass ID number of object */ -ccl_device_inline float object_pass_id(KernelGlobals *kg, int object) +ccl_device_inline float object_pass_id(const KernelGlobals *kg, int object) { if (object == OBJECT_NONE) return 0.0f; @@ -237,7 +287,7 @@ ccl_device_inline float object_pass_id(KernelGlobals *kg, int object) /* Per lamp random number for shader variation */ -ccl_device_inline float lamp_random_number(KernelGlobals *kg, int lamp) +ccl_device_inline float lamp_random_number(const KernelGlobals *kg, int lamp) { if (lamp == LAMP_NONE) return 0.0f; @@ -247,7 +297,7 @@ ccl_device_inline float lamp_random_number(KernelGlobals *kg, int lamp) /* Per object random number for shader variation */ 
-ccl_device_inline float object_random_number(KernelGlobals *kg, int object) +ccl_device_inline float object_random_number(const KernelGlobals *kg, int object) { if (object == OBJECT_NONE) return 0.0f; @@ -257,7 +307,7 @@ ccl_device_inline float object_random_number(KernelGlobals *kg, int object) /* Particle ID from which this object was generated */ -ccl_device_inline int object_particle_id(KernelGlobals *kg, int object) +ccl_device_inline int object_particle_id(const KernelGlobals *kg, int object) { if (object == OBJECT_NONE) return 0; @@ -267,7 +317,7 @@ ccl_device_inline int object_particle_id(KernelGlobals *kg, int object) /* Generated texture coordinate on surface from where object was instanced */ -ccl_device_inline float3 object_dupli_generated(KernelGlobals *kg, int object) +ccl_device_inline float3 object_dupli_generated(const KernelGlobals *kg, int object) { if (object == OBJECT_NONE) return make_float3(0.0f, 0.0f, 0.0f); @@ -279,7 +329,7 @@ ccl_device_inline float3 object_dupli_generated(KernelGlobals *kg, int object) /* UV texture coordinate on surface from where object was instanced */ -ccl_device_inline float3 object_dupli_uv(KernelGlobals *kg, int object) +ccl_device_inline float3 object_dupli_uv(const KernelGlobals *kg, int object) { if (object == OBJECT_NONE) return make_float3(0.0f, 0.0f, 0.0f); @@ -291,7 +341,7 @@ ccl_device_inline float3 object_dupli_uv(KernelGlobals *kg, int object) /* Information about mesh for motion blurred triangles and curves */ ccl_device_inline void object_motion_info( - KernelGlobals *kg, int object, int *numsteps, int *numverts, int *numkeys) + const KernelGlobals *kg, int object, int *numsteps, int *numverts, int *numkeys) { if (numkeys) { *numkeys = kernel_tex_fetch(__objects, object).numkeys; @@ -305,7 +355,7 @@ ccl_device_inline void object_motion_info( /* Offset to an objects patch map */ -ccl_device_inline uint object_patch_map_offset(KernelGlobals *kg, int object) +ccl_device_inline uint 
object_patch_map_offset(const KernelGlobals *kg, int object) { if (object == OBJECT_NONE) return 0; @@ -315,7 +365,7 @@ ccl_device_inline uint object_patch_map_offset(KernelGlobals *kg, int object) /* Volume step size */ -ccl_device_inline float object_volume_density(KernelGlobals *kg, int object) +ccl_device_inline float object_volume_density(const KernelGlobals *kg, int object) { if (object == OBJECT_NONE) { return 1.0f; @@ -324,7 +374,7 @@ ccl_device_inline float object_volume_density(KernelGlobals *kg, int object) return kernel_tex_fetch(__objects, object).volume_density; } -ccl_device_inline float object_volume_step_size(KernelGlobals *kg, int object) +ccl_device_inline float object_volume_step_size(const KernelGlobals *kg, int object) { if (object == OBJECT_NONE) { return kernel_data.background.volume_step_size; @@ -335,14 +385,14 @@ ccl_device_inline float object_volume_step_size(KernelGlobals *kg, int object) /* Pass ID for shader */ -ccl_device int shader_pass_id(KernelGlobals *kg, const ShaderData *sd) +ccl_device int shader_pass_id(const KernelGlobals *kg, const ShaderData *sd) { return kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).pass_id; } /* Cryptomatte ID */ -ccl_device_inline float object_cryptomatte_id(KernelGlobals *kg, int object) +ccl_device_inline float object_cryptomatte_id(const KernelGlobals *kg, int object) { if (object == OBJECT_NONE) return 0.0f; @@ -350,7 +400,7 @@ ccl_device_inline float object_cryptomatte_id(KernelGlobals *kg, int object) return kernel_tex_fetch(__objects, object).cryptomatte_object; } -ccl_device_inline float object_cryptomatte_asset_id(KernelGlobals *kg, int object) +ccl_device_inline float object_cryptomatte_asset_id(const KernelGlobals *kg, int object) { if (object == OBJECT_NONE) return 0; @@ -360,42 +410,42 @@ ccl_device_inline float object_cryptomatte_asset_id(KernelGlobals *kg, int objec /* Particle data from which object was instanced */ -ccl_device_inline uint particle_index(KernelGlobals *kg, int 
particle) +ccl_device_inline uint particle_index(const KernelGlobals *kg, int particle) { return kernel_tex_fetch(__particles, particle).index; } -ccl_device float particle_age(KernelGlobals *kg, int particle) +ccl_device float particle_age(const KernelGlobals *kg, int particle) { return kernel_tex_fetch(__particles, particle).age; } -ccl_device float particle_lifetime(KernelGlobals *kg, int particle) +ccl_device float particle_lifetime(const KernelGlobals *kg, int particle) { return kernel_tex_fetch(__particles, particle).lifetime; } -ccl_device float particle_size(KernelGlobals *kg, int particle) +ccl_device float particle_size(const KernelGlobals *kg, int particle) { return kernel_tex_fetch(__particles, particle).size; } -ccl_device float4 particle_rotation(KernelGlobals *kg, int particle) +ccl_device float4 particle_rotation(const KernelGlobals *kg, int particle) { return kernel_tex_fetch(__particles, particle).rotation; } -ccl_device float3 particle_location(KernelGlobals *kg, int particle) +ccl_device float3 particle_location(const KernelGlobals *kg, int particle) { return float4_to_float3(kernel_tex_fetch(__particles, particle).location); } -ccl_device float3 particle_velocity(KernelGlobals *kg, int particle) +ccl_device float3 particle_velocity(const KernelGlobals *kg, int particle) { return float4_to_float3(kernel_tex_fetch(__particles, particle).velocity); } -ccl_device float3 particle_angular_velocity(KernelGlobals *kg, int particle) +ccl_device float3 particle_angular_velocity(const KernelGlobals *kg, int particle) { return float4_to_float3(kernel_tex_fetch(__particles, particle).angular_velocity); } @@ -418,7 +468,7 @@ ccl_device_inline float3 bvh_inverse_direction(float3 dir) /* Transform ray into object space to enter static object in BVH */ ccl_device_inline float bvh_instance_push( - KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float t) + const KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 
*dir, float3 *idir) { Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); @@ -428,17 +478,18 @@ ccl_device_inline float bvh_instance_push( *dir = bvh_clamp_direction(normalize_len(transform_direction(&tfm, ray->D), &len)); *idir = bvh_inverse_direction(*dir); - if (t != FLT_MAX) { - t *= len; - } - - return t; + return len; } /* Transform ray to exit static object in BVH. */ -ccl_device_inline float bvh_instance_pop( - KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float t) +ccl_device_inline float bvh_instance_pop(const KernelGlobals *kg, + int object, + const Ray *ray, + float3 *P, + float3 *dir, + float3 *idir, + float t) { if (t != FLT_MAX) { Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); @@ -454,7 +505,7 @@ ccl_device_inline float bvh_instance_pop( /* Same as above, but returns scale factor to apply to multiple intersection distances */ -ccl_device_inline void bvh_instance_pop_factor(KernelGlobals *kg, +ccl_device_inline void bvh_instance_pop_factor(const KernelGlobals *kg, int object, const Ray *ray, float3 *P, @@ -473,13 +524,12 @@ ccl_device_inline void bvh_instance_pop_factor(KernelGlobals *kg, #ifdef __OBJECT_MOTION__ /* Transform ray into object space to enter motion blurred object in BVH */ -ccl_device_inline float bvh_instance_motion_push(KernelGlobals *kg, +ccl_device_inline float bvh_instance_motion_push(const KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, - float t, Transform *itfm) { object_fetch_transform_motion_test(kg, object, ray->time, itfm); @@ -490,16 +540,12 @@ ccl_device_inline float bvh_instance_motion_push(KernelGlobals *kg, *dir = bvh_clamp_direction(normalize_len(transform_direction(itfm, ray->D), &len)); *idir = bvh_inverse_direction(*dir); - if (t != FLT_MAX) { - t *= len; - } - - return t; + return len; } /* Transform ray to exit motion blurred object in BVH. 
*/ -ccl_device_inline float bvh_instance_motion_pop(KernelGlobals *kg, +ccl_device_inline float bvh_instance_motion_pop(const KernelGlobals *kg, int object, const Ray *ray, float3 *P, @@ -521,7 +567,7 @@ ccl_device_inline float bvh_instance_motion_pop(KernelGlobals *kg, /* Same as above, but returns scale factor to apply to multiple intersection distances */ -ccl_device_inline void bvh_instance_motion_pop_factor(KernelGlobals *kg, +ccl_device_inline void bvh_instance_motion_pop_factor(const KernelGlobals *kg, int object, const Ray *ray, float3 *P, @@ -538,48 +584,11 @@ ccl_device_inline void bvh_instance_motion_pop_factor(KernelGlobals *kg, #endif -/* TODO(sergey): This is only for until we've got OpenCL 2.0 - * on all devices we consider supported. It'll be replaced with - * generic address space. - */ +/* TODO: This can be removed when we know if no devices will require explicit + * address space qualifiers for this case. */ -#ifdef __KERNEL_OPENCL__ -ccl_device_inline void object_position_transform_addrspace(KernelGlobals *kg, - const ShaderData *sd, - ccl_addr_space float3 *P) -{ - float3 private_P = *P; - object_position_transform(kg, sd, &private_P); - *P = private_P; -} - -ccl_device_inline void object_dir_transform_addrspace(KernelGlobals *kg, - const ShaderData *sd, - ccl_addr_space float3 *D) -{ - float3 private_D = *D; - object_dir_transform(kg, sd, &private_D); - *D = private_D; -} - -ccl_device_inline void object_normal_transform_addrspace(KernelGlobals *kg, - const ShaderData *sd, - ccl_addr_space float3 *N) -{ - float3 private_N = *N; - object_normal_transform(kg, sd, &private_N); - *N = private_N; -} -#endif - -#ifndef __KERNEL_OPENCL__ -# define object_position_transform_auto object_position_transform -# define object_dir_transform_auto object_dir_transform -# define object_normal_transform_auto object_normal_transform -#else -# define object_position_transform_auto object_position_transform_addrspace -# define object_dir_transform_auto 
object_dir_transform_addrspace -# define object_normal_transform_auto object_normal_transform_addrspace -#endif +#define object_position_transform_auto object_position_transform +#define object_dir_transform_auto object_dir_transform +#define object_normal_transform_auto object_normal_transform CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/geom/geom_patch.h b/intern/cycles/kernel/geom/geom_patch.h index 9c1768f05db..ce0fc15f196 100644 --- a/intern/cycles/kernel/geom/geom_patch.h +++ b/intern/cycles/kernel/geom/geom_patch.h @@ -24,6 +24,8 @@ * language governing permissions and limitations under the Apache License. */ +#pragma once + CCL_NAMESPACE_BEGIN typedef struct PatchHandle { @@ -60,7 +62,7 @@ ccl_device_inline int patch_map_resolve_quadrant(float median, float *u, float * /* retrieve PatchHandle from patch coords */ ccl_device_inline PatchHandle -patch_map_find_patch(KernelGlobals *kg, int object, int patch, float u, float v) +patch_map_find_patch(const KernelGlobals *kg, int object, int patch, float u, float v) { PatchHandle handle; @@ -191,7 +193,7 @@ ccl_device_inline void patch_eval_normalize_coords(uint patch_bits, float *u, fl /* retrieve patch control indices */ -ccl_device_inline int patch_eval_indices(KernelGlobals *kg, +ccl_device_inline int patch_eval_indices(const KernelGlobals *kg, const PatchHandle *handle, int channel, int indices[PATCH_MAX_CONTROL_VERTS]) @@ -208,7 +210,7 @@ ccl_device_inline int patch_eval_indices(KernelGlobals *kg, /* evaluate patch basis functions */ -ccl_device_inline void patch_eval_basis(KernelGlobals *kg, +ccl_device_inline void patch_eval_basis(const KernelGlobals *kg, const PatchHandle *handle, float u, float v, @@ -247,7 +249,7 @@ ccl_device_inline void patch_eval_basis(KernelGlobals *kg, /* generic function for evaluating indices and weights from patch coords */ -ccl_device_inline int patch_eval_control_verts(KernelGlobals *kg, +ccl_device_inline int patch_eval_control_verts(const KernelGlobals *kg, int 
object, int patch, float u, @@ -269,7 +271,7 @@ ccl_device_inline int patch_eval_control_verts(KernelGlobals *kg, /* functions for evaluating attributes on patches */ -ccl_device float patch_eval_float(KernelGlobals *kg, +ccl_device float patch_eval_float(const KernelGlobals *kg, const ShaderData *sd, int offset, int patch, @@ -306,7 +308,7 @@ ccl_device float patch_eval_float(KernelGlobals *kg, return val; } -ccl_device float2 patch_eval_float2(KernelGlobals *kg, +ccl_device float2 patch_eval_float2(const KernelGlobals *kg, const ShaderData *sd, int offset, int patch, @@ -343,7 +345,7 @@ ccl_device float2 patch_eval_float2(KernelGlobals *kg, return val; } -ccl_device float3 patch_eval_float3(KernelGlobals *kg, +ccl_device float3 patch_eval_float3(const KernelGlobals *kg, const ShaderData *sd, int offset, int patch, @@ -380,7 +382,7 @@ ccl_device float3 patch_eval_float3(KernelGlobals *kg, return val; } -ccl_device float4 patch_eval_float4(KernelGlobals *kg, +ccl_device float4 patch_eval_float4(const KernelGlobals *kg, const ShaderData *sd, int offset, int patch, @@ -417,7 +419,7 @@ ccl_device float4 patch_eval_float4(KernelGlobals *kg, return val; } -ccl_device float4 patch_eval_uchar4(KernelGlobals *kg, +ccl_device float4 patch_eval_uchar4(const KernelGlobals *kg, const ShaderData *sd, int offset, int patch, diff --git a/intern/cycles/kernel/geom/geom_primitive.h b/intern/cycles/kernel/geom/geom_primitive.h index aeb044c9ad3..ba31b12e817 100644 --- a/intern/cycles/kernel/geom/geom_primitive.h +++ b/intern/cycles/kernel/geom/geom_primitive.h @@ -19,6 +19,10 @@ * Generic functions to look up mesh, curve and volume primitive attributes for * shading and render passes. */ +#pragma once + +#include "kernel/kernel_projection.h" + CCL_NAMESPACE_BEGIN /* Surface Attributes @@ -27,8 +31,11 @@ CCL_NAMESPACE_BEGIN * attributes for performance, mainly for GPU performance to avoid bringing in * heavy volume interpolation code. 
*/ -ccl_device_inline float primitive_surface_attribute_float( - KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy) +ccl_device_inline float primitive_surface_attribute_float(const KernelGlobals *kg, + const ShaderData *sd, + const AttributeDescriptor desc, + float *dx, + float *dy) { if (sd->type & PRIMITIVE_ALL_TRIANGLE) { if (subd_triangle_patch(kg, sd) == ~0) @@ -50,7 +57,7 @@ ccl_device_inline float primitive_surface_attribute_float( } } -ccl_device_inline float2 primitive_surface_attribute_float2(KernelGlobals *kg, +ccl_device_inline float2 primitive_surface_attribute_float2(const KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float2 *dx, @@ -76,7 +83,7 @@ ccl_device_inline float2 primitive_surface_attribute_float2(KernelGlobals *kg, } } -ccl_device_inline float3 primitive_surface_attribute_float3(KernelGlobals *kg, +ccl_device_inline float3 primitive_surface_attribute_float3(const KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float3 *dx, @@ -102,11 +109,11 @@ ccl_device_inline float3 primitive_surface_attribute_float3(KernelGlobals *kg, } } -ccl_device_inline float4 primitive_surface_attribute_float4(KernelGlobals *kg, - const ShaderData *sd, - const AttributeDescriptor desc, - float4 *dx, - float4 *dy) +ccl_device_forceinline float4 primitive_surface_attribute_float4(const KernelGlobals *kg, + const ShaderData *sd, + const AttributeDescriptor desc, + float4 *dx, + float4 *dy) { if (sd->type & PRIMITIVE_ALL_TRIANGLE) { if (subd_triangle_patch(kg, sd) == ~0) @@ -141,7 +148,7 @@ ccl_device_inline bool primitive_is_volume_attribute(const ShaderData *sd, return sd->type == PRIMITIVE_VOLUME; } -ccl_device_inline float primitive_volume_attribute_float(KernelGlobals *kg, +ccl_device_inline float primitive_volume_attribute_float(const KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc) { @@ -153,7 +160,7 @@ ccl_device_inline float 
primitive_volume_attribute_float(KernelGlobals *kg, } } -ccl_device_inline float3 primitive_volume_attribute_float3(KernelGlobals *kg, +ccl_device_inline float3 primitive_volume_attribute_float3(const KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc) { @@ -165,7 +172,7 @@ ccl_device_inline float3 primitive_volume_attribute_float3(KernelGlobals *kg, } } -ccl_device_inline float4 primitive_volume_attribute_float4(KernelGlobals *kg, +ccl_device_inline float4 primitive_volume_attribute_float4(const KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc) { @@ -180,7 +187,7 @@ ccl_device_inline float4 primitive_volume_attribute_float4(KernelGlobals *kg, /* Default UV coordinate */ -ccl_device_inline float3 primitive_uv(KernelGlobals *kg, ShaderData *sd) +ccl_device_inline float3 primitive_uv(const KernelGlobals *kg, const ShaderData *sd) { const AttributeDescriptor desc = find_attribute(kg, sd, ATTR_STD_UV); @@ -193,7 +200,7 @@ ccl_device_inline float3 primitive_uv(KernelGlobals *kg, ShaderData *sd) /* Ptex coordinates */ -ccl_device bool primitive_ptex(KernelGlobals *kg, ShaderData *sd, float2 *uv, int *face_id) +ccl_device bool primitive_ptex(const KernelGlobals *kg, ShaderData *sd, float2 *uv, int *face_id) { /* storing ptex data as attributes is not memory efficient but simple for tests */ const AttributeDescriptor desc_face_id = find_attribute(kg, sd, ATTR_STD_PTEX_FACE_ID); @@ -213,7 +220,7 @@ ccl_device bool primitive_ptex(KernelGlobals *kg, ShaderData *sd, float2 *uv, in /* Surface tangent */ -ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd) +ccl_device float3 primitive_tangent(const KernelGlobals *kg, ShaderData *sd) { #ifdef __HAIR__ if (sd->type & PRIMITIVE_ALL_CURVE) @@ -245,7 +252,7 @@ ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd) /* Motion vector for motion pass */ -ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd) +ccl_device_inline 
float4 primitive_motion_vector(const KernelGlobals *kg, const ShaderData *sd) { /* center position */ float3 center; diff --git a/intern/cycles/kernel/geom/geom_shader_data.h b/intern/cycles/kernel/geom/geom_shader_data.h new file mode 100644 index 00000000000..fb2cb5cb1ea --- /dev/null +++ b/intern/cycles/kernel/geom/geom_shader_data.h @@ -0,0 +1,373 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Functions to initialize ShaderData given. + * + * Could be from an incoming ray, intersection or sampled position. */ + +#pragma once + +CCL_NAMESPACE_BEGIN + +/* ShaderData setup from incoming ray */ + +#ifdef __OBJECT_MOTION__ +ccl_device void shader_setup_object_transforms(const KernelGlobals *ccl_restrict kg, + ShaderData *ccl_restrict sd, + float time) +{ + if (sd->object_flag & SD_OBJECT_MOTION) { + sd->ob_tfm_motion = object_fetch_transform_motion(kg, sd->object, time); + sd->ob_itfm_motion = transform_quick_inverse(sd->ob_tfm_motion); + } +} +#endif + +/* TODO: break this up if it helps reduce register pressure to load data from + * global memory as we write it to shaderdata. */ +ccl_device_inline void shader_setup_from_ray(const KernelGlobals *ccl_restrict kg, + ShaderData *ccl_restrict sd, + const Ray *ccl_restrict ray, + const Intersection *ccl_restrict isect) +{ + /* Read intersection data into shader globals. 
+ * + * TODO: this is redundant, could potentially remove some of this from + * ShaderData but would need to ensure that it also works for shadow + * shader evaluation. */ + sd->u = isect->u; + sd->v = isect->v; + sd->ray_length = isect->t; + sd->type = isect->type; + sd->object = (isect->object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, isect->prim) : + isect->object; + sd->object_flag = kernel_tex_fetch(__object_flag, sd->object); + sd->prim = kernel_tex_fetch(__prim_index, isect->prim); + sd->lamp = LAMP_NONE; + sd->flag = 0; + + /* Read matrices and time. */ + sd->time = ray->time; + +#ifdef __OBJECT_MOTION__ + shader_setup_object_transforms(kg, sd, ray->time); +#endif + + /* Read ray data into shader globals. */ + sd->I = -ray->D; + +#ifdef __HAIR__ + if (sd->type & PRIMITIVE_ALL_CURVE) { + /* curve */ + curve_shader_setup(kg, sd, ray->P, ray->D, isect->t, isect->object, isect->prim); + } + else +#endif + if (sd->type & PRIMITIVE_TRIANGLE) { + /* static triangle */ + float3 Ng = triangle_normal(kg, sd); + sd->shader = kernel_tex_fetch(__tri_shader, sd->prim); + + /* vectors */ + sd->P = triangle_refine(kg, sd, ray->P, ray->D, isect->t, isect->object, isect->prim); + sd->Ng = Ng; + sd->N = Ng; + + /* smooth normal */ + if (sd->shader & SHADER_SMOOTH_NORMAL) + sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v); + +#ifdef __DPDU__ + /* dPdu/dPdv */ + triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv); +#endif + } + else { + /* motion triangle */ + motion_triangle_shader_setup( + kg, sd, ray->P, ray->D, isect->t, isect->object, isect->prim, false); + } + + sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags; + + if (isect->object != OBJECT_NONE) { + /* instance transform */ + object_normal_transform_auto(kg, sd, &sd->N); + object_normal_transform_auto(kg, sd, &sd->Ng); +#ifdef __DPDU__ + object_dir_transform_auto(kg, sd, &sd->dPdu); + object_dir_transform_auto(kg, sd, &sd->dPdv); +#endif + } + + /* backfacing test */ + 
bool backfacing = (dot(sd->Ng, sd->I) < 0.0f); + + if (backfacing) { + sd->flag |= SD_BACKFACING; + sd->Ng = -sd->Ng; + sd->N = -sd->N; +#ifdef __DPDU__ + sd->dPdu = -sd->dPdu; + sd->dPdv = -sd->dPdv; +#endif + } + +#ifdef __RAY_DIFFERENTIALS__ + /* differentials */ + differential_transfer_compact(&sd->dP, ray->dP, ray->D, ray->dD, sd->Ng, sd->ray_length); + differential_incoming_compact(&sd->dI, ray->D, ray->dD); + differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng); +#endif +} + +/* ShaderData setup from position sampled on mesh */ + +ccl_device_inline void shader_setup_from_sample(const KernelGlobals *ccl_restrict kg, + ShaderData *ccl_restrict sd, + const float3 P, + const float3 Ng, + const float3 I, + int shader, + int object, + int prim, + float u, + float v, + float t, + float time, + bool object_space, + int lamp) +{ + /* vectors */ + sd->P = P; + sd->N = Ng; + sd->Ng = Ng; + sd->I = I; + sd->shader = shader; + if (prim != PRIM_NONE) + sd->type = PRIMITIVE_TRIANGLE; + else if (lamp != LAMP_NONE) + sd->type = PRIMITIVE_LAMP; + else + sd->type = PRIMITIVE_NONE; + + /* primitive */ + sd->object = object; + sd->lamp = LAMP_NONE; + /* Currently no access to bvh prim index for strand sd->prim. 
*/ + sd->prim = prim; + sd->u = u; + sd->v = v; + sd->time = time; + sd->ray_length = t; + + sd->flag = kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags; + sd->object_flag = 0; + if (sd->object != OBJECT_NONE) { + sd->object_flag |= kernel_tex_fetch(__object_flag, sd->object); + +#ifdef __OBJECT_MOTION__ + shader_setup_object_transforms(kg, sd, time); +#endif + } + else if (lamp != LAMP_NONE) { + sd->lamp = lamp; + } + + /* transform into world space */ + if (object_space) { + object_position_transform_auto(kg, sd, &sd->P); + object_normal_transform_auto(kg, sd, &sd->Ng); + sd->N = sd->Ng; + object_dir_transform_auto(kg, sd, &sd->I); + } + + if (sd->type & PRIMITIVE_TRIANGLE) { + /* smooth normal */ + if (sd->shader & SHADER_SMOOTH_NORMAL) { + sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v); + + if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { + object_normal_transform_auto(kg, sd, &sd->N); + } + } + + /* dPdu/dPdv */ +#ifdef __DPDU__ + triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv); + + if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { + object_dir_transform_auto(kg, sd, &sd->dPdu); + object_dir_transform_auto(kg, sd, &sd->dPdv); + } +#endif + } + else { +#ifdef __DPDU__ + sd->dPdu = zero_float3(); + sd->dPdv = zero_float3(); +#endif + } + + /* backfacing test */ + if (sd->prim != PRIM_NONE) { + bool backfacing = (dot(sd->Ng, sd->I) < 0.0f); + + if (backfacing) { + sd->flag |= SD_BACKFACING; + sd->Ng = -sd->Ng; + sd->N = -sd->N; +#ifdef __DPDU__ + sd->dPdu = -sd->dPdu; + sd->dPdv = -sd->dPdv; +#endif + } + } + +#ifdef __RAY_DIFFERENTIALS__ + /* no ray differentials here yet */ + sd->dP = differential3_zero(); + sd->dI = differential3_zero(); + sd->du = differential_zero(); + sd->dv = differential_zero(); +#endif +} + +/* ShaderData setup for displacement */ + +ccl_device void shader_setup_from_displace(const KernelGlobals *ccl_restrict kg, + ShaderData *ccl_restrict sd, + int object, + int prim, + float u, + float v) 
+{ + float3 P, Ng, I = zero_float3(); + int shader; + + triangle_point_normal(kg, object, prim, u, v, &P, &Ng, &shader); + + /* force smooth shading for displacement */ + shader |= SHADER_SMOOTH_NORMAL; + + shader_setup_from_sample( + kg, + sd, + P, + Ng, + I, + shader, + object, + prim, + u, + v, + 0.0f, + 0.5f, + !(kernel_tex_fetch(__object_flag, object) & SD_OBJECT_TRANSFORM_APPLIED), + LAMP_NONE); +} + +/* ShaderData setup from ray into background */ + +ccl_device_inline void shader_setup_from_background(const KernelGlobals *ccl_restrict kg, + ShaderData *ccl_restrict sd, + const float3 ray_P, + const float3 ray_D, + const float ray_time) +{ + /* for NDC coordinates */ + sd->ray_P = ray_P; + + /* vectors */ + sd->P = ray_D; + sd->N = -ray_D; + sd->Ng = -ray_D; + sd->I = -ray_D; + sd->shader = kernel_data.background.surface_shader; + sd->flag = kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags; + sd->object_flag = 0; + sd->time = ray_time; + sd->ray_length = 0.0f; + + sd->object = OBJECT_NONE; + sd->lamp = LAMP_NONE; + sd->prim = PRIM_NONE; + sd->u = 0.0f; + sd->v = 0.0f; + +#ifdef __DPDU__ + /* dPdu/dPdv */ + sd->dPdu = zero_float3(); + sd->dPdv = zero_float3(); +#endif + +#ifdef __RAY_DIFFERENTIALS__ + /* differentials */ + sd->dP = differential3_zero(); /* TODO: ray->dP */ + differential_incoming(&sd->dI, sd->dP); + sd->du = differential_zero(); + sd->dv = differential_zero(); +#endif +} + +/* ShaderData setup from point inside volume */ + +#ifdef __VOLUME__ +ccl_device_inline void shader_setup_from_volume(const KernelGlobals *ccl_restrict kg, + ShaderData *ccl_restrict sd, + const Ray *ccl_restrict ray) +{ + + /* vectors */ + sd->P = ray->P; + sd->N = -ray->D; + sd->Ng = -ray->D; + sd->I = -ray->D; + sd->shader = SHADER_NONE; + sd->flag = 0; + sd->object_flag = 0; + sd->time = ray->time; + sd->ray_length = 0.0f; /* todo: can we set this to some useful value? 
*/ + + sd->object = OBJECT_NONE; /* todo: fill this for texture coordinates */ + sd->lamp = LAMP_NONE; + sd->prim = PRIM_NONE; + sd->type = PRIMITIVE_VOLUME; + + sd->u = 0.0f; + sd->v = 0.0f; + +# ifdef __DPDU__ + /* dPdu/dPdv */ + sd->dPdu = zero_float3(); + sd->dPdv = zero_float3(); +# endif + +# ifdef __RAY_DIFFERENTIALS__ + /* differentials */ + sd->dP = differential3_zero(); /* TODO ray->dD */ + differential_incoming(&sd->dI, sd->dP); + sd->du = differential_zero(); + sd->dv = differential_zero(); +# endif + + /* for NDC coordinates */ + sd->ray_P = ray->P; + sd->ray_dP = ray->dP; +} +#endif /* __VOLUME__ */ + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/geom/geom_subd_triangle.h b/intern/cycles/kernel/geom/geom_subd_triangle.h index 9eceb996926..877b2ece15b 100644 --- a/intern/cycles/kernel/geom/geom_subd_triangle.h +++ b/intern/cycles/kernel/geom/geom_subd_triangle.h @@ -16,18 +16,20 @@ /* Functions for retrieving attributes on triangles produced from subdivision meshes */ +#pragma once + CCL_NAMESPACE_BEGIN /* Patch index for triangle, -1 if not subdivision triangle */ -ccl_device_inline uint subd_triangle_patch(KernelGlobals *kg, const ShaderData *sd) +ccl_device_inline uint subd_triangle_patch(const KernelGlobals *kg, const ShaderData *sd) { return (sd->prim != PRIM_NONE) ? 
kernel_tex_fetch(__tri_patch, sd->prim) : ~0; } /* UV coords of triangle within patch */ -ccl_device_inline void subd_triangle_patch_uv(KernelGlobals *kg, +ccl_device_inline void subd_triangle_patch_uv(const KernelGlobals *kg, const ShaderData *sd, float2 uv[3]) { @@ -40,7 +42,7 @@ ccl_device_inline void subd_triangle_patch_uv(KernelGlobals *kg, /* Vertex indices of patch */ -ccl_device_inline uint4 subd_triangle_patch_indices(KernelGlobals *kg, int patch) +ccl_device_inline uint4 subd_triangle_patch_indices(const KernelGlobals *kg, int patch) { uint4 indices; @@ -54,21 +56,23 @@ ccl_device_inline uint4 subd_triangle_patch_indices(KernelGlobals *kg, int patch /* Originating face for patch */ -ccl_device_inline uint subd_triangle_patch_face(KernelGlobals *kg, int patch) +ccl_device_inline uint subd_triangle_patch_face(const KernelGlobals *kg, int patch) { return kernel_tex_fetch(__patches, patch + 4); } /* Number of corners on originating face */ -ccl_device_inline uint subd_triangle_patch_num_corners(KernelGlobals *kg, int patch) +ccl_device_inline uint subd_triangle_patch_num_corners(const KernelGlobals *kg, int patch) { return kernel_tex_fetch(__patches, patch + 5) & 0xffff; } /* Indices of the four corners that are used by the patch */ -ccl_device_inline void subd_triangle_patch_corners(KernelGlobals *kg, int patch, int corners[4]) +ccl_device_inline void subd_triangle_patch_corners(const KernelGlobals *kg, + int patch, + int corners[4]) { uint4 data; @@ -99,8 +103,11 @@ ccl_device_inline void subd_triangle_patch_corners(KernelGlobals *kg, int patch, /* Reading attributes on various subdivision triangle elements */ -ccl_device_noinline float subd_triangle_attribute_float( - KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy) +ccl_device_noinline float subd_triangle_attribute_float(const KernelGlobals *kg, + const ShaderData *sd, + const AttributeDescriptor desc, + float *dx, + float *dy) { int patch = 
subd_triangle_patch(kg, sd); @@ -235,7 +242,7 @@ ccl_device_noinline float subd_triangle_attribute_float( } } -ccl_device_noinline float2 subd_triangle_attribute_float2(KernelGlobals *kg, +ccl_device_noinline float2 subd_triangle_attribute_float2(const KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float2 *dx, @@ -378,7 +385,7 @@ ccl_device_noinline float2 subd_triangle_attribute_float2(KernelGlobals *kg, } } -ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, +ccl_device_noinline float3 subd_triangle_attribute_float3(const KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float3 *dx, @@ -520,7 +527,7 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, } } -ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals *kg, +ccl_device_noinline float4 subd_triangle_attribute_float4(const KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float4 *dx, diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h index ff7909ca425..910fb122c6d 100644 --- a/intern/cycles/kernel/geom/geom_triangle.h +++ b/intern/cycles/kernel/geom/geom_triangle.h @@ -20,10 +20,12 @@ * ray intersection we use a precomputed triangle storage to accelerate * intersection at the cost of more memory usage */ +#pragma once + CCL_NAMESPACE_BEGIN /* Normal on triangle. */ -ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd) +ccl_device_inline float3 triangle_normal(const KernelGlobals *kg, ShaderData *sd) { /* load triangle vertices */ const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); @@ -41,8 +43,14 @@ ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd) } /* Point and normal on triangle. 
*/ -ccl_device_inline void triangle_point_normal( - KernelGlobals *kg, int object, int prim, float u, float v, float3 *P, float3 *Ng, int *shader) +ccl_device_inline void triangle_point_normal(const KernelGlobals *kg, + int object, + int prim, + float u, + float v, + float3 *P, + float3 *Ng, + int *shader) { /* load triangle vertices */ const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); @@ -67,7 +75,7 @@ ccl_device_inline void triangle_point_normal( /* Triangle vertex locations */ -ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3]) +ccl_device_inline void triangle_vertices(const KernelGlobals *kg, int prim, float3 P[3]) { const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); P[0] = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w + 0)); @@ -77,7 +85,7 @@ ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3 /* Triangle vertex locations and vertex normals */ -ccl_device_inline void triangle_vertices_and_normals(KernelGlobals *kg, +ccl_device_inline void triangle_vertices_and_normals(const KernelGlobals *kg, int prim, float3 P[3], float3 N[3]) @@ -94,7 +102,7 @@ ccl_device_inline void triangle_vertices_and_normals(KernelGlobals *kg, /* Interpolate smooth vertex normal from vertices */ ccl_device_inline float3 -triangle_smooth_normal(KernelGlobals *kg, float3 Ng, int prim, float u, float v) +triangle_smooth_normal(const KernelGlobals *kg, float3 Ng, int prim, float u, float v) { /* load triangle vertices */ const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); @@ -108,7 +116,7 @@ triangle_smooth_normal(KernelGlobals *kg, float3 Ng, int prim, float u, float v) } ccl_device_inline float3 triangle_smooth_normal_unnormalized( - KernelGlobals *kg, ShaderData *sd, float3 Ng, int prim, float u, float v) + const KernelGlobals *kg, const ShaderData *sd, float3 Ng, int prim, float u, float v) { /* load triangle vertices */ const uint4 tri_vindex = 
kernel_tex_fetch(__tri_vindex, prim); @@ -130,7 +138,7 @@ ccl_device_inline float3 triangle_smooth_normal_unnormalized( /* Ray differentials on triangle */ -ccl_device_inline void triangle_dPdudv(KernelGlobals *kg, +ccl_device_inline void triangle_dPdudv(const KernelGlobals *kg, int prim, ccl_addr_space float3 *dPdu, ccl_addr_space float3 *dPdv) @@ -148,8 +156,11 @@ ccl_device_inline void triangle_dPdudv(KernelGlobals *kg, /* Reading attributes on various triangle elements */ -ccl_device float triangle_attribute_float( - KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy) +ccl_device float triangle_attribute_float(const KernelGlobals *kg, + const ShaderData *sd, + const AttributeDescriptor desc, + float *dx, + float *dy) { if (desc.element & (ATTR_ELEMENT_VERTEX | ATTR_ELEMENT_VERTEX_MOTION | ATTR_ELEMENT_CORNER)) { float f0, f1, f2; @@ -195,7 +206,7 @@ ccl_device float triangle_attribute_float( } } -ccl_device float2 triangle_attribute_float2(KernelGlobals *kg, +ccl_device float2 triangle_attribute_float2(const KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float2 *dx, @@ -245,7 +256,7 @@ ccl_device float2 triangle_attribute_float2(KernelGlobals *kg, } } -ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, +ccl_device float3 triangle_attribute_float3(const KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float3 *dx, @@ -295,7 +306,7 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, } } -ccl_device float4 triangle_attribute_float4(KernelGlobals *kg, +ccl_device float4 triangle_attribute_float4(const KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float4 *dx, diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h index b0cce274b94..30b77ebd2eb 100644 --- a/intern/cycles/kernel/geom/geom_triangle_intersect.h +++ 
b/intern/cycles/kernel/geom/geom_triangle_intersect.h @@ -20,12 +20,17 @@ * intersection at the cost of more memory usage. */ +#pragma once + +#include "kernel/kernel_random.h" + CCL_NAMESPACE_BEGIN -ccl_device_inline bool triangle_intersect(KernelGlobals *kg, +ccl_device_inline bool triangle_intersect(const KernelGlobals *kg, Intersection *isect, float3 P, float3 dir, + float tmax, uint visibility, int object, int prim_addr) @@ -41,7 +46,7 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg, float t, u, v; if (ray_triangle_intersect(P, dir, - isect->t, + tmax, #if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) ssef_verts, #else @@ -78,7 +83,7 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg, */ #ifdef __BVH_LOCAL__ -ccl_device_inline bool triangle_intersect_local(KernelGlobals *kg, +ccl_device_inline bool triangle_intersect_local(const KernelGlobals *kg, LocalIntersection *local_isect, float3 P, float3 dir, @@ -192,25 +197,20 @@ ccl_device_inline bool triangle_intersect_local(KernelGlobals *kg, * http://www.cs.virginia.edu/~gfx/Courses/2003/ImageSynthesis/papers/Acceleration/Fast%20MinimumStorage%20RayTriangle%20Intersection.pdf */ -ccl_device_inline float3 triangle_refine(KernelGlobals *kg, +ccl_device_inline float3 triangle_refine(const KernelGlobals *kg, ShaderData *sd, - const Intersection *isect, - const Ray *ray) + float3 P, + float3 D, + float t, + const int isect_object, + const int isect_prim) { - float3 P = ray->P; - float3 D = ray->D; - float t = isect->t; - #ifdef __INTERSECTION_REFINE__ - if (isect->object != OBJECT_NONE) { + if (isect_object != OBJECT_NONE) { if (UNLIKELY(t == 0.0f)) { return P; } -# ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_itfm; -# else - Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM); -# endif + const Transform tfm = object_get_inverse_transform(kg, sd); P = transform_point(&tfm, P); D = transform_direction(&tfm, D * t); @@ -219,7 +219,7 @@ ccl_device_inline 
float3 triangle_refine(KernelGlobals *kg, P = P + D * t; - const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect->prim); + const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect_prim); const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 0), tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 1), tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 2); @@ -239,13 +239,8 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg, P = P + D * rt; } - if (isect->object != OBJECT_NONE) { -# ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_tfm; -# else - Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM); -# endif - + if (isect_object != OBJECT_NONE) { + const Transform tfm = object_get_transform(kg, sd); P = transform_point(&tfm, P); } @@ -255,28 +250,23 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg, #endif } -/* Same as above, except that isect->t is assumed to be in object space for +/* Same as above, except that t is assumed to be in object space for * instancing. */ -ccl_device_inline float3 triangle_refine_local(KernelGlobals *kg, +ccl_device_inline float3 triangle_refine_local(const KernelGlobals *kg, ShaderData *sd, - const Intersection *isect, - const Ray *ray) + float3 P, + float3 D, + float t, + const int isect_object, + const int isect_prim) { #ifdef __KERNEL_OPTIX__ - /* isect->t is always in world space with OptiX. */ - return triangle_refine(kg, sd, isect, ray); + /* t is always in world space with OptiX. 
*/ + return triangle_refine(kg, sd, P, D, t, isect_object, isect_prim); #else - float3 P = ray->P; - float3 D = ray->D; - float t = isect->t; - - if (isect->object != OBJECT_NONE) { -# ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_itfm; -# else - Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM); -# endif + if (isect_object != OBJECT_NONE) { + const Transform tfm = object_get_inverse_transform(kg, sd); P = transform_point(&tfm, P); D = transform_direction(&tfm, D); @@ -286,7 +276,7 @@ ccl_device_inline float3 triangle_refine_local(KernelGlobals *kg, P = P + D * t; # ifdef __INTERSECTION_REFINE__ - const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect->prim); + const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect_prim); const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 0), tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 1), tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 2); @@ -307,13 +297,8 @@ ccl_device_inline float3 triangle_refine_local(KernelGlobals *kg, } # endif /* __INTERSECTION_REFINE__ */ - if (isect->object != OBJECT_NONE) { -# ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_tfm; -# else - Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM); -# endif - + if (isect_object != OBJECT_NONE) { + const Transform tfm = object_get_transform(kg, sd); P = transform_point(&tfm, P); } diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h index 809b76245ba..2bcd7e56b5f 100644 --- a/intern/cycles/kernel/geom/geom_volume.h +++ b/intern/cycles/kernel/geom/geom_volume.h @@ -23,13 +23,15 @@ * 3D voxel textures can be assigned as attributes per mesh, which means the * same shader can be used for volume objects with different densities, etc. 
*/ +#pragma once + CCL_NAMESPACE_BEGIN #ifdef __VOLUME__ /* Return position normalized to 0..1 in mesh bounds */ -ccl_device_inline float3 volume_normalized_position(KernelGlobals *kg, +ccl_device_inline float3 volume_normalized_position(const KernelGlobals *kg, const ShaderData *sd, float3 P) { @@ -68,7 +70,7 @@ ccl_device float3 volume_attribute_value_to_float3(const float4 value) } } -ccl_device float4 volume_attribute_float4(KernelGlobals *kg, +ccl_device float4 volume_attribute_float4(const KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc) { diff --git a/intern/cycles/kernel/integrator/integrator_init_from_bake.h b/intern/cycles/kernel/integrator/integrator_init_from_bake.h new file mode 100644 index 00000000000..4898ff936c6 --- /dev/null +++ b/intern/cycles/kernel/integrator/integrator_init_from_bake.h @@ -0,0 +1,181 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "kernel/kernel_accumulate.h" +#include "kernel/kernel_adaptive_sampling.h" +#include "kernel/kernel_camera.h" +#include "kernel/kernel_path_state.h" +#include "kernel/kernel_random.h" + +#include "kernel/geom/geom.h" + +CCL_NAMESPACE_BEGIN + +/* This helps with AA but it's not the real solution as it does not AA the geometry + * but it's better than nothing, thus committed. 
*/ +ccl_device_inline float bake_clamp_mirror_repeat(float u, float max) +{ + /* use mirror repeat (like opengl texture) so that if the barycentric + * coordinate goes past the end of the triangle it is not always clamped + * to the same value, gives ugly patterns */ + u /= max; + float fu = floorf(u); + u = u - fu; + + return ((((int)fu) & 1) ? 1.0f - u : u) * max; +} + +/* Return false to indicate that this pixel is finished. + * Used by CPU implementation to not attempt to sample pixel for multiple samples once its known + * that the pixel did converge. */ +ccl_device bool integrator_init_from_bake(INTEGRATOR_STATE_ARGS, + const ccl_global KernelWorkTile *ccl_restrict tile, + ccl_global float *render_buffer, + const int x, + const int y, + const int scheduled_sample) +{ + PROFILING_INIT(kg, PROFILING_RAY_SETUP); + + /* Initialize path state to give basic buffer access and allow early outputs. */ + path_state_init(INTEGRATOR_STATE_PASS, tile, x, y); + + /* Check whether the pixel has converged and should not be sampled anymore. */ + if (!kernel_need_sample_pixel(INTEGRATOR_STATE_PASS, render_buffer)) { + return false; + } + + /* Always count the sample, even if the camera sample will reject the ray. */ + const int sample = kernel_accum_sample(INTEGRATOR_STATE_PASS, render_buffer, scheduled_sample); + + /* Setup render buffers. */ + const int index = INTEGRATOR_STATE(path, render_pixel_index); + const int pass_stride = kernel_data.film.pass_stride; + render_buffer += index * pass_stride; + + ccl_global float *primitive = render_buffer + kernel_data.film.pass_bake_primitive; + ccl_global float *differential = render_buffer + kernel_data.film.pass_bake_differential; + + const int seed = __float_as_uint(primitive[0]); + int prim = __float_as_uint(primitive[1]); + if (prim == -1) { + return false; + } + + prim += kernel_data.bake.tri_offset; + + /* Random number generator. 
*/ + const uint rng_hash = hash_uint(seed) ^ kernel_data.integrator.seed; + + float filter_x, filter_y; + if (sample == 0) { + filter_x = filter_y = 0.5f; + } + else { + path_rng_2D(kg, rng_hash, sample, PRNG_FILTER_U, &filter_x, &filter_y); + } + + /* Initialize path state for path integration. */ + path_state_init_integrator(INTEGRATOR_STATE_PASS, sample, rng_hash); + + /* Barycentric UV with sub-pixel offset. */ + float u = primitive[2]; + float v = primitive[3]; + + float dudx = differential[0]; + float dudy = differential[1]; + float dvdx = differential[2]; + float dvdy = differential[3]; + + if (sample > 0) { + u = bake_clamp_mirror_repeat(u + dudx * (filter_x - 0.5f) + dudy * (filter_y - 0.5f), 1.0f); + v = bake_clamp_mirror_repeat(v + dvdx * (filter_x - 0.5f) + dvdy * (filter_y - 0.5f), + 1.0f - u); + } + + /* Position and normal on triangle. */ + float3 P, Ng; + int shader; + triangle_point_normal(kg, kernel_data.bake.object_index, prim, u, v, &P, &Ng, &shader); + shader &= SHADER_MASK; + + if (kernel_data.film.pass_background != PASS_UNUSED) { + /* Environment baking. */ + + /* Setup and write ray. */ + Ray ray ccl_optional_struct_init; + ray.P = zero_float3(); + ray.D = normalize(P); + ray.t = FLT_MAX; + ray.time = 0.5f; + ray.dP = differential_zero_compact(); + ray.dD = differential_zero_compact(); + integrator_state_write_ray(INTEGRATOR_STATE_PASS, &ray); + + /* Setup next kernel to execute. */ + INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND); + } + else { + /* Surface baking. */ + + /* Setup ray. */ + Ray ray ccl_optional_struct_init; + ray.P = P + Ng; + ray.D = -Ng; + ray.t = FLT_MAX; + ray.time = 0.5f; + + /* Setup differentials. */ + float3 dPdu, dPdv; + triangle_dPdudv(kg, prim, &dPdu, &dPdv); + differential3 dP; + dP.dx = dPdu * dudx + dPdv * dvdx; + dP.dy = dPdu * dudy + dPdv * dvdy; + ray.dP = differential_make_compact(dP); + ray.dD = differential_zero_compact(); + + /* Write ray. 
*/ + integrator_state_write_ray(INTEGRATOR_STATE_PASS, &ray); + + /* Setup and write intersection. */ + Intersection isect ccl_optional_struct_init; + isect.object = kernel_data.bake.object_index; + isect.prim = prim; + isect.u = u; + isect.v = v; + isect.t = 1.0f; + isect.type = PRIMITIVE_TRIANGLE; +#ifdef __EMBREE__ + isect.Ng = Ng; +#endif + integrator_state_write_isect(INTEGRATOR_STATE_PASS, &isect); + + /* Setup next kernel to execute. */ + const int shader_flags = kernel_tex_fetch(__shaders, shader).flags; + if ((shader_flags & SD_HAS_RAYTRACE) || (kernel_data.film.pass_ao != PASS_UNUSED)) { + INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader); + } + else { + INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader); + } + } + + return true; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/integrator/integrator_init_from_camera.h b/intern/cycles/kernel/integrator/integrator_init_from_camera.h new file mode 100644 index 00000000000..58e7bde4c94 --- /dev/null +++ b/intern/cycles/kernel/integrator/integrator_init_from_camera.h @@ -0,0 +1,120 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
 + */ + +#pragma once + +#include "kernel/kernel_accumulate.h" +#include "kernel/kernel_adaptive_sampling.h" +#include "kernel/kernel_camera.h" +#include "kernel/kernel_path_state.h" +#include "kernel/kernel_random.h" +#include "kernel/kernel_shadow_catcher.h" + +CCL_NAMESPACE_BEGIN + +ccl_device_inline void integrate_camera_sample(const KernelGlobals *ccl_restrict kg, + const int sample, + const int x, + const int y, + const uint rng_hash, + Ray *ray) +{ + /* Filter sampling. */ + float filter_u, filter_v; + + if (sample == 0) { + filter_u = 0.5f; + filter_v = 0.5f; + } + else { + path_rng_2D(kg, rng_hash, sample, PRNG_FILTER_U, &filter_u, &filter_v); + } + + /* Depth of field sampling. */ + float lens_u = 0.0f, lens_v = 0.0f; + if (kernel_data.cam.aperturesize > 0.0f) { + path_rng_2D(kg, rng_hash, sample, PRNG_LENS_U, &lens_u, &lens_v); + } + + /* Motion blur time sampling. */ + float time = 0.0f; +#ifdef __CAMERA_MOTION__ + if (kernel_data.cam.shuttertime != -1.0f) + time = path_rng_1D(kg, rng_hash, sample, PRNG_TIME); +#endif + + /* Generate camera ray. */ + camera_sample(kg, x, y, filter_u, filter_v, lens_u, lens_v, time, ray); +} + +/* Return false to indicate that this pixel is finished. + * Used by CPU implementation to not attempt to sample pixel for multiple samples once it's known + * that the pixel did converge. */ +ccl_device bool integrator_init_from_camera(INTEGRATOR_STATE_ARGS, + const ccl_global KernelWorkTile *ccl_restrict tile, + ccl_global float *render_buffer, + const int x, + const int y, + const int scheduled_sample) +{ + PROFILING_INIT(kg, PROFILING_RAY_SETUP); + + /* Initialize path state to give basic buffer access and allow early outputs. */ + path_state_init(INTEGRATOR_STATE_PASS, tile, x, y); + + /* Check whether the pixel has converged and should not be sampled anymore. */ + if (!kernel_need_sample_pixel(INTEGRATOR_STATE_PASS, render_buffer)) { + return false; + } + + /* Count the sample and get an effective sample for this pixel. 
+ * + * This logic allows to both count actual number of samples per pixel, and to add samples to this + * pixel after it was converged and samples were added somewhere else (in which case the + * `scheduled_sample` will be different from actual number of samples in this pixel). */ + const int sample = kernel_accum_sample(INTEGRATOR_STATE_PASS, render_buffer, scheduled_sample); + + /* Initialize random number seed for path. */ + const uint rng_hash = path_rng_hash_init(kg, sample, x, y); + + { + /* Generate camera ray. */ + Ray ray; + integrate_camera_sample(kg, sample, x, y, rng_hash, &ray); + if (ray.t == 0.0f) { + return true; + } + + /* Write camera ray to state. */ + integrator_state_write_ray(INTEGRATOR_STATE_PASS, &ray); + } + + /* Initialize path state for path integration. */ + path_state_init_integrator(INTEGRATOR_STATE_PASS, sample, rng_hash); + + /* Continue with intersect_closest kernel, optionally initializing volume + * stack before that if the camera may be inside a volume. */ + if (kernel_data.cam.is_inside_volume) { + INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK); + } + else { + INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST); + } + + return true; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/integrator/integrator_intersect_closest.h b/intern/cycles/kernel/integrator/integrator_intersect_closest.h new file mode 100644 index 00000000000..34ca6814534 --- /dev/null +++ b/intern/cycles/kernel/integrator/integrator_intersect_closest.h @@ -0,0 +1,248 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "kernel/kernel_differential.h" +#include "kernel/kernel_light.h" +#include "kernel/kernel_path_state.h" +#include "kernel/kernel_projection.h" +#include "kernel/kernel_shadow_catcher.h" + +#include "kernel/geom/geom.h" + +#include "kernel/bvh/bvh.h" + +CCL_NAMESPACE_BEGIN + +template<uint32_t current_kernel> +ccl_device_forceinline bool integrator_intersect_terminate(INTEGRATOR_STATE_ARGS, + const int shader_flags) +{ + + /* Optional AO bounce termination. + * We continue evaluating emissive/transparent surfaces and volumes, similar + * to direct lighting. Only if we know there are none can we terminate the + * path immediately. */ + if (path_state_ao_bounce(INTEGRATOR_STATE_PASS)) { + if (shader_flags & (SD_HAS_TRANSPARENT_SHADOW | SD_HAS_EMISSION)) { + INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT; + } + else if (!integrator_state_volume_stack_is_empty(INTEGRATOR_STATE_PASS)) { + INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_TERMINATE_AFTER_VOLUME; + } + else { + return true; + } + } + + /* Load random number state. */ + RNGState rng_state; + path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state); + + /* We perform path termination in this kernel to avoid launching shade_surface + * and evaluating the shader when not needed. Only for emission and transparent + * surfaces in front of emission do we need to evaluate the shader, since we + * perform MIS as part of indirect rays. 
*/ + const int path_flag = INTEGRATOR_STATE(path, flag); + const float probability = path_state_continuation_probability(INTEGRATOR_STATE_PASS, path_flag); + + if (probability != 1.0f) { + const float terminate = path_state_rng_1D(kg, &rng_state, PRNG_TERMINATE); + + if (probability == 0.0f || terminate >= probability) { + if (shader_flags & SD_HAS_EMISSION) { + /* Mark path to be terminated right after shader evaluation on the surface. */ + INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_TERMINATE_ON_NEXT_SURFACE; + } + else if (!integrator_state_volume_stack_is_empty(INTEGRATOR_STATE_PASS)) { + /* TODO: only do this for emissive volumes. */ + INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_TERMINATE_IN_NEXT_VOLUME; + } + else { + return true; + } + } + } + + return false; +} + +/* Note that current_kernel is a template value since making this a variable + * leads to poor performance with CUDA atomics. */ +template<uint32_t current_kernel> +ccl_device_forceinline void integrator_intersect_shader_next_kernel( + INTEGRATOR_STATE_ARGS, + const Intersection *ccl_restrict isect, + const int shader, + const int shader_flags) +{ + /* Note on scheduling. + * + * When there is no shadow catcher split the scheduling is simple: schedule surface shading with + * or without raytrace support, depending on the shader used. + * + * When there is a shadow catcher split the general idea is to have the following configuration: + * + * - Schedule surface shading kernel (with corresponding raytrace support) for the ray which + * will trace shadow catcher object. + * + * - When no alpha-over of approximate shadow catcher is needed, schedule surface shading for + * the matte ray. + * + * - Otherwise schedule background shading kernel, so that we have a background to alpha-over + * on. The background kernel will then schedule surface shading for the matte ray. + * + * Note that the splitting leaves kernel and sorting counters as-is, so use INIT semantic for + * the matte path. 
*/ + + const bool use_raytrace_kernel = ((shader_flags & SD_HAS_RAYTRACE) || + (kernel_data.film.pass_ao != PASS_UNUSED)); + + if (use_raytrace_kernel) { + INTEGRATOR_PATH_NEXT_SORTED( + current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader); + } + else { + INTEGRATOR_PATH_NEXT_SORTED(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader); + } + +#ifdef __SHADOW_CATCHER__ + const int object_flags = intersection_get_object_flags(kg, isect); + if (kernel_shadow_catcher_split(INTEGRATOR_STATE_PASS, object_flags)) { + if (kernel_data.film.use_approximate_shadow_catcher && !kernel_data.background.transparent) { + INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_SHADOW_CATCHER_BACKGROUND; + + if (use_raytrace_kernel) { + INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND); + } + else { + INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND); + } + } + else if (use_raytrace_kernel) { + INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader); + } + else { + INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader); + } + } +#endif +} + +ccl_device void integrator_intersect_closest(INTEGRATOR_STATE_ARGS) +{ + PROFILING_INIT(kg, PROFILING_INTERSECT_CLOSEST); + + /* Read ray from integrator state into local memory. */ + Ray ray ccl_optional_struct_init; + integrator_state_read_ray(INTEGRATOR_STATE_PASS, &ray); + kernel_assert(ray.t != 0.0f); + + const uint visibility = path_state_ray_visibility(INTEGRATOR_STATE_PASS); + const int last_isect_prim = INTEGRATOR_STATE(isect, prim); + const int last_isect_object = INTEGRATOR_STATE(isect, object); + + /* Trick to use short AO rays to approximate indirect light at the end of the path. */ + if (path_state_ao_bounce(INTEGRATOR_STATE_PASS)) { + ray.t = kernel_data.integrator.ao_bounces_distance; + + const int last_object = last_isect_object != OBJECT_NONE ? 
+ last_isect_object : + kernel_tex_fetch(__prim_object, last_isect_prim); + const float object_ao_distance = kernel_tex_fetch(__objects, last_object).ao_distance; + if (object_ao_distance != 0.0f) { + ray.t = object_ao_distance; + } + } + + /* Scene Intersection. */ + Intersection isect ccl_optional_struct_init; + bool hit = scene_intersect(kg, &ray, visibility, &isect); + + /* TODO: remove this and do it in the various intersection functions instead. */ + if (!hit) { + isect.prim = PRIM_NONE; + } + + /* Light intersection for MIS. */ + if (kernel_data.integrator.use_lamp_mis) { + /* NOTE: if we make lights visible to camera rays, we'll need to initialize + * these in the path_state_init. */ + const int last_type = INTEGRATOR_STATE(isect, type); + const int path_flag = INTEGRATOR_STATE(path, flag); + + hit = lights_intersect( + kg, &ray, &isect, last_isect_prim, last_isect_object, last_type, path_flag) || + hit; + } + + /* Write intersection result into global integrator state memory. */ + integrator_state_write_isect(INTEGRATOR_STATE_PASS, &isect); + +#ifdef __VOLUME__ + if (!integrator_state_volume_stack_is_empty(INTEGRATOR_STATE_PASS)) { + const bool hit_surface = hit && !(isect.type & PRIMITIVE_LAMP); + const int shader = (hit_surface) ? intersection_get_shader(kg, &isect) : SHADER_NONE; + const int flags = (hit_surface) ? kernel_tex_fetch(__shaders, shader).flags : 0; + + if (!integrator_intersect_terminate<DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST>( + INTEGRATOR_STATE_PASS, flags)) { + /* Continue with volume kernel if we are inside a volume, regardless + * if we hit anything. */ + INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST, + DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME); + } + else { + INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST); + } + return; + } +#endif + + if (hit) { + /* Hit a surface, continue with light or surface kernel. 
*/ + if (isect.type & PRIMITIVE_LAMP) { + INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST, + DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT); + return; + } + else { + /* Hit a surface, continue with surface kernel unless terminated. */ + const int shader = intersection_get_shader(kg, &isect); + const int flags = kernel_tex_fetch(__shaders, shader).flags; + + if (!integrator_intersect_terminate<DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST>( + INTEGRATOR_STATE_PASS, flags)) { + integrator_intersect_shader_next_kernel<DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST>( + INTEGRATOR_STATE_PASS, &isect, shader, flags); + return; + } + else { + INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST); + return; + } + } + } + else { + /* Nothing hit, continue with background kernel. */ + INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST, + DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND); + return; + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/integrator/integrator_intersect_shadow.h b/intern/cycles/kernel/integrator/integrator_intersect_shadow.h new file mode 100644 index 00000000000..5bd9cfda4a4 --- /dev/null +++ b/intern/cycles/kernel/integrator/integrator_intersect_shadow.h @@ -0,0 +1,144 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +CCL_NAMESPACE_BEGIN + +/* Visibility for the shadow ray. 
 */ +ccl_device_forceinline uint integrate_intersect_shadow_visibility(INTEGRATOR_STATE_CONST_ARGS) +{ + uint visibility = PATH_RAY_SHADOW; + +#ifdef __SHADOW_CATCHER__ + const uint32_t path_flag = INTEGRATOR_STATE(shadow_path, flag); + visibility = SHADOW_CATCHER_PATH_VISIBILITY(path_flag, visibility); +#endif + + return visibility; +} + +ccl_device bool integrate_intersect_shadow_opaque(INTEGRATOR_STATE_ARGS, + const Ray *ray, + const uint visibility) +{ + /* Mask which will pick only opaque visibility bits from the `visibility`. + * Calculate the mask at compile time: the visibility will either be high bits for the shadow + * catcher objects, or lower bits for the regular objects (there is no need to check the path + * state here again). */ + constexpr const uint opaque_mask = SHADOW_CATCHER_VISIBILITY_SHIFT(PATH_RAY_SHADOW_OPAQUE) | + PATH_RAY_SHADOW_OPAQUE; + + Intersection isect; + const bool opaque_hit = scene_intersect(kg, ray, visibility & opaque_mask, &isect); + + if (!opaque_hit) { + INTEGRATOR_STATE_WRITE(shadow_path, num_hits) = 0; + } + + return opaque_hit; +} + +ccl_device_forceinline int integrate_shadow_max_transparent_hits(INTEGRATOR_STATE_CONST_ARGS) +{ + const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce; + const int transparent_bounce = INTEGRATOR_STATE(shadow_path, transparent_bounce); + + return max(transparent_max_bounce - transparent_bounce - 1, 0); +} + +#ifdef __TRANSPARENT_SHADOWS__ +ccl_device bool integrate_intersect_shadow_transparent(INTEGRATOR_STATE_ARGS, + const Ray *ray, + const uint visibility) +{ + Intersection isect[INTEGRATOR_SHADOW_ISECT_SIZE]; + + /* Limit the number of hits to the max transparent bounces allowed and the size that we + * have available in the integrator state. 
*/ + const uint max_transparent_hits = integrate_shadow_max_transparent_hits(INTEGRATOR_STATE_PASS); + const uint max_hits = min(max_transparent_hits, (uint)INTEGRATOR_SHADOW_ISECT_SIZE); + uint num_hits = 0; + bool opaque_hit = scene_intersect_shadow_all(kg, ray, isect, visibility, max_hits, &num_hits); + + /* If number of hits exceed the transparent bounces limit, make opaque. */ + if (num_hits > max_transparent_hits) { + opaque_hit = true; + } + + if (!opaque_hit) { + uint num_recorded_hits = min(num_hits, max_hits); + + if (num_recorded_hits > 0) { + sort_intersections(isect, num_recorded_hits); + + /* Write intersection result into global integrator state memory. */ + for (int hit = 0; hit < num_recorded_hits; hit++) { + integrator_state_write_shadow_isect(INTEGRATOR_STATE_PASS, &isect[hit], hit); + } + } + + INTEGRATOR_STATE_WRITE(shadow_path, num_hits) = num_hits; + } + else { + INTEGRATOR_STATE_WRITE(shadow_path, num_hits) = 0; + } + + return opaque_hit; +} +#endif + +ccl_device void integrator_intersect_shadow(INTEGRATOR_STATE_ARGS) +{ + PROFILING_INIT(kg, PROFILING_INTERSECT_SHADOW); + + /* Read ray from integrator state into local memory. */ + Ray ray ccl_optional_struct_init; + integrator_state_read_shadow_ray(INTEGRATOR_STATE_PASS, &ray); + + /* Compute visibility. */ + const uint visibility = integrate_intersect_shadow_visibility(INTEGRATOR_STATE_PASS); + +#ifdef __TRANSPARENT_SHADOWS__ + /* TODO: compile different kernels depending on this? Especially for OptiX + * conditional trace calls are bad. */ + const bool opaque_hit = + (kernel_data.integrator.transparent_shadows) ? + integrate_intersect_shadow_transparent(INTEGRATOR_STATE_PASS, &ray, visibility) : + integrate_intersect_shadow_opaque(INTEGRATOR_STATE_PASS, &ray, visibility); +#else + const bool opaque_hit = integrate_intersect_shadow_opaque( + INTEGRATOR_STATE_PASS, &ray, visibility); +#endif + + if (opaque_hit) { + /* Hit an opaque surface, shadow path ends here. 
*/ + INTEGRATOR_SHADOW_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW); + return; + } + else { + /* Hit nothing or transparent surfaces, continue to shadow kernel + * for shading and render buffer output. + * + * TODO: could also write to render buffer directly if no transparent shadows? + * Could save a kernel execution for the common case. */ + INTEGRATOR_SHADOW_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, + DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW); + return; + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl b/intern/cycles/kernel/integrator/integrator_intersect_subsurface.h index c10ecc426c6..7c090952dc7 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl +++ b/intern/cycles/kernel/integrator/integrator_intersect_subsurface.h @@ -1,5 +1,5 @@ /* - * Copyright 2011-2017 Blender Foundation + * Copyright 2011-2021 Blender Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,16 +14,23 @@ * limitations under the License. 
*/ -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" +#pragma once -__kernel void kernel_ocl_path_trace_state_buffer_size( - ccl_global char *kg, - ccl_constant KernelData *data, - uint num_threads, - ccl_global uint64_t *size) +#include "kernel/integrator/integrator_subsurface.h" + +CCL_NAMESPACE_BEGIN + +ccl_device void integrator_intersect_subsurface(INTEGRATOR_STATE_ARGS) { - ((KernelGlobals*)kg)->data = data; - *size = split_data_buffer_size((KernelGlobals*)kg, num_threads); + PROFILING_INIT(kg, PROFILING_INTERSECT_SUBSURFACE); + +#ifdef __SUBSURFACE__ + if (subsurface_scatter(INTEGRATOR_STATE_PASS)) { + return; + } +#endif + + INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE); } +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/integrator/integrator_intersect_volume_stack.h b/intern/cycles/kernel/integrator/integrator_intersect_volume_stack.h new file mode 100644 index 00000000000..60d8a8e3e54 --- /dev/null +++ b/intern/cycles/kernel/integrator/integrator_intersect_volume_stack.h @@ -0,0 +1,198 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "kernel/bvh/bvh.h" +#include "kernel/geom/geom.h" +#include "kernel/integrator/integrator_volume_stack.h" +#include "kernel/kernel_shader.h" + +CCL_NAMESPACE_BEGIN + +ccl_device void integrator_volume_stack_update_for_subsurface(INTEGRATOR_STATE_ARGS, + const float3 from_P, + const float3 to_P) +{ + PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME_STACK); + + ShaderDataTinyStorage stack_sd_storage; + ShaderData *stack_sd = AS_SHADER_DATA(&stack_sd_storage); + + kernel_assert(kernel_data.integrator.use_volumes); + + Ray volume_ray ccl_optional_struct_init; + volume_ray.P = from_P; + volume_ray.D = normalize_len(to_P - from_P, &volume_ray.t); + +#ifdef __VOLUME_RECORD_ALL__ + Intersection hits[2 * VOLUME_STACK_SIZE + 1]; + uint num_hits = scene_intersect_volume_all( + kg, &volume_ray, hits, 2 * VOLUME_STACK_SIZE, PATH_RAY_ALL_VISIBILITY); + if (num_hits > 0) { + Intersection *isect = hits; + + qsort(hits, num_hits, sizeof(Intersection), intersections_compare); + + for (uint hit = 0; hit < num_hits; ++hit, ++isect) { + shader_setup_from_ray(kg, stack_sd, &volume_ray, isect); + volume_stack_enter_exit(INTEGRATOR_STATE_PASS, stack_sd); + } + } +#else + Intersection isect; + int step = 0; + while (step < 2 * VOLUME_STACK_SIZE && + scene_intersect_volume(kg, &volume_ray, &isect, PATH_RAY_ALL_VISIBILITY)) { + shader_setup_from_ray(kg, stack_sd, &volume_ray, &isect); + volume_stack_enter_exit(INTEGRATOR_STATE_PASS, stack_sd); + + /* Move ray forward. 
*/ + volume_ray.P = ray_offset(stack_sd->P, -stack_sd->Ng); + if (volume_ray.t != FLT_MAX) { + volume_ray.D = normalize_len(to_P - volume_ray.P, &volume_ray.t); + } + ++step; + } +#endif +} + +ccl_device void integrator_intersect_volume_stack(INTEGRATOR_STATE_ARGS) +{ + PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME_STACK); + + ShaderDataTinyStorage stack_sd_storage; + ShaderData *stack_sd = AS_SHADER_DATA(&stack_sd_storage); + + Ray volume_ray ccl_optional_struct_init; + integrator_state_read_ray(INTEGRATOR_STATE_PASS, &volume_ray); + volume_ray.t = FLT_MAX; + + const uint visibility = (INTEGRATOR_STATE(path, flag) & PATH_RAY_ALL_VISIBILITY); + int stack_index = 0, enclosed_index = 0; + + /* Write background shader. */ + if (kernel_data.background.volume_shader != SHADER_NONE) { + const VolumeStack new_entry = {OBJECT_NONE, kernel_data.background.volume_shader}; + integrator_state_write_volume_stack(INTEGRATOR_STATE_PASS, stack_index, new_entry); + stack_index++; + } + +#ifdef __VOLUME_RECORD_ALL__ + Intersection hits[2 * VOLUME_STACK_SIZE + 1]; + uint num_hits = scene_intersect_volume_all( + kg, &volume_ray, hits, 2 * VOLUME_STACK_SIZE, visibility); + if (num_hits > 0) { + int enclosed_volumes[VOLUME_STACK_SIZE]; + Intersection *isect = hits; + + qsort(hits, num_hits, sizeof(Intersection), intersections_compare); + + for (uint hit = 0; hit < num_hits; ++hit, ++isect) { + shader_setup_from_ray(kg, stack_sd, &volume_ray, isect); + if (stack_sd->flag & SD_BACKFACING) { + bool need_add = true; + for (int i = 0; i < enclosed_index && need_add; ++i) { + /* If ray exited the volume and never entered to that volume + * it means that camera is inside such a volume. + */ + if (enclosed_volumes[i] == stack_sd->object) { + need_add = false; + } + } + for (int i = 0; i < stack_index && need_add; ++i) { + /* Don't add intersections twice. 
*/ + VolumeStack entry = integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i); + if (entry.object == stack_sd->object) { + need_add = false; + break; + } + } + if (need_add && stack_index < VOLUME_STACK_SIZE - 1) { + const VolumeStack new_entry = {stack_sd->object, stack_sd->shader}; + integrator_state_write_volume_stack(INTEGRATOR_STATE_PASS, stack_index, new_entry); + ++stack_index; + } + } + else { + /* If ray from camera enters the volume, this volume shouldn't + * be added to the stack on exit. + */ + enclosed_volumes[enclosed_index++] = stack_sd->object; + } + } + } +#else + int enclosed_volumes[VOLUME_STACK_SIZE]; + int step = 0; + + while (stack_index < VOLUME_STACK_SIZE - 1 && enclosed_index < VOLUME_STACK_SIZE - 1 && + step < 2 * VOLUME_STACK_SIZE) { + Intersection isect; + if (!scene_intersect_volume(kg, &volume_ray, &isect, visibility)) { + break; + } + + shader_setup_from_ray(kg, stack_sd, &volume_ray, &isect); + if (stack_sd->flag & SD_BACKFACING) { + /* If ray exited the volume and never entered to that volume + * it means that camera is inside such a volume. + */ + bool need_add = true; + for (int i = 0; i < enclosed_index && need_add; ++i) { + /* If ray exited the volume and never entered to that volume + * it means that camera is inside such a volume. + */ + if (enclosed_volumes[i] == stack_sd->object) { + need_add = false; + } + } + for (int i = 0; i < stack_index && need_add; ++i) { + /* Don't add intersections twice. */ + VolumeStack entry = integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i); + if (entry.object == stack_sd->object) { + need_add = false; + break; + } + } + if (need_add) { + const VolumeStack new_entry = {stack_sd->object, stack_sd->shader}; + integrator_state_write_volume_stack(INTEGRATOR_STATE_PASS, stack_index, new_entry); + ++stack_index; + } + } + else { + /* If ray from camera enters the volume, this volume shouldn't + * be added to the stack on exit. 
+ */ + enclosed_volumes[enclosed_index++] = stack_sd->object; + } + + /* Move ray forward. */ + volume_ray.P = ray_offset(stack_sd->P, -stack_sd->Ng); + ++step; + } +#endif + + /* Write terminator. */ + const VolumeStack new_entry = {OBJECT_NONE, SHADER_NONE}; + integrator_state_write_volume_stack(INTEGRATOR_STATE_PASS, stack_index, new_entry); + + INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK, + DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/integrator/integrator_megakernel.h b/intern/cycles/kernel/integrator/integrator_megakernel.h new file mode 100644 index 00000000000..91363ea1c7f --- /dev/null +++ b/intern/cycles/kernel/integrator/integrator_megakernel.h @@ -0,0 +1,93 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "kernel/integrator/integrator_init_from_camera.h" +#include "kernel/integrator/integrator_intersect_closest.h" +#include "kernel/integrator/integrator_intersect_shadow.h" +#include "kernel/integrator/integrator_intersect_subsurface.h" +#include "kernel/integrator/integrator_intersect_volume_stack.h" +#include "kernel/integrator/integrator_shade_background.h" +#include "kernel/integrator/integrator_shade_light.h" +#include "kernel/integrator/integrator_shade_shadow.h" +#include "kernel/integrator/integrator_shade_surface.h" +#include "kernel/integrator/integrator_shade_volume.h" + +CCL_NAMESPACE_BEGIN + +ccl_device void integrator_megakernel(INTEGRATOR_STATE_ARGS, + ccl_global float *ccl_restrict render_buffer) +{ + /* Each kernel indicates the next kernel to execute, so here we simply + * have to check what that kernel is and execute it. + * + * TODO: investigate if we can use device side enqueue for GPUs to avoid + * having to compile this big kernel. */ + while (true) { + if (INTEGRATOR_STATE(shadow_path, queued_kernel)) { + /* First handle any shadow paths before we potentially create more shadow paths. */ + switch (INTEGRATOR_STATE(shadow_path, queued_kernel)) { + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW: + integrator_intersect_shadow(INTEGRATOR_STATE_PASS); + break; + case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW: + integrator_shade_shadow(INTEGRATOR_STATE_PASS, render_buffer); + break; + default: + kernel_assert(0); + break; + } + } + else if (INTEGRATOR_STATE(path, queued_kernel)) { + /* Then handle regular path kernels. 
*/ + switch (INTEGRATOR_STATE(path, queued_kernel)) { + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST: + integrator_intersect_closest(INTEGRATOR_STATE_PASS); + break; + case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND: + integrator_shade_background(INTEGRATOR_STATE_PASS, render_buffer); + break; + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE: + integrator_shade_surface(INTEGRATOR_STATE_PASS, render_buffer); + break; + case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME: + integrator_shade_volume(INTEGRATOR_STATE_PASS, render_buffer); + break; + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE: + integrator_shade_surface_raytrace(INTEGRATOR_STATE_PASS, render_buffer); + break; + case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT: + integrator_shade_light(INTEGRATOR_STATE_PASS, render_buffer); + break; + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE: + integrator_intersect_subsurface(INTEGRATOR_STATE_PASS); + break; + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: + integrator_intersect_volume_stack(INTEGRATOR_STATE_PASS); + break; + default: + kernel_assert(0); + break; + } + } + else { + break; + } + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/integrator/integrator_shade_background.h b/intern/cycles/kernel/integrator/integrator_shade_background.h new file mode 100644 index 00000000000..3e4cc837e9b --- /dev/null +++ b/intern/cycles/kernel/integrator/integrator_shade_background.h @@ -0,0 +1,215 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "kernel/kernel_accumulate.h" +#include "kernel/kernel_emission.h" +#include "kernel/kernel_light.h" +#include "kernel/kernel_shader.h" + +CCL_NAMESPACE_BEGIN + +ccl_device float3 integrator_eval_background_shader(INTEGRATOR_STATE_ARGS, + ccl_global float *ccl_restrict render_buffer) +{ +#ifdef __BACKGROUND__ + const int shader = kernel_data.background.surface_shader; + const uint32_t path_flag = INTEGRATOR_STATE(path, flag); + + /* Use visibility flag to skip lights. */ + if (shader & SHADER_EXCLUDE_ANY) { + if (((shader & SHADER_EXCLUDE_DIFFUSE) && (path_flag & PATH_RAY_DIFFUSE)) || + ((shader & SHADER_EXCLUDE_GLOSSY) && ((path_flag & (PATH_RAY_GLOSSY | PATH_RAY_REFLECT)) == + (PATH_RAY_GLOSSY | PATH_RAY_REFLECT))) || + ((shader & SHADER_EXCLUDE_TRANSMIT) && (path_flag & PATH_RAY_TRANSMIT)) || + ((shader & SHADER_EXCLUDE_CAMERA) && (path_flag & PATH_RAY_CAMERA)) || + ((shader & SHADER_EXCLUDE_SCATTER) && (path_flag & PATH_RAY_VOLUME_SCATTER))) + return zero_float3(); + } + + /* Use fast constant background color if available. */ + float3 L = zero_float3(); + if (!shader_constant_emission_eval(kg, shader, &L)) { + /* Evaluate background shader. */ + + /* TODO: does aliasing like this break automatic SoA in CUDA? + * Should we instead store closures separate from ShaderData? 
*/ + ShaderDataTinyStorage emission_sd_storage; + ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage); + + PROFILING_INIT_FOR_SHADER(kg, PROFILING_SHADE_LIGHT_SETUP); + shader_setup_from_background(kg, + emission_sd, + INTEGRATOR_STATE(ray, P), + INTEGRATOR_STATE(ray, D), + INTEGRATOR_STATE(ray, time)); + + PROFILING_SHADER(emission_sd->object, emission_sd->shader); + PROFILING_EVENT(PROFILING_SHADE_LIGHT_EVAL); + shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_LIGHT>( + INTEGRATOR_STATE_PASS, emission_sd, render_buffer, path_flag | PATH_RAY_EMISSION); + + L = shader_background_eval(emission_sd); + } + + /* Background MIS weights. */ +# ifdef __BACKGROUND_MIS__ + /* Check if background light exists or if we should skip pdf. */ + if (!(INTEGRATOR_STATE(path, flag) & PATH_RAY_MIS_SKIP) && kernel_data.background.use_mis) { + const float3 ray_P = INTEGRATOR_STATE(ray, P); + const float3 ray_D = INTEGRATOR_STATE(ray, D); + const float mis_ray_pdf = INTEGRATOR_STATE(path, mis_ray_pdf); + const float mis_ray_t = INTEGRATOR_STATE(path, mis_ray_t); + + /* multiple importance sampling, get background light pdf for ray + * direction, and compute weight with respect to BSDF pdf */ + const float pdf = background_light_pdf(kg, ray_P - ray_D * mis_ray_t, ray_D); + const float mis_weight = power_heuristic(mis_ray_pdf, pdf); + + L *= mis_weight; + } +# endif + + return L; +#else + return make_float3(0.8f, 0.8f, 0.8f); +#endif +} + +ccl_device_inline void integrate_background(INTEGRATOR_STATE_ARGS, + ccl_global float *ccl_restrict render_buffer) +{ + /* Accumulate transparency for transparent background. We can skip background + * shader evaluation unless a background pass is used. 
*/ + bool eval_background = true; + float transparent = 0.0f; + + const bool is_transparent_background_ray = kernel_data.background.transparent && + (INTEGRATOR_STATE(path, flag) & + PATH_RAY_TRANSPARENT_BACKGROUND); + + if (is_transparent_background_ray) { + transparent = average(INTEGRATOR_STATE(path, throughput)); + +#ifdef __PASSES__ + eval_background = (kernel_data.film.light_pass_flag & PASSMASK(BACKGROUND)); +#else + eval_background = false; +#endif + } + + /* Evaluate background shader. */ + float3 L = (eval_background) ? + integrator_eval_background_shader(INTEGRATOR_STATE_PASS, render_buffer) : + zero_float3(); + + /* When using the ao bounces approximation, adjust background + * shader intensity with ao factor. */ + if (path_state_ao_bounce(INTEGRATOR_STATE_PASS)) { + L *= kernel_data.integrator.ao_bounces_factor; + } + + /* Write to render buffer. */ + kernel_accum_background( + INTEGRATOR_STATE_PASS, L, transparent, is_transparent_background_ray, render_buffer); +} + +ccl_device_inline void integrate_distant_lights(INTEGRATOR_STATE_ARGS, + ccl_global float *ccl_restrict render_buffer) +{ + const float3 ray_D = INTEGRATOR_STATE(ray, D); + const float ray_time = INTEGRATOR_STATE(ray, time); + LightSample ls ccl_optional_struct_init; + for (int lamp = 0; lamp < kernel_data.integrator.num_all_lights; lamp++) { + if (light_sample_from_distant_ray(kg, ray_D, lamp, &ls)) { + /* Use visibility flag to skip lights. 
*/ +#ifdef __PASSES__ + const uint32_t path_flag = INTEGRATOR_STATE(path, flag); + + if (ls.shader & SHADER_EXCLUDE_ANY) { + if (((ls.shader & SHADER_EXCLUDE_DIFFUSE) && (path_flag & PATH_RAY_DIFFUSE)) || + ((ls.shader & SHADER_EXCLUDE_GLOSSY) && + ((path_flag & (PATH_RAY_GLOSSY | PATH_RAY_REFLECT)) == + (PATH_RAY_GLOSSY | PATH_RAY_REFLECT))) || + ((ls.shader & SHADER_EXCLUDE_TRANSMIT) && (path_flag & PATH_RAY_TRANSMIT)) || + ((ls.shader & SHADER_EXCLUDE_CAMERA) && (path_flag & PATH_RAY_CAMERA)) || + ((ls.shader & SHADER_EXCLUDE_SCATTER) && (path_flag & PATH_RAY_VOLUME_SCATTER))) + return; + } +#endif + + /* Evaluate light shader. */ + /* TODO: does aliasing like this break automatic SoA in CUDA? */ + ShaderDataTinyStorage emission_sd_storage; + ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage); + float3 light_eval = light_sample_shader_eval( + INTEGRATOR_STATE_PASS, emission_sd, &ls, ray_time); + if (is_zero(light_eval)) { + return; + } + + /* MIS weighting. */ + if (!(path_flag & PATH_RAY_MIS_SKIP)) { + /* multiple importance sampling, get regular light pdf, + * and compute weight with respect to BSDF pdf */ + const float mis_ray_pdf = INTEGRATOR_STATE(path, mis_ray_pdf); + const float mis_weight = power_heuristic(mis_ray_pdf, ls.pdf); + light_eval *= mis_weight; + } + + /* Write to render buffer. */ + const float3 throughput = INTEGRATOR_STATE(path, throughput); + kernel_accum_emission(INTEGRATOR_STATE_PASS, throughput, light_eval, render_buffer); + } + } +} + +ccl_device void integrator_shade_background(INTEGRATOR_STATE_ARGS, + ccl_global float *ccl_restrict render_buffer) +{ + PROFILING_INIT(kg, PROFILING_SHADE_LIGHT_SETUP); + + /* TODO: unify these in a single loop to only have a single shader evaluation call. 
*/ + integrate_distant_lights(INTEGRATOR_STATE_PASS, render_buffer); + integrate_background(INTEGRATOR_STATE_PASS, render_buffer); + +#ifdef __SHADOW_CATCHER__ + if (INTEGRATOR_STATE(path, flag) & PATH_RAY_SHADOW_CATCHER_BACKGROUND) { + INTEGRATOR_STATE_WRITE(path, flag) &= ~PATH_RAY_SHADOW_CATCHER_BACKGROUND; + + const int isect_prim = INTEGRATOR_STATE(isect, prim); + const int shader = intersection_get_shader_from_isect_prim(kg, isect_prim); + const int shader_flags = kernel_tex_fetch(__shaders, shader).flags; + + if ((shader_flags & SD_HAS_RAYTRACE) || (kernel_data.film.pass_ao != PASS_UNUSED)) { + INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND, + DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, + shader); + } + else { + INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND, + DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, + shader); + } + return; + } +#endif + + INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/integrator/integrator_shade_light.h b/intern/cycles/kernel/integrator/integrator_shade_light.h new file mode 100644 index 00000000000..05b530f9665 --- /dev/null +++ b/intern/cycles/kernel/integrator/integrator_shade_light.h @@ -0,0 +1,126 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "kernel/kernel_accumulate.h" +#include "kernel/kernel_emission.h" +#include "kernel/kernel_light.h" +#include "kernel/kernel_shader.h" + +CCL_NAMESPACE_BEGIN + +ccl_device_inline void integrate_light(INTEGRATOR_STATE_ARGS, + ccl_global float *ccl_restrict render_buffer) +{ + /* Setup light sample. */ + Intersection isect ccl_optional_struct_init; + integrator_state_read_isect(INTEGRATOR_STATE_PASS, &isect); + + float3 ray_P = INTEGRATOR_STATE(ray, P); + const float3 ray_D = INTEGRATOR_STATE(ray, D); + const float ray_time = INTEGRATOR_STATE(ray, time); + + /* Advance ray beyond light. */ + /* TODO: can we make this more numerically robust to avoid reintersecting the + * same light in some cases? */ + const float3 new_ray_P = ray_offset(ray_P + ray_D * isect.t, ray_D); + INTEGRATOR_STATE_WRITE(ray, P) = new_ray_P; + INTEGRATOR_STATE_WRITE(ray, t) -= isect.t; + + /* Set position to where the BSDF was sampled, for correct MIS PDF. */ + const float mis_ray_t = INTEGRATOR_STATE(path, mis_ray_t); + ray_P -= ray_D * mis_ray_t; + isect.t += mis_ray_t; + INTEGRATOR_STATE_WRITE(path, mis_ray_t) = mis_ray_t + isect.t; + + LightSample ls ccl_optional_struct_init; + const bool use_light_sample = light_sample_from_intersection(kg, &isect, ray_P, ray_D, &ls); + + if (!use_light_sample) { + return; + } + + /* Use visibility flag to skip lights. */ +#ifdef __PASSES__ + const uint32_t path_flag = INTEGRATOR_STATE(path, flag); + + if (ls.shader & SHADER_EXCLUDE_ANY) { + if (((ls.shader & SHADER_EXCLUDE_DIFFUSE) && (path_flag & PATH_RAY_DIFFUSE)) || + ((ls.shader & SHADER_EXCLUDE_GLOSSY) && + ((path_flag & (PATH_RAY_GLOSSY | PATH_RAY_REFLECT)) == + (PATH_RAY_GLOSSY | PATH_RAY_REFLECT))) || + ((ls.shader & SHADER_EXCLUDE_TRANSMIT) && (path_flag & PATH_RAY_TRANSMIT)) || + ((ls.shader & SHADER_EXCLUDE_SCATTER) && (path_flag & PATH_RAY_VOLUME_SCATTER))) + return; + } +#endif + + /* Evaluate light shader. 
*/ + /* TODO: does aliasing like this break automatic SoA in CUDA? */ + ShaderDataTinyStorage emission_sd_storage; + ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage); + float3 light_eval = light_sample_shader_eval(INTEGRATOR_STATE_PASS, emission_sd, &ls, ray_time); + if (is_zero(light_eval)) { + return; + } + + /* MIS weighting. */ + if (!(path_flag & PATH_RAY_MIS_SKIP)) { + /* multiple importance sampling, get regular light pdf, + * and compute weight with respect to BSDF pdf */ + const float mis_ray_pdf = INTEGRATOR_STATE(path, mis_ray_pdf); + const float mis_weight = power_heuristic(mis_ray_pdf, ls.pdf); + light_eval *= mis_weight; + } + + /* Write to render buffer. */ + const float3 throughput = INTEGRATOR_STATE(path, throughput); + kernel_accum_emission(INTEGRATOR_STATE_PASS, throughput, light_eval, render_buffer); +} + +ccl_device void integrator_shade_light(INTEGRATOR_STATE_ARGS, + ccl_global float *ccl_restrict render_buffer) +{ + PROFILING_INIT(kg, PROFILING_SHADE_LIGHT_SETUP); + + integrate_light(INTEGRATOR_STATE_PASS, render_buffer); + + /* TODO: we could get stuck in an infinite loop if there are precision issues + * and the same light is hit again. + * + * As a workaround count this as a transparent bounce. It makes some sense + * to interpret lights as transparent surfaces (and support making them opaque), + * but this needs to be revisited. 
*/ + uint32_t transparent_bounce = INTEGRATOR_STATE(path, transparent_bounce) + 1; + INTEGRATOR_STATE_WRITE(path, transparent_bounce) = transparent_bounce; + + if (transparent_bounce >= kernel_data.integrator.transparent_max_bounce) { + INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT); + return; + } + else { + INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT, + DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST); + return; + } + + /* TODO: in some cases we could continue directly to SHADE_BACKGROUND, but + * probably that optimization is probably not practical if we add lights to + * scene geometry. */ +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/integrator/integrator_shade_shadow.h b/intern/cycles/kernel/integrator/integrator_shade_shadow.h new file mode 100644 index 00000000000..fd3c3ae1653 --- /dev/null +++ b/intern/cycles/kernel/integrator/integrator_shade_shadow.h @@ -0,0 +1,182 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "kernel/integrator/integrator_shade_volume.h" +#include "kernel/integrator/integrator_volume_stack.h" + +#include "kernel/kernel_shader.h" + +CCL_NAMESPACE_BEGIN + +ccl_device_inline bool shadow_intersections_has_remaining(const int num_hits) +{ + return num_hits >= INTEGRATOR_SHADOW_ISECT_SIZE; +} + +#ifdef __TRANSPARENT_SHADOWS__ +ccl_device_inline float3 integrate_transparent_surface_shadow(INTEGRATOR_STATE_ARGS, const int hit) +{ + PROFILING_INIT(kg, PROFILING_SHADE_SHADOW_SURFACE); + + /* TODO: does aliasing like this break automatic SoA in CUDA? + * Should we instead store closures separate from ShaderData? + * + * TODO: is it better to declare this outside the loop or keep it local + * so the compiler can see there is no dependency between iterations? */ + ShaderDataTinyStorage shadow_sd_storage; + ShaderData *shadow_sd = AS_SHADER_DATA(&shadow_sd_storage); + + /* Setup shader data at surface. */ + Intersection isect ccl_optional_struct_init; + integrator_state_read_shadow_isect(INTEGRATOR_STATE_PASS, &isect, hit); + + Ray ray ccl_optional_struct_init; + integrator_state_read_shadow_ray(INTEGRATOR_STATE_PASS, &ray); + + shader_setup_from_ray(kg, shadow_sd, &ray, &isect); + + /* Evaluate shader. */ + if (!(shadow_sd->flag & SD_HAS_ONLY_VOLUME)) { + shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_SHADOW>( + INTEGRATOR_STATE_PASS, shadow_sd, NULL, PATH_RAY_SHADOW); + } + +# ifdef __VOLUME__ + /* Exit/enter volume. */ + shadow_volume_stack_enter_exit(INTEGRATOR_STATE_PASS, shadow_sd); +# endif + + /* Compute transparency from closures. */ + return shader_bsdf_transparency(kg, shadow_sd); +} + +# ifdef __VOLUME__ +ccl_device_inline void integrate_transparent_volume_shadow(INTEGRATOR_STATE_ARGS, + const int hit, + const int num_recorded_hits, + float3 *ccl_restrict throughput) +{ + PROFILING_INIT(kg, PROFILING_SHADE_SHADOW_VOLUME); + + /* TODO: deduplicate with surface, or does it not matter for memory usage? 
*/ + ShaderDataTinyStorage shadow_sd_storage; + ShaderData *shadow_sd = AS_SHADER_DATA(&shadow_sd_storage); + + /* Setup shader data. */ + Ray ray ccl_optional_struct_init; + integrator_state_read_shadow_ray(INTEGRATOR_STATE_PASS, &ray); + + /* Modify ray position and length to match current segment. */ + const float start_t = (hit == 0) ? 0.0f : INTEGRATOR_STATE_ARRAY(shadow_isect, hit - 1, t); + const float end_t = (hit < num_recorded_hits) ? INTEGRATOR_STATE_ARRAY(shadow_isect, hit, t) : + ray.t; + ray.P += start_t * ray.D; + ray.t = end_t - start_t; + + shader_setup_from_volume(kg, shadow_sd, &ray); + + const float step_size = volume_stack_step_size(INTEGRATOR_STATE_PASS, [=](const int i) { + return integrator_state_read_shadow_volume_stack(INTEGRATOR_STATE_PASS, i); + }); + + volume_shadow_heterogeneous(INTEGRATOR_STATE_PASS, &ray, shadow_sd, throughput, step_size); +} +# endif + +ccl_device_inline bool integrate_transparent_shadow(INTEGRATOR_STATE_ARGS, const int num_hits) +{ + /* Accumulate shadow for transparent surfaces. */ + const int num_recorded_hits = min(num_hits, INTEGRATOR_SHADOW_ISECT_SIZE); + + for (int hit = 0; hit < num_recorded_hits + 1; hit++) { + /* Volume shaders. */ + if (hit < num_recorded_hits || !shadow_intersections_has_remaining(num_hits)) { +# ifdef __VOLUME__ + if (!integrator_state_shadow_volume_stack_is_empty(INTEGRATOR_STATE_PASS)) { + float3 throughput = INTEGRATOR_STATE(shadow_path, throughput); + integrate_transparent_volume_shadow( + INTEGRATOR_STATE_PASS, hit, num_recorded_hits, &throughput); + if (is_zero(throughput)) { + return true; + } + + INTEGRATOR_STATE_WRITE(shadow_path, throughput) = throughput; + } +# endif + } + + /* Surface shaders. 
*/ + if (hit < num_recorded_hits) { + const float3 shadow = integrate_transparent_surface_shadow(INTEGRATOR_STATE_PASS, hit); + const float3 throughput = INTEGRATOR_STATE(shadow_path, throughput) * shadow; + if (is_zero(throughput)) { + return true; + } + + INTEGRATOR_STATE_WRITE(shadow_path, throughput) = throughput; + INTEGRATOR_STATE_WRITE(shadow_path, transparent_bounce) += 1; + } + + /* Note we do not need to check max_transparent_bounce here, the number + * of intersections is already limited and made opaque in the + * INTERSECT_SHADOW kernel. */ + } + + if (shadow_intersections_has_remaining(num_hits)) { + /* There are more hits that we could not recorded due to memory usage, + * adjust ray to intersect again from the last hit. */ + const float last_hit_t = INTEGRATOR_STATE_ARRAY(shadow_isect, num_recorded_hits - 1, t); + const float3 ray_P = INTEGRATOR_STATE(shadow_ray, P); + const float3 ray_D = INTEGRATOR_STATE(shadow_ray, D); + INTEGRATOR_STATE_WRITE(shadow_ray, P) = ray_offset(ray_P + last_hit_t * ray_D, ray_D); + INTEGRATOR_STATE_WRITE(shadow_ray, t) -= last_hit_t; + } + + return false; +} +#endif /* __TRANSPARENT_SHADOWS__ */ + +ccl_device void integrator_shade_shadow(INTEGRATOR_STATE_ARGS, + ccl_global float *ccl_restrict render_buffer) +{ + PROFILING_INIT(kg, PROFILING_SHADE_SHADOW_SETUP); + const int num_hits = INTEGRATOR_STATE(shadow_path, num_hits); + +#ifdef __TRANSPARENT_SHADOWS__ + /* Evaluate transparent shadows. */ + const bool opaque = integrate_transparent_shadow(INTEGRATOR_STATE_PASS, num_hits); + if (opaque) { + INTEGRATOR_SHADOW_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW); + return; + } +#endif + + if (shadow_intersections_has_remaining(num_hits)) { + /* More intersections to find, continue shadow ray. 
*/ + INTEGRATOR_SHADOW_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW, + DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW); + return; + } + else { + kernel_accum_light(INTEGRATOR_STATE_PASS, render_buffer); + INTEGRATOR_SHADOW_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW); + return; + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/integrator/integrator_shade_surface.h b/intern/cycles/kernel/integrator/integrator_shade_surface.h new file mode 100644 index 00000000000..73b7cad32be --- /dev/null +++ b/intern/cycles/kernel/integrator/integrator_shade_surface.h @@ -0,0 +1,502 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "kernel/kernel_accumulate.h" +#include "kernel/kernel_emission.h" +#include "kernel/kernel_light.h" +#include "kernel/kernel_passes.h" +#include "kernel/kernel_path_state.h" +#include "kernel/kernel_shader.h" + +#include "kernel/integrator/integrator_subsurface.h" +#include "kernel/integrator/integrator_volume_stack.h" + +CCL_NAMESPACE_BEGIN + +ccl_device_forceinline void integrate_surface_shader_setup(INTEGRATOR_STATE_CONST_ARGS, + ShaderData *sd) +{ + Intersection isect ccl_optional_struct_init; + integrator_state_read_isect(INTEGRATOR_STATE_PASS, &isect); + + Ray ray ccl_optional_struct_init; + integrator_state_read_ray(INTEGRATOR_STATE_PASS, &ray); + + shader_setup_from_ray(kg, sd, &ray, &isect); +} + +#ifdef __HOLDOUT__ +ccl_device_forceinline bool integrate_surface_holdout(INTEGRATOR_STATE_CONST_ARGS, + ShaderData *sd, + ccl_global float *ccl_restrict render_buffer) +{ + /* Write holdout transparency to render buffer and stop if fully holdout. */ + const uint32_t path_flag = INTEGRATOR_STATE(path, flag); + + if (((sd->flag & SD_HOLDOUT) || (sd->object_flag & SD_OBJECT_HOLDOUT_MASK)) && + (path_flag & PATH_RAY_TRANSPARENT_BACKGROUND)) { + const float3 holdout_weight = shader_holdout_apply(kg, sd); + if (kernel_data.background.transparent) { + const float3 throughput = INTEGRATOR_STATE(path, throughput); + const float transparent = average(holdout_weight * throughput); + kernel_accum_transparent(INTEGRATOR_STATE_PASS, transparent, render_buffer); + } + if (isequal_float3(holdout_weight, one_float3())) { + return false; + } + } + + return true; +} +#endif /* __HOLDOUT__ */ + +#ifdef __EMISSION__ +ccl_device_forceinline void integrate_surface_emission(INTEGRATOR_STATE_CONST_ARGS, + const ShaderData *sd, + ccl_global float *ccl_restrict + render_buffer) +{ + const uint32_t path_flag = INTEGRATOR_STATE(path, flag); + + /* Evaluate emissive closure. 
*/ + float3 L = shader_emissive_eval(sd); + +# ifdef __HAIR__ + if (!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS) && + (sd->type & PRIMITIVE_ALL_TRIANGLE)) +# else + if (!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS)) +# endif + { + const float bsdf_pdf = INTEGRATOR_STATE(path, mis_ray_pdf); + const float t = sd->ray_length + INTEGRATOR_STATE(path, mis_ray_t); + + /* Multiple importance sampling, get triangle light pdf, + * and compute weight with respect to BSDF pdf. */ + float pdf = triangle_light_pdf(kg, sd, t); + float mis_weight = power_heuristic(bsdf_pdf, pdf); + + L *= mis_weight; + } + + const float3 throughput = INTEGRATOR_STATE(path, throughput); + kernel_accum_emission(INTEGRATOR_STATE_PASS, throughput, L, render_buffer); +} +#endif /* __EMISSION__ */ + +#ifdef __EMISSION__ +/* Path tracing: sample point on light and evaluate light shader, then + * queue shadow ray to be traced. */ +ccl_device_forceinline void integrate_surface_direct_light(INTEGRATOR_STATE_ARGS, + ShaderData *sd, + const RNGState *rng_state) +{ + /* Test if there is a light or BSDF that needs direct light. */ + if (!(kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL))) { + return; + } + + /* Sample position on a light. */ + LightSample ls ccl_optional_struct_init; + { + const int path_flag = INTEGRATOR_STATE(path, flag); + const uint bounce = INTEGRATOR_STATE(path, bounce); + float light_u, light_v; + path_state_rng_2D(kg, rng_state, PRNG_LIGHT_U, &light_u, &light_v); + + if (!light_distribution_sample_from_position( + kg, light_u, light_v, sd->time, sd->P, bounce, path_flag, &ls)) { + return; + } + } + + kernel_assert(ls.pdf != 0.0f); + + /* Evaluate light shader. + * + * TODO: can we reuse sd memory? In theory we can move this after + * integrate_surface_bounce, evaluate the BSDF, and only then evaluate + * the light shader. This could also move to its own kernel, for + * non-constant light sources. 
*/ + ShaderDataTinyStorage emission_sd_storage; + ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage); + const float3 light_eval = light_sample_shader_eval( + INTEGRATOR_STATE_PASS, emission_sd, &ls, sd->time); + if (is_zero(light_eval)) { + return; + } + + /* Evaluate BSDF. */ + const bool is_transmission = shader_bsdf_is_transmission(sd, ls.D); + + BsdfEval bsdf_eval ccl_optional_struct_init; + const float bsdf_pdf = shader_bsdf_eval(kg, sd, ls.D, is_transmission, &bsdf_eval, ls.shader); + bsdf_eval_mul3(&bsdf_eval, light_eval / ls.pdf); + + if (ls.shader & SHADER_USE_MIS) { + const float mis_weight = power_heuristic(ls.pdf, bsdf_pdf); + bsdf_eval_mul(&bsdf_eval, mis_weight); + } + + /* Path termination. */ + const float terminate = path_state_rng_light_termination(kg, rng_state); + if (light_sample_terminate(kg, &ls, &bsdf_eval, terminate)) { + return; + } + + /* Create shadow ray. */ + Ray ray ccl_optional_struct_init; + light_sample_to_surface_shadow_ray(kg, sd, &ls, &ray); + const bool is_light = light_sample_is_light(&ls); + + /* Copy volume stack and enter/exit volume. */ + integrator_state_copy_volume_stack_to_shadow(INTEGRATOR_STATE_PASS); + + if (is_transmission) { +# ifdef __VOLUME__ + shadow_volume_stack_enter_exit(INTEGRATOR_STATE_PASS, sd); +# endif + } + + /* Write shadow ray and associated state to global memory. */ + integrator_state_write_shadow_ray(INTEGRATOR_STATE_PASS, &ray); + + /* Copy state from main path to shadow path. */ + const uint16_t bounce = INTEGRATOR_STATE(path, bounce); + const uint16_t transparent_bounce = INTEGRATOR_STATE(path, transparent_bounce); + uint32_t shadow_flag = INTEGRATOR_STATE(path, flag); + shadow_flag |= (is_light) ? PATH_RAY_SHADOW_FOR_LIGHT : 0; + shadow_flag |= (is_transmission) ? 
PATH_RAY_TRANSMISSION_PASS : PATH_RAY_REFLECT_PASS; + const float3 throughput = INTEGRATOR_STATE(path, throughput) * bsdf_eval_sum(&bsdf_eval); + + if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) { + const float3 diffuse_glossy_ratio = (bounce == 0) ? + bsdf_eval_diffuse_glossy_ratio(&bsdf_eval) : + INTEGRATOR_STATE(path, diffuse_glossy_ratio); + INTEGRATOR_STATE_WRITE(shadow_path, diffuse_glossy_ratio) = diffuse_glossy_ratio; + } + + INTEGRATOR_STATE_WRITE(shadow_path, flag) = shadow_flag; + INTEGRATOR_STATE_WRITE(shadow_path, bounce) = bounce; + INTEGRATOR_STATE_WRITE(shadow_path, transparent_bounce) = transparent_bounce; + INTEGRATOR_STATE_WRITE(shadow_path, throughput) = throughput; + + if (kernel_data.kernel_features & KERNEL_FEATURE_SHADOW_PASS) { + INTEGRATOR_STATE_WRITE(shadow_path, unshadowed_throughput) = throughput; + } + + /* Branch off shadow kernel. */ + INTEGRATOR_SHADOW_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW); +} +#endif + +/* Path tracing: bounce off or through surface with new direction. */ +ccl_device_forceinline int integrate_surface_bsdf_bssrdf_bounce(INTEGRATOR_STATE_ARGS, + ShaderData *sd, + const RNGState *rng_state) +{ + /* Sample BSDF or BSSRDF. */ + if (!(sd->flag & (SD_BSDF | SD_BSSRDF))) { + return LABEL_NONE; + } + + float bsdf_u, bsdf_v; + path_state_rng_2D(kg, rng_state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); + const ShaderClosure *sc = shader_bsdf_bssrdf_pick(sd, &bsdf_u); + +#ifdef __SUBSURFACE__ + /* BSSRDF closure, we schedule subsurface intersection kernel. */ + if (CLOSURE_IS_BSSRDF(sc->type)) { + return subsurface_bounce(INTEGRATOR_STATE_PASS, sd, sc); + } +#endif + + /* BSDF closure, sample direction. 
*/ + float bsdf_pdf; + BsdfEval bsdf_eval ccl_optional_struct_init; + float3 bsdf_omega_in ccl_optional_struct_init; + differential3 bsdf_domega_in ccl_optional_struct_init; + int label; + + label = shader_bsdf_sample_closure( + kg, sd, sc, bsdf_u, bsdf_v, &bsdf_eval, &bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf); + + if (bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval)) { + return LABEL_NONE; + } + + /* Setup ray. Note that clipping works through transparent bounces. */ + INTEGRATOR_STATE_WRITE(ray, P) = ray_offset(sd->P, (label & LABEL_TRANSMIT) ? -sd->Ng : sd->Ng); + INTEGRATOR_STATE_WRITE(ray, D) = normalize(bsdf_omega_in); + INTEGRATOR_STATE_WRITE(ray, t) = (label & LABEL_TRANSPARENT) ? + INTEGRATOR_STATE(ray, t) - sd->ray_length : + FLT_MAX; + +#ifdef __RAY_DIFFERENTIALS__ + INTEGRATOR_STATE_WRITE(ray, dP) = differential_make_compact(sd->dP); + INTEGRATOR_STATE_WRITE(ray, dD) = differential_make_compact(bsdf_domega_in); +#endif + + /* Update throughput. */ + float3 throughput = INTEGRATOR_STATE(path, throughput); + throughput *= bsdf_eval_sum(&bsdf_eval) / bsdf_pdf; + INTEGRATOR_STATE_WRITE(path, throughput) = throughput; + + if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) { + if (INTEGRATOR_STATE(path, bounce) == 0) { + INTEGRATOR_STATE_WRITE(path, + diffuse_glossy_ratio) = bsdf_eval_diffuse_glossy_ratio(&bsdf_eval); + } + } + + /* Update path state */ + if (label & LABEL_TRANSPARENT) { + INTEGRATOR_STATE_WRITE(path, mis_ray_t) += sd->ray_length; + } + else { + INTEGRATOR_STATE_WRITE(path, mis_ray_pdf) = bsdf_pdf; + INTEGRATOR_STATE_WRITE(path, mis_ray_t) = 0.0f; + INTEGRATOR_STATE_WRITE(path, min_ray_pdf) = fminf(bsdf_pdf, + INTEGRATOR_STATE(path, min_ray_pdf)); + } + + path_state_next(INTEGRATOR_STATE_PASS, label); + return label; +} + +#ifdef __VOLUME__ +ccl_device_forceinline bool integrate_surface_volume_only_bounce(INTEGRATOR_STATE_ARGS, + ShaderData *sd) +{ + if (!path_state_volume_next(INTEGRATOR_STATE_PASS)) { + return LABEL_NONE; + } 
+ + /* Setup ray position, direction stays unchanged. */ + INTEGRATOR_STATE_WRITE(ray, P) = ray_offset(sd->P, -sd->Ng); + + /* Clipping works through transparent. */ + INTEGRATOR_STATE_WRITE(ray, t) -= sd->ray_length; + +# ifdef __RAY_DIFFERENTIALS__ + INTEGRATOR_STATE_WRITE(ray, dP) = differential_make_compact(sd->dP); +# endif + + INTEGRATOR_STATE_WRITE(path, mis_ray_t) += sd->ray_length; + + return LABEL_TRANSMIT | LABEL_TRANSPARENT; +} +#endif + +#if defined(__AO__) && defined(__SHADER_RAYTRACE__) +ccl_device_forceinline void integrate_surface_ao_pass(INTEGRATOR_STATE_CONST_ARGS, + const ShaderData *ccl_restrict sd, + const RNGState *ccl_restrict rng_state, + ccl_global float *ccl_restrict render_buffer) +{ +# ifdef __KERNEL_OPTIX__ + optixDirectCall<void>(2, INTEGRATOR_STATE_PASS, sd, rng_state, render_buffer); +} + +extern "C" __device__ void __direct_callable__ao_pass(INTEGRATOR_STATE_CONST_ARGS, + const ShaderData *ccl_restrict sd, + const RNGState *ccl_restrict rng_state, + ccl_global float *ccl_restrict render_buffer) +{ +# endif /* __KERNEL_OPTIX__ */ + float bsdf_u, bsdf_v; + path_state_rng_2D(kg, rng_state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); + + const float3 ao_N = shader_bsdf_ao_normal(kg, sd); + float3 ao_D; + float ao_pdf; + sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); + + if (dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) { + Ray ray ccl_optional_struct_init; + ray.P = ray_offset(sd->P, sd->Ng); + ray.D = ao_D; + ray.t = kernel_data.integrator.ao_bounces_distance; + ray.time = sd->time; + ray.dP = differential_zero_compact(); + ray.dD = differential_zero_compact(); + + Intersection isect ccl_optional_struct_init; + if (!scene_intersect(kg, &ray, PATH_RAY_SHADOW_OPAQUE, &isect)) { + ccl_global float *buffer = kernel_pass_pixel_render_buffer(INTEGRATOR_STATE_PASS, + render_buffer); + const float3 throughput = INTEGRATOR_STATE(path, throughput); + kernel_write_pass_float3(buffer + kernel_data.film.pass_ao, throughput); + } + } +} +#endif /* 
defined(__AO__) && defined(__SHADER_RAYTRACE__) */ + +template<uint node_feature_mask> +ccl_device bool integrate_surface(INTEGRATOR_STATE_ARGS, + ccl_global float *ccl_restrict render_buffer) + +{ + PROFILING_INIT_FOR_SHADER(kg, PROFILING_SHADE_SURFACE_SETUP); + + /* Setup shader data. */ + ShaderData sd; + integrate_surface_shader_setup(INTEGRATOR_STATE_PASS, &sd); + PROFILING_SHADER(sd.object, sd.shader); + + int continue_path_label = 0; + + /* Skip most work for volume bounding surface. */ +#ifdef __VOLUME__ + if (!(sd.flag & SD_HAS_ONLY_VOLUME)) { +#endif + + { + const int path_flag = INTEGRATOR_STATE(path, flag); +#ifdef __SUBSURFACE__ + /* Can skip shader evaluation for BSSRDF exit point without bump mapping. */ + if (!(path_flag & PATH_RAY_SUBSURFACE) || ((sd.flag & SD_HAS_BSSRDF_BUMP))) +#endif + { + /* Evaluate shader. */ + PROFILING_EVENT(PROFILING_SHADE_SURFACE_EVAL); + shader_eval_surface<node_feature_mask>( + INTEGRATOR_STATE_PASS, &sd, render_buffer, path_flag); + } + } + +#ifdef __SUBSURFACE__ + if (INTEGRATOR_STATE(path, flag) & PATH_RAY_SUBSURFACE) { + /* When coming from inside subsurface scattering, setup a diffuse + * closure to perform lighting at the exit point. */ + INTEGRATOR_STATE_WRITE(path, flag) &= ~PATH_RAY_SUBSURFACE; + subsurface_shader_data_setup(INTEGRATOR_STATE_PASS, &sd); + } +#endif + + shader_prepare_surface_closures(INTEGRATOR_STATE_PASS, &sd); + +#ifdef __HOLDOUT__ + /* Evaluate holdout. */ + if (!integrate_surface_holdout(INTEGRATOR_STATE_PASS, &sd, render_buffer)) { + return false; + } +#endif + +#ifdef __EMISSION__ + /* Write emission. */ + if (sd.flag & SD_EMISSION) { + integrate_surface_emission(INTEGRATOR_STATE_PASS, &sd, render_buffer); + } +#endif + +#ifdef __PASSES__ + /* Write render passes. */ + PROFILING_EVENT(PROFILING_SHADE_SURFACE_PASSES); + kernel_write_data_passes(INTEGRATOR_STATE_PASS, &sd, render_buffer); +#endif + + /* Load random number state. 
*/ + RNGState rng_state; + path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state); + + /* Perform path termination. Most paths have already been terminated in + * the intersect_closest kernel, this is just for emission and for dividing + * throughput by the probability at the right moment. */ + const int path_flag = INTEGRATOR_STATE(path, flag); + const float probability = (path_flag & PATH_RAY_TERMINATE_ON_NEXT_SURFACE) ? + 0.0f : + path_state_continuation_probability(INTEGRATOR_STATE_PASS, + path_flag); + if (probability == 0.0f) { + return false; + } + else if (probability != 1.0f) { + INTEGRATOR_STATE_WRITE(path, throughput) /= probability; + } + +#ifdef __DENOISING_FEATURES__ + kernel_write_denoising_features_surface(INTEGRATOR_STATE_PASS, &sd, render_buffer); +#endif + +#ifdef __SHADOW_CATCHER__ + kernel_write_shadow_catcher_bounce_data(INTEGRATOR_STATE_PASS, &sd, render_buffer); +#endif + + /* Direct light. */ + PROFILING_EVENT(PROFILING_SHADE_SURFACE_DIRECT_LIGHT); + integrate_surface_direct_light(INTEGRATOR_STATE_PASS, &sd, &rng_state); + +#if defined(__AO__) && defined(__SHADER_RAYTRACE__) + /* Ambient occlusion pass. */ + if (node_feature_mask & KERNEL_FEATURE_NODE_RAYTRACE) { + if ((kernel_data.film.pass_ao != PASS_UNUSED) && + (INTEGRATOR_STATE(path, flag) & PATH_RAY_CAMERA)) { + PROFILING_EVENT(PROFILING_SHADE_SURFACE_AO); + integrate_surface_ao_pass(INTEGRATOR_STATE_PASS, &sd, &rng_state, render_buffer); + } + } +#endif + + PROFILING_EVENT(PROFILING_SHADE_SURFACE_INDIRECT_LIGHT); + continue_path_label = integrate_surface_bsdf_bssrdf_bounce( + INTEGRATOR_STATE_PASS, &sd, &rng_state); +#ifdef __VOLUME__ + } + else { + PROFILING_EVENT(PROFILING_SHADE_SURFACE_INDIRECT_LIGHT); + continue_path_label = integrate_surface_volume_only_bounce(INTEGRATOR_STATE_PASS, &sd); + } + + if (continue_path_label & LABEL_TRANSMIT) { + /* Enter/Exit volume. 
*/ + volume_stack_enter_exit(INTEGRATOR_STATE_PASS, &sd); + } +#endif + + return continue_path_label != 0; +} + +template<uint node_feature_mask = KERNEL_FEATURE_NODE_MASK_SURFACE & ~KERNEL_FEATURE_NODE_RAYTRACE, + int current_kernel = DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE> +ccl_device_forceinline void integrator_shade_surface(INTEGRATOR_STATE_ARGS, + ccl_global float *ccl_restrict render_buffer) +{ + if (integrate_surface<node_feature_mask>(INTEGRATOR_STATE_PASS, render_buffer)) { + if (INTEGRATOR_STATE(path, flag) & PATH_RAY_SUBSURFACE) { + INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE); + } + else { + kernel_assert(INTEGRATOR_STATE(ray, t) != 0.0f); + INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST); + } + } + else { + INTEGRATOR_PATH_TERMINATE(current_kernel); + } +} + +ccl_device_forceinline void integrator_shade_surface_raytrace( + INTEGRATOR_STATE_ARGS, ccl_global float *ccl_restrict render_buffer) +{ + integrator_shade_surface<KERNEL_FEATURE_NODE_MASK_SURFACE, + DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE>(INTEGRATOR_STATE_PASS, + render_buffer); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/integrator/integrator_shade_volume.h b/intern/cycles/kernel/integrator/integrator_shade_volume.h new file mode 100644 index 00000000000..4a864b1e6ce --- /dev/null +++ b/intern/cycles/kernel/integrator/integrator_shade_volume.h @@ -0,0 +1,1015 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "kernel/kernel_accumulate.h" +#include "kernel/kernel_emission.h" +#include "kernel/kernel_light.h" +#include "kernel/kernel_passes.h" +#include "kernel/kernel_path_state.h" +#include "kernel/kernel_shader.h" + +#include "kernel/integrator/integrator_intersect_closest.h" +#include "kernel/integrator/integrator_volume_stack.h" + +CCL_NAMESPACE_BEGIN + +#ifdef __VOLUME__ + +/* Events for probalistic scattering */ + +typedef enum VolumeIntegrateEvent { + VOLUME_PATH_SCATTERED = 0, + VOLUME_PATH_ATTENUATED = 1, + VOLUME_PATH_MISSED = 2 +} VolumeIntegrateEvent; + +typedef struct VolumeIntegrateResult { + /* Throughput and offset for direct light scattering. */ + bool direct_scatter; + float3 direct_throughput; + float direct_t; + ShaderVolumePhases direct_phases; + + /* Throughput and offset for indirect light scattering. */ + bool indirect_scatter; + float3 indirect_throughput; + float indirect_t; + ShaderVolumePhases indirect_phases; +} VolumeIntegrateResult; + +/* Ignore paths that have volume throughput below this value, to avoid unnecessary work + * and precision issues. + * todo: this value could be tweaked or turned into a probability to avoid unnecessary + * work in volumes and subsurface scattering. */ +# define VOLUME_THROUGHPUT_EPSILON 1e-6f + +/* Volume shader properties + * + * extinction coefficient = absorption coefficient + scattering coefficient + * sigma_t = sigma_a + sigma_s */ + +typedef struct VolumeShaderCoefficients { + float3 sigma_t; + float3 sigma_s; + float3 emission; +} VolumeShaderCoefficients; + +/* Evaluate shader to get extinction coefficient at P. 
*/ +ccl_device_inline bool shadow_volume_shader_sample(INTEGRATOR_STATE_ARGS, + ShaderData *ccl_restrict sd, + float3 *ccl_restrict extinction) +{ + shader_eval_volume(INTEGRATOR_STATE_PASS, sd, PATH_RAY_SHADOW, [=](const int i) { + return integrator_state_read_shadow_volume_stack(INTEGRATOR_STATE_PASS, i); + }); + + if (!(sd->flag & SD_EXTINCTION)) { + return false; + } + + const float density = object_volume_density(kg, sd->object); + *extinction = sd->closure_transparent_extinction * density; + return true; +} + +/* Evaluate shader to get absorption, scattering and emission at P. */ +ccl_device_inline bool volume_shader_sample(INTEGRATOR_STATE_ARGS, + ShaderData *ccl_restrict sd, + VolumeShaderCoefficients *coeff) +{ + const int path_flag = INTEGRATOR_STATE(path, flag); + shader_eval_volume(INTEGRATOR_STATE_PASS, sd, path_flag, [=](const int i) { + return integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i); + }); + + if (!(sd->flag & (SD_EXTINCTION | SD_SCATTER | SD_EMISSION))) { + return false; + } + + coeff->sigma_s = zero_float3(); + coeff->sigma_t = (sd->flag & SD_EXTINCTION) ? sd->closure_transparent_extinction : zero_float3(); + coeff->emission = (sd->flag & SD_EMISSION) ? sd->closure_emission_background : zero_float3(); + + if (sd->flag & SD_SCATTER) { + for (int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; + + if (CLOSURE_IS_VOLUME(sc->type)) { + coeff->sigma_s += sc->weight; + } + } + } + + const float density = object_volume_density(kg, sd->object); + coeff->sigma_s *= density; + coeff->sigma_t *= density; + coeff->emission *= density; + + return true; +} + +ccl_device_forceinline void volume_step_init(const KernelGlobals *kg, + const RNGState *rng_state, + const float object_step_size, + float t, + float *step_size, + float *step_shade_offset, + float *steps_offset, + int *max_steps) +{ + if (object_step_size == FLT_MAX) { + /* Homogeneous volume. 
*/ + *step_size = t; + *step_shade_offset = 0.0f; + *steps_offset = 1.0f; + *max_steps = 1; + } + else { + /* Heterogeneous volume. */ + *max_steps = kernel_data.integrator.volume_max_steps; + float step = min(object_step_size, t); + + /* compute exact steps in advance for malloc */ + if (t > *max_steps * step) { + step = t / (float)*max_steps; + } + + *step_size = step; + + /* Perform shading at this offset within a step, to integrate over + * over the entire step segment. */ + *step_shade_offset = path_state_rng_1D_hash(kg, rng_state, 0x1e31d8a4); + + /* Shift starting point of all segment by this random amount to avoid + * banding artifacts from the volume bounding shape. */ + *steps_offset = path_state_rng_1D_hash(kg, rng_state, 0x3d22c7b3); + } +} + +/* Volume Shadows + * + * These functions are used to attenuate shadow rays to lights. Both absorption + * and scattering will block light, represented by the extinction coefficient. */ + +# if 0 +/* homogeneous volume: assume shader evaluation at the starts gives + * the extinction coefficient for the entire line segment */ +ccl_device void volume_shadow_homogeneous(INTEGRATOR_STATE_ARGS, + Ray *ccl_restrict ray, + ShaderData *ccl_restrict sd, + float3 *ccl_restrict throughput) +{ + float3 sigma_t = zero_float3(); + + if (shadow_volume_shader_sample(INTEGRATOR_STATE_PASS, sd, &sigma_t)) { + *throughput *= volume_color_transmittance(sigma_t, ray->t); + } +} +# endif + +/* heterogeneous volume: integrate stepping through the volume until we + * reach the end, get absorbed entirely, or run out of iterations */ +ccl_device void volume_shadow_heterogeneous(INTEGRATOR_STATE_ARGS, + Ray *ccl_restrict ray, + ShaderData *ccl_restrict sd, + float3 *ccl_restrict throughput, + const float object_step_size) +{ + /* Load random number state. */ + RNGState rng_state; + shadow_path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state); + + float3 tp = *throughput; + + /* Prepare for stepping. 
+ * For shadows we do not offset all segments, since the starting point is + * already a random distance inside the volume. It also appears to create + * banding artifacts for unknown reasons. */ + int max_steps; + float step_size, step_shade_offset, unused; + volume_step_init(kg, + &rng_state, + object_step_size, + ray->t, + &step_size, + &step_shade_offset, + &unused, + &max_steps); + const float steps_offset = 1.0f; + + /* compute extinction at the start */ + float t = 0.0f; + + float3 sum = zero_float3(); + + for (int i = 0; i < max_steps; i++) { + /* advance to new position */ + float new_t = min(ray->t, (i + steps_offset) * step_size); + float dt = new_t - t; + + float3 new_P = ray->P + ray->D * (t + dt * step_shade_offset); + float3 sigma_t = zero_float3(); + + /* compute attenuation over segment */ + sd->P = new_P; + if (shadow_volume_shader_sample(INTEGRATOR_STATE_PASS, sd, &sigma_t)) { + /* Compute expf() only for every Nth step, to save some calculations + * because exp(a)*exp(b) = exp(a+b), also do a quick VOLUME_THROUGHPUT_EPSILON + * check then. */ + sum += (-sigma_t * dt); + if ((i & 0x07) == 0) { /* ToDo: Other interval? 
*/ + tp = *throughput * exp3(sum); + + /* stop if nearly all light is blocked */ + if (tp.x < VOLUME_THROUGHPUT_EPSILON && tp.y < VOLUME_THROUGHPUT_EPSILON && + tp.z < VOLUME_THROUGHPUT_EPSILON) + break; + } + } + + /* stop if at the end of the volume */ + t = new_t; + if (t == ray->t) { + /* Update throughput in case we haven't done it above */ + tp = *throughput * exp3(sum); + break; + } + } + + *throughput = tp; +} + +/* Equi-angular sampling as in: + * "Importance Sampling Techniques for Path Tracing in Participating Media" */ + +ccl_device float volume_equiangular_sample(const Ray *ccl_restrict ray, + const float3 light_P, + const float xi, + float *pdf) +{ + const float t = ray->t; + const float delta = dot((light_P - ray->P), ray->D); + const float D = safe_sqrtf(len_squared(light_P - ray->P) - delta * delta); + if (UNLIKELY(D == 0.0f)) { + *pdf = 0.0f; + return 0.0f; + } + const float theta_a = -atan2f(delta, D); + const float theta_b = atan2f(t - delta, D); + const float t_ = D * tanf((xi * theta_b) + (1 - xi) * theta_a); + if (UNLIKELY(theta_b == theta_a)) { + *pdf = 0.0f; + return 0.0f; + } + *pdf = D / ((theta_b - theta_a) * (D * D + t_ * t_)); + + return min(t, delta + t_); /* min is only for float precision errors */ +} + +ccl_device float volume_equiangular_pdf(const Ray *ccl_restrict ray, + const float3 light_P, + const float sample_t) +{ + const float delta = dot((light_P - ray->P), ray->D); + const float D = safe_sqrtf(len_squared(light_P - ray->P) - delta * delta); + if (UNLIKELY(D == 0.0f)) { + return 0.0f; + } + + const float t = ray->t; + const float t_ = sample_t - delta; + + const float theta_a = -atan2f(delta, D); + const float theta_b = atan2f(t - delta, D); + if (UNLIKELY(theta_b == theta_a)) { + return 0.0f; + } + + const float pdf = D / ((theta_b - theta_a) * (D * D + t_ * t_)); + + return pdf; +} + +ccl_device float volume_equiangular_cdf(const Ray *ccl_restrict ray, + const float3 light_P, + const float sample_t) +{ + float delta = 
dot((light_P - ray->P), ray->D); + float D = safe_sqrtf(len_squared(light_P - ray->P) - delta * delta); + if (UNLIKELY(D == 0.0f)) { + return 0.0f; + } + + const float t = ray->t; + const float t_ = sample_t - delta; + + const float theta_a = -atan2f(delta, D); + const float theta_b = atan2f(t - delta, D); + if (UNLIKELY(theta_b == theta_a)) { + return 0.0f; + } + + const float theta_sample = atan2f(t_, D); + const float cdf = (theta_sample - theta_a) / (theta_b - theta_a); + + return cdf; +} + +/* Distance sampling */ + +ccl_device float volume_distance_sample( + float max_t, float3 sigma_t, int channel, float xi, float3 *transmittance, float3 *pdf) +{ + /* xi is [0, 1[ so log(0) should never happen, division by zero is + * avoided because sample_sigma_t > 0 when SD_SCATTER is set */ + float sample_sigma_t = volume_channel_get(sigma_t, channel); + float3 full_transmittance = volume_color_transmittance(sigma_t, max_t); + float sample_transmittance = volume_channel_get(full_transmittance, channel); + + float sample_t = min(max_t, -logf(1.0f - xi * (1.0f - sample_transmittance)) / sample_sigma_t); + + *transmittance = volume_color_transmittance(sigma_t, sample_t); + *pdf = safe_divide_color(sigma_t * *transmittance, one_float3() - full_transmittance); + + /* todo: optimization: when taken together with hit/miss decision, + * the full_transmittance cancels out drops out and xi does not + * need to be remapped */ + + return sample_t; +} + +ccl_device float3 volume_distance_pdf(float max_t, float3 sigma_t, float sample_t) +{ + float3 full_transmittance = volume_color_transmittance(sigma_t, max_t); + float3 transmittance = volume_color_transmittance(sigma_t, sample_t); + + return safe_divide_color(sigma_t * transmittance, one_float3() - full_transmittance); +} + +/* Emission */ + +ccl_device float3 volume_emission_integrate(VolumeShaderCoefficients *coeff, + int closure_flag, + float3 transmittance, + float t) +{ + /* integral E * exp(-sigma_t * t) from 0 to t = E * (1 - 
exp(-sigma_t * t))/sigma_t + * this goes to E * t as sigma_t goes to zero + * + * todo: we should use an epsilon to avoid precision issues near zero sigma_t */ + float3 emission = coeff->emission; + + if (closure_flag & SD_EXTINCTION) { + float3 sigma_t = coeff->sigma_t; + + emission.x *= (sigma_t.x > 0.0f) ? (1.0f - transmittance.x) / sigma_t.x : t; + emission.y *= (sigma_t.y > 0.0f) ? (1.0f - transmittance.y) / sigma_t.y : t; + emission.z *= (sigma_t.z > 0.0f) ? (1.0f - transmittance.z) / sigma_t.z : t; + } + else + emission *= t; + + return emission; +} + +/* Volume Integration */ + +typedef struct VolumeIntegrateState { + /* Volume segment extents. */ + float start_t; + float end_t; + + /* If volume is absorption-only up to this point, and no probabilistic + * scattering or termination has been used yet. */ + bool absorption_only; + + /* Random numbers for scattering. */ + float rscatter; + float rphase; + + /* Multiple importance sampling. */ + VolumeSampleMethod direct_sample_method; + bool use_mis; + float distance_pdf; + float equiangular_pdf; +} VolumeIntegrateState; + +ccl_device_forceinline void volume_integrate_step_scattering( + const ShaderData *sd, + const Ray *ray, + const float3 equiangular_light_P, + const VolumeShaderCoefficients &ccl_restrict coeff, + const float3 transmittance, + VolumeIntegrateState &ccl_restrict vstate, + VolumeIntegrateResult &ccl_restrict result) +{ + /* Pick random color channel, we use the Veach one-sample + * model with balance heuristic for the channels. */ + const float3 albedo = safe_divide_color(coeff.sigma_s, coeff.sigma_t); + float3 channel_pdf; + const int channel = volume_sample_channel( + albedo, result.indirect_throughput, vstate.rphase, &channel_pdf); + + /* Equiangular sampling for direct lighting. 
*/ + if (vstate.direct_sample_method == VOLUME_SAMPLE_EQUIANGULAR && !result.direct_scatter) { + if (result.direct_t >= vstate.start_t && result.direct_t <= vstate.end_t) { + const float new_dt = result.direct_t - vstate.start_t; + const float3 new_transmittance = volume_color_transmittance(coeff.sigma_t, new_dt); + + result.direct_scatter = true; + result.direct_throughput *= coeff.sigma_s * new_transmittance / vstate.equiangular_pdf; + shader_copy_volume_phases(&result.direct_phases, sd); + + /* Multiple importance sampling. */ + if (vstate.use_mis) { + const float distance_pdf = vstate.distance_pdf * + dot(channel_pdf, coeff.sigma_t * new_transmittance); + const float mis_weight = 2.0f * power_heuristic(vstate.equiangular_pdf, distance_pdf); + result.direct_throughput *= mis_weight; + } + } + else { + result.direct_throughput *= transmittance; + vstate.distance_pdf *= dot(channel_pdf, transmittance); + } + } + + /* Distance sampling for indirect and optional direct lighting. */ + if (!result.indirect_scatter) { + /* decide if we will scatter or continue */ + const float sample_transmittance = volume_channel_get(transmittance, channel); + + if (1.0f - vstate.rscatter >= sample_transmittance) { + /* compute sampling distance */ + const float sample_sigma_t = volume_channel_get(coeff.sigma_t, channel); + const float new_dt = -logf(1.0f - vstate.rscatter) / sample_sigma_t; + const float new_t = vstate.start_t + new_dt; + + /* transmittance and pdf */ + const float3 new_transmittance = volume_color_transmittance(coeff.sigma_t, new_dt); + const float distance_pdf = dot(channel_pdf, coeff.sigma_t * new_transmittance); + + /* throughput */ + result.indirect_scatter = true; + result.indirect_t = new_t; + result.indirect_throughput *= coeff.sigma_s * new_transmittance / distance_pdf; + shader_copy_volume_phases(&result.indirect_phases, sd); + + if (vstate.direct_sample_method != VOLUME_SAMPLE_EQUIANGULAR) { + /* If using distance sampling for direct light, just copy 
parameters + * of indirect light since we scatter at the same point then. */ + result.direct_scatter = true; + result.direct_t = result.indirect_t; + result.direct_throughput = result.indirect_throughput; + shader_copy_volume_phases(&result.direct_phases, sd); + + /* Multiple importance sampling. */ + if (vstate.use_mis) { + const float equiangular_pdf = volume_equiangular_pdf(ray, equiangular_light_P, new_t); + const float mis_weight = power_heuristic(vstate.distance_pdf * distance_pdf, + equiangular_pdf); + result.direct_throughput *= 2.0f * mis_weight; + } + } + } + else { + /* throughput */ + const float pdf = dot(channel_pdf, transmittance); + result.indirect_throughput *= transmittance / pdf; + if (vstate.direct_sample_method != VOLUME_SAMPLE_EQUIANGULAR) { + vstate.distance_pdf *= pdf; + } + + /* remap rscatter so we can reuse it and keep thing stratified */ + vstate.rscatter = 1.0f - (1.0f - vstate.rscatter) / sample_transmittance; + } + } +} + +/* heterogeneous volume distance sampling: integrate stepping through the + * volume until we reach the end, get absorbed entirely, or run out of + * iterations. this does probabilistically scatter or get transmitted through + * for path tracing where we don't want to branch. */ +ccl_device_forceinline void volume_integrate_heterogeneous( + INTEGRATOR_STATE_ARGS, + Ray *ccl_restrict ray, + ShaderData *ccl_restrict sd, + const RNGState *rng_state, + ccl_global float *ccl_restrict render_buffer, + const float object_step_size, + const VolumeSampleMethod direct_sample_method, + const float3 equiangular_light_P, + VolumeIntegrateResult &result) +{ + PROFILING_INIT(kg, PROFILING_SHADE_VOLUME_INTEGRATE); + + /* Prepare for stepping. + * Using a different step offset for the first step avoids banding artifacts. 
*/ + int max_steps; + float step_size, step_shade_offset, steps_offset; + volume_step_init(kg, + rng_state, + object_step_size, + ray->t, + &step_size, + &step_shade_offset, + &steps_offset, + &max_steps); + + /* Initialize volume integration state. */ + VolumeIntegrateState vstate ccl_optional_struct_init; + vstate.start_t = 0.0f; + vstate.end_t = 0.0f; + vstate.absorption_only = true; + vstate.rscatter = path_state_rng_1D(kg, rng_state, PRNG_SCATTER_DISTANCE); + vstate.rphase = path_state_rng_1D(kg, rng_state, PRNG_PHASE_CHANNEL); + + /* Multiple importance sampling: pick between equiangular and distance sampling strategy. */ + vstate.direct_sample_method = direct_sample_method; + vstate.use_mis = (direct_sample_method == VOLUME_SAMPLE_MIS); + if (vstate.use_mis) { + if (vstate.rscatter < 0.5f) { + vstate.rscatter *= 2.0f; + vstate.direct_sample_method = VOLUME_SAMPLE_DISTANCE; + } + else { + vstate.rscatter = (vstate.rscatter - 0.5f) * 2.0f; + vstate.direct_sample_method = VOLUME_SAMPLE_EQUIANGULAR; + } + } + vstate.equiangular_pdf = 0.0f; + vstate.distance_pdf = 1.0f; + + /* Initialize volume integration result. */ + const float3 throughput = INTEGRATOR_STATE(path, throughput); + result.direct_throughput = throughput; + result.indirect_throughput = throughput; + + /* Equiangular sampling: compute distance and PDF in advance. 
*/ + if (vstate.direct_sample_method == VOLUME_SAMPLE_EQUIANGULAR) { + result.direct_t = volume_equiangular_sample( + ray, equiangular_light_P, vstate.rscatter, &vstate.equiangular_pdf); + } + +# ifdef __DENOISING_FEATURES__ + const bool write_denoising_features = (INTEGRATOR_STATE(path, flag) & + PATH_RAY_DENOISING_FEATURES); + float3 accum_albedo = zero_float3(); +# endif + float3 accum_emission = zero_float3(); + + for (int i = 0; i < max_steps; i++) { + /* Advance to new position */ + vstate.end_t = min(ray->t, (i + steps_offset) * step_size); + const float shade_t = vstate.start_t + (vstate.end_t - vstate.start_t) * step_shade_offset; + sd->P = ray->P + ray->D * shade_t; + + /* compute segment */ + VolumeShaderCoefficients coeff ccl_optional_struct_init; + if (volume_shader_sample(INTEGRATOR_STATE_PASS, sd, &coeff)) { + const int closure_flag = sd->flag; + + /* Evaluate transmittance over segment. */ + const float dt = (vstate.end_t - vstate.start_t); + const float3 transmittance = (closure_flag & SD_EXTINCTION) ? + volume_color_transmittance(coeff.sigma_t, dt) : + one_float3(); + + /* Emission. */ + if (closure_flag & SD_EMISSION) { + /* Only write emission before indirect light scatter position, since we terminate + * stepping at that point if we have already found a direct light scatter position. */ + if (!result.indirect_scatter) { + const float3 emission = volume_emission_integrate( + &coeff, closure_flag, transmittance, dt); + accum_emission += emission; + } + } + + if (closure_flag & SD_EXTINCTION) { + if ((closure_flag & SD_SCATTER) || !vstate.absorption_only) { +# ifdef __DENOISING_FEATURES__ + /* Accumulate albedo for denoising features. */ + if (write_denoising_features && (closure_flag & SD_SCATTER)) { + const float3 albedo = safe_divide_color(coeff.sigma_s, coeff.sigma_t); + accum_albedo += result.indirect_throughput * albedo * (one_float3() - transmittance); + } +# endif + + /* Scattering and absorption. 
*/ + volume_integrate_step_scattering( + sd, ray, equiangular_light_P, coeff, transmittance, vstate, result); + } + else { + /* Absorption only. */ + result.indirect_throughput *= transmittance; + result.direct_throughput *= transmittance; + } + + /* Stop if nearly all light blocked. */ + if (!result.indirect_scatter) { + if (max3(result.indirect_throughput) < VOLUME_THROUGHPUT_EPSILON) { + result.indirect_throughput = zero_float3(); + break; + } + } + else if (!result.direct_scatter) { + if (max3(result.direct_throughput) < VOLUME_THROUGHPUT_EPSILON) { + break; + } + } + } + + /* If we have scattering data for both direct and indirect, we're done. */ + if (result.direct_scatter && result.indirect_scatter) { + break; + } + } + + /* Stop if at the end of the volume. */ + vstate.start_t = vstate.end_t; + if (vstate.start_t == ray->t) { + break; + } + } + + /* Write accumulated emisison. */ + if (!is_zero(accum_emission)) { + kernel_accum_emission( + INTEGRATOR_STATE_PASS, result.indirect_throughput, accum_emission, render_buffer); + } + +# ifdef __DENOISING_FEATURES__ + /* Write denoising features. */ + if (write_denoising_features) { + kernel_write_denoising_features_volume( + INTEGRATOR_STATE_PASS, accum_albedo, result.indirect_scatter, render_buffer); + } +# endif /* __DENOISING_FEATURES__ */ +} + +# ifdef __EMISSION__ +/* Path tracing: sample point on light and evaluate light shader, then + * queue shadow ray to be traced. */ +ccl_device_forceinline bool integrate_volume_sample_light(INTEGRATOR_STATE_ARGS, + const ShaderData *ccl_restrict sd, + const RNGState *ccl_restrict rng_state, + LightSample *ccl_restrict ls) +{ + /* Test if there is a light or BSDF that needs direct light. */ + if (!kernel_data.integrator.use_direct_light) { + return false; + } + + /* Sample position on a light. 
*/ + const int path_flag = INTEGRATOR_STATE(path, flag); + const uint bounce = INTEGRATOR_STATE(path, bounce); + float light_u, light_v; + path_state_rng_2D(kg, rng_state, PRNG_LIGHT_U, &light_u, &light_v); + + light_distribution_sample_from_volume_segment( + kg, light_u, light_v, sd->time, sd->P, bounce, path_flag, ls); + + if (ls->shader & SHADER_EXCLUDE_SCATTER) { + return false; + } + + return true; +} + +/* Path tracing: sample point on light and evaluate light shader, then + * queue shadow ray to be traced. */ +ccl_device_forceinline void integrate_volume_direct_light(INTEGRATOR_STATE_ARGS, + const ShaderData *ccl_restrict sd, + const RNGState *ccl_restrict rng_state, + const float3 P, + const ShaderVolumePhases *ccl_restrict + phases, + const float3 throughput, + LightSample *ccl_restrict ls) +{ + PROFILING_INIT(kg, PROFILING_SHADE_VOLUME_DIRECT_LIGHT); + + if (!kernel_data.integrator.use_direct_light) { + return; + } + + /* Sample position on the same light again, now from the shading + * point where we scattered. + * + * TODO: decorrelate random numbers and use light_sample_new_position to + * avoid resampling the CDF. */ + { + const int path_flag = INTEGRATOR_STATE(path, flag); + const uint bounce = INTEGRATOR_STATE(path, bounce); + float light_u, light_v; + path_state_rng_2D(kg, rng_state, PRNG_LIGHT_U, &light_u, &light_v); + + if (!light_distribution_sample_from_position( + kg, light_u, light_v, sd->time, P, bounce, path_flag, ls)) { + return; + } + } + + /* Evaluate light shader. + * + * TODO: can we reuse sd memory? In theory we can move this after + * integrate_surface_bounce, evaluate the BSDF, and only then evaluate + * the light shader. This could also move to its own kernel, for + * non-constant light sources. 
*/ + ShaderDataTinyStorage emission_sd_storage; + ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage); + const float3 light_eval = light_sample_shader_eval( + INTEGRATOR_STATE_PASS, emission_sd, ls, sd->time); + if (is_zero(light_eval)) { + return; + } + + /* Evaluate BSDF. */ + BsdfEval phase_eval ccl_optional_struct_init; + const float phase_pdf = shader_volume_phase_eval(kg, sd, phases, ls->D, &phase_eval); + + if (ls->shader & SHADER_USE_MIS) { + float mis_weight = power_heuristic(ls->pdf, phase_pdf); + bsdf_eval_mul(&phase_eval, mis_weight); + } + + bsdf_eval_mul3(&phase_eval, light_eval / ls->pdf); + + /* Path termination. */ + const float terminate = path_state_rng_light_termination(kg, rng_state); + if (light_sample_terminate(kg, ls, &phase_eval, terminate)) { + return; + } + + /* Create shadow ray. */ + Ray ray ccl_optional_struct_init; + light_sample_to_volume_shadow_ray(kg, sd, ls, P, &ray); + const bool is_light = light_sample_is_light(ls); + + /* Write shadow ray and associated state to global memory. */ + integrator_state_write_shadow_ray(INTEGRATOR_STATE_PASS, &ray); + + /* Copy state from main path to shadow path. */ + const uint16_t bounce = INTEGRATOR_STATE(path, bounce); + const uint16_t transparent_bounce = INTEGRATOR_STATE(path, transparent_bounce); + uint32_t shadow_flag = INTEGRATOR_STATE(path, flag); + shadow_flag |= (is_light) ? PATH_RAY_SHADOW_FOR_LIGHT : 0; + shadow_flag |= PATH_RAY_VOLUME_PASS; + const float3 throughput_phase = throughput * bsdf_eval_sum(&phase_eval); + + if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) { + const float3 diffuse_glossy_ratio = (bounce == 0) ? 
+ one_float3() : + INTEGRATOR_STATE(path, diffuse_glossy_ratio); + INTEGRATOR_STATE_WRITE(shadow_path, diffuse_glossy_ratio) = diffuse_glossy_ratio; + } + + INTEGRATOR_STATE_WRITE(shadow_path, flag) = shadow_flag; + INTEGRATOR_STATE_WRITE(shadow_path, bounce) = bounce; + INTEGRATOR_STATE_WRITE(shadow_path, transparent_bounce) = transparent_bounce; + INTEGRATOR_STATE_WRITE(shadow_path, throughput) = throughput_phase; + + if (kernel_data.kernel_features & KERNEL_FEATURE_SHADOW_PASS) { + INTEGRATOR_STATE_WRITE(shadow_path, unshadowed_throughput) = throughput; + } + + integrator_state_copy_volume_stack_to_shadow(INTEGRATOR_STATE_PASS); + + /* Branch off shadow kernel. */ + INTEGRATOR_SHADOW_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW); +} +# endif + +/* Path tracing: scatter in new direction using phase function */ +ccl_device_forceinline bool integrate_volume_phase_scatter(INTEGRATOR_STATE_ARGS, + ShaderData *sd, + const RNGState *rng_state, + const ShaderVolumePhases *phases) +{ + PROFILING_INIT(kg, PROFILING_SHADE_VOLUME_INDIRECT_LIGHT); + + float phase_u, phase_v; + path_state_rng_2D(kg, rng_state, PRNG_BSDF_U, &phase_u, &phase_v); + + /* Phase closure, sample direction. */ + float phase_pdf; + BsdfEval phase_eval ccl_optional_struct_init; + float3 phase_omega_in ccl_optional_struct_init; + differential3 phase_domega_in ccl_optional_struct_init; + + const int label = shader_volume_phase_sample(kg, + sd, + phases, + phase_u, + phase_v, + &phase_eval, + &phase_omega_in, + &phase_domega_in, + &phase_pdf); + + if (phase_pdf == 0.0f || bsdf_eval_is_zero(&phase_eval)) { + return false; + } + + /* Setup ray. 
*/ + INTEGRATOR_STATE_WRITE(ray, P) = sd->P; + INTEGRATOR_STATE_WRITE(ray, D) = normalize(phase_omega_in); + INTEGRATOR_STATE_WRITE(ray, t) = FLT_MAX; + +# ifdef __RAY_DIFFERENTIALS__ + INTEGRATOR_STATE_WRITE(ray, dP) = differential_make_compact(sd->dP); + INTEGRATOR_STATE_WRITE(ray, dD) = differential_make_compact(phase_domega_in); +# endif + + /* Update throughput. */ + const float3 throughput = INTEGRATOR_STATE(path, throughput); + const float3 throughput_phase = throughput * bsdf_eval_sum(&phase_eval) / phase_pdf; + INTEGRATOR_STATE_WRITE(path, throughput) = throughput_phase; + + if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) { + INTEGRATOR_STATE_WRITE(path, diffuse_glossy_ratio) = one_float3(); + } + + /* Update path state */ + INTEGRATOR_STATE_WRITE(path, mis_ray_pdf) = phase_pdf; + INTEGRATOR_STATE_WRITE(path, mis_ray_t) = 0.0f; + INTEGRATOR_STATE_WRITE(path, min_ray_pdf) = fminf(phase_pdf, + INTEGRATOR_STATE(path, min_ray_pdf)); + + path_state_next(INTEGRATOR_STATE_PASS, label); + return true; +} + +/* get the volume attenuation and emission over line segment defined by + * ray, with the assumption that there are no surfaces blocking light + * between the endpoints. distance sampling is used to decide if we will + * scatter or not. */ +ccl_device VolumeIntegrateEvent volume_integrate(INTEGRATOR_STATE_ARGS, + Ray *ccl_restrict ray, + ccl_global float *ccl_restrict render_buffer) +{ + ShaderData sd; + shader_setup_from_volume(kg, &sd, ray); + + /* Load random number state. */ + RNGState rng_state; + path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state); + + /* Sample light ahead of volume stepping, for equiangular sampling. */ + /* TODO: distant lights are ignored now, but could instead use even distribution. 
*/ + LightSample ls ccl_optional_struct_init; + const bool need_light_sample = !(INTEGRATOR_STATE(path, flag) & PATH_RAY_TERMINATE); + const bool have_equiangular_sample = need_light_sample && + integrate_volume_sample_light( + INTEGRATOR_STATE_PASS, &sd, &rng_state, &ls) && + (ls.t != FLT_MAX); + + VolumeSampleMethod direct_sample_method = (have_equiangular_sample) ? + volume_stack_sample_method(INTEGRATOR_STATE_PASS) : + VOLUME_SAMPLE_DISTANCE; + + /* Step through volume. */ + const float step_size = volume_stack_step_size(INTEGRATOR_STATE_PASS, [=](const int i) { + return integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i); + }); + + /* TODO: expensive to zero closures? */ + VolumeIntegrateResult result = {}; + volume_integrate_heterogeneous(INTEGRATOR_STATE_PASS, + ray, + &sd, + &rng_state, + render_buffer, + step_size, + direct_sample_method, + ls.P, + result); + + /* Perform path termination. The intersect_closest will have already marked this path + * to be terminated. That will shading evaluating to leave out any scattering closures, + * but emission and absorption are still handled for multiple importance sampling. */ + const uint32_t path_flag = INTEGRATOR_STATE(path, flag); + const float probability = (path_flag & PATH_RAY_TERMINATE_IN_NEXT_VOLUME) ? + 0.0f : + path_state_continuation_probability(INTEGRATOR_STATE_PASS, + path_flag); + if (probability == 0.0f) { + return VOLUME_PATH_MISSED; + } + + /* Direct light. */ + if (result.direct_scatter) { + const float3 direct_P = ray->P + result.direct_t * ray->D; + result.direct_throughput /= probability; + integrate_volume_direct_light(INTEGRATOR_STATE_PASS, + &sd, + &rng_state, + direct_P, + &result.direct_phases, + result.direct_throughput, + &ls); + } + + /* Indirect light. + * + * Only divide throughput by probability if we scatter. For the attenuation + * case the next surface will already do this division. 
*/ + if (result.indirect_scatter) { + result.indirect_throughput /= probability; + } + INTEGRATOR_STATE_WRITE(path, throughput) = result.indirect_throughput; + + if (result.indirect_scatter) { + sd.P = ray->P + result.indirect_t * ray->D; + + if (integrate_volume_phase_scatter( + INTEGRATOR_STATE_PASS, &sd, &rng_state, &result.indirect_phases)) { + return VOLUME_PATH_SCATTERED; + } + else { + return VOLUME_PATH_MISSED; + } + } + else { + return VOLUME_PATH_ATTENUATED; + } +} + +#endif + +ccl_device void integrator_shade_volume(INTEGRATOR_STATE_ARGS, + ccl_global float *ccl_restrict render_buffer) +{ + PROFILING_INIT(kg, PROFILING_SHADE_VOLUME_SETUP); + +#ifdef __VOLUME__ + /* Setup shader data. */ + Ray ray ccl_optional_struct_init; + integrator_state_read_ray(INTEGRATOR_STATE_PASS, &ray); + + Intersection isect ccl_optional_struct_init; + integrator_state_read_isect(INTEGRATOR_STATE_PASS, &isect); + + /* Set ray length to current segment. */ + ray.t = (isect.prim != PRIM_NONE) ? isect.t : FLT_MAX; + + /* Clean volume stack for background rays. */ + if (isect.prim == PRIM_NONE) { + volume_stack_clean(INTEGRATOR_STATE_PASS); + } + + VolumeIntegrateEvent event = volume_integrate(INTEGRATOR_STATE_PASS, &ray, render_buffer); + + if (event == VOLUME_PATH_SCATTERED) { + /* Queue intersect_closest kernel. */ + INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME, + DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST); + return; + } + else if (event == VOLUME_PATH_MISSED) { + /* End path. */ + INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME); + return; + } + else { + /* Continue to background, light or surface. 
*/ + if (isect.prim == PRIM_NONE) { + INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME, + DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND); + return; + } + else if (isect.type & PRIMITIVE_LAMP) { + INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME, + DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT); + return; + } + else { + /* Hit a surface, continue with surface kernel unless terminated. */ + const int shader = intersection_get_shader(kg, &isect); + const int flags = kernel_tex_fetch(__shaders, shader).flags; + + integrator_intersect_shader_next_kernel<DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME>( + INTEGRATOR_STATE_PASS, &isect, shader, flags); + return; + } + } +#endif /* __VOLUME__ */ +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/integrator/integrator_state.h b/intern/cycles/kernel/integrator/integrator_state.h new file mode 100644 index 00000000000..8cef9cf31e2 --- /dev/null +++ b/intern/cycles/kernel/integrator/integrator_state.h @@ -0,0 +1,185 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Integrator State + * + * This file defines the data structures that define the state of a path. Any state that is + * preserved and passed between kernel executions is part of this. + * + * The size of this state must be kept as small as possible, to reduce cache misses and keep memory + * usage under control on GPUs that may execute millions of kernels. 
+ * + * Memory may be allocated and passed along in different ways depending on the device. There may + * be a scalar layout, or AoS or SoA layout for batches. The state may be passed along as a pointer + * to every kernel, or the pointer may exist at program scope or in constant memory. To abstract + * these differences between devices and experiment with different layouts, macros are used. + * + * INTEGRATOR_STATE_ARGS: prepend to argument definitions for every function that accesses + * path state. + * INTEGRATOR_STATE_CONST_ARGS: same as INTEGRATOR_STATE_ARGS, when state is read-only + * INTEGRATOR_STATE_PASS: use to pass along state to other functions access it. + * + * INTEGRATOR_STATE(x, y): read nested struct member x.y of IntegratorState + * INTEGRATOR_STATE_WRITE(x, y): write to nested struct member x.y of IntegratorState + * + * INTEGRATOR_STATE_ARRAY(x, index, y): read x[index].y + * INTEGRATOR_STATE_ARRAY_WRITE(x, index, y): write x[index].y + * + * INTEGRATOR_STATE_COPY(to_x, from_x): copy contents of one nested struct to another + * + * INTEGRATOR_STATE_IS_NULL: test if any integrator state is available, for shader evaluation + * INTEGRATOR_STATE_PASS_NULL: use to pass empty state to other functions. + * + * NOTE: if we end up with a device that passes no arguments, the leading comma will be a problem. + * Can solve it with more macros if we encouter it, but rather ugly so postpone for now. + */ + +#include "kernel/kernel_types.h" + +#include "util/util_types.h" + +#pragma once + +CCL_NAMESPACE_BEGIN + +/* Constants + * + * TODO: these could be made dynamic depending on the features used in the scene. */ + +#define INTEGRATOR_VOLUME_STACK_SIZE VOLUME_STACK_SIZE +#define INTEGRATOR_SHADOW_ISECT_SIZE 4 + +/* Data structures */ + +/* Integrator State + * + * CPU rendering path state with AoS layout. 
*/ +typedef struct IntegratorStateCPU { +#define KERNEL_STRUCT_BEGIN(name) struct { +#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) type name; +#define KERNEL_STRUCT_ARRAY_MEMBER KERNEL_STRUCT_MEMBER +#define KERNEL_STRUCT_END(name) \ + } \ + name; +#define KERNEL_STRUCT_END_ARRAY(name, size) \ + } \ + name[size]; +#include "kernel/integrator/integrator_state_template.h" +#undef KERNEL_STRUCT_BEGIN +#undef KERNEL_STRUCT_MEMBER +#undef KERNEL_STRUCT_ARRAY_MEMBER +#undef KERNEL_STRUCT_END +#undef KERNEL_STRUCT_END_ARRAY +} IntegratorStateCPU; + +/* Path Queue + * + * Keep track of which kernels are queued to be executed next in the path + * for GPU rendering. */ +typedef struct IntegratorQueueCounter { + int num_queued[DEVICE_KERNEL_INTEGRATOR_NUM]; +} IntegratorQueueCounter; + +/* Integrator State GPU + * + * GPU rendering path state with SoA layout. */ +typedef struct IntegratorStateGPU { +#define KERNEL_STRUCT_BEGIN(name) struct { +#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) type *name; +#define KERNEL_STRUCT_ARRAY_MEMBER KERNEL_STRUCT_MEMBER +#define KERNEL_STRUCT_END(name) \ + } \ + name; +#define KERNEL_STRUCT_END_ARRAY(name, size) \ + } \ + name[size]; +#include "kernel/integrator/integrator_state_template.h" +#undef KERNEL_STRUCT_BEGIN +#undef KERNEL_STRUCT_MEMBER +#undef KERNEL_STRUCT_ARRAY_MEMBER +#undef KERNEL_STRUCT_END +#undef KERNEL_STRUCT_END_ARRAY + + /* Count number of queued kernels. */ + IntegratorQueueCounter *queue_counter; + + /* Count number of kernels queued for specific shaders. */ + int *sort_key_counter[DEVICE_KERNEL_INTEGRATOR_NUM]; + + /* Index of path which will be used by a next shadow catcher split. */ + int *next_shadow_catcher_path_index; +} IntegratorStateGPU; + +/* Abstraction + * + * Macros to access data structures on different devices. + * + * Note that there is a special access function for the shadow catcher state. 
This access is to + * happen from a kernel which operates on a "main" path. Attempt to use shadow catcher accessors + * from a kernel which operates on a shadow catcher state will cause bad memory acces. */ + +#ifdef __KERNEL_CPU__ + +/* Scalar access on CPU. */ + +typedef IntegratorStateCPU *ccl_restrict IntegratorState; + +# define INTEGRATOR_STATE_ARGS \ + ccl_attr_maybe_unused const KernelGlobals *ccl_restrict kg, \ + IntegratorStateCPU *ccl_restrict state +# define INTEGRATOR_STATE_CONST_ARGS \ + ccl_attr_maybe_unused const KernelGlobals *ccl_restrict kg, \ + const IntegratorStateCPU *ccl_restrict state +# define INTEGRATOR_STATE_PASS kg, state + +# define INTEGRATOR_STATE_PASS_NULL kg, NULL +# define INTEGRATOR_STATE_IS_NULL (state == NULL) + +# define INTEGRATOR_STATE(nested_struct, member) \ + (((const IntegratorStateCPU *)state)->nested_struct.member) +# define INTEGRATOR_STATE_WRITE(nested_struct, member) (state->nested_struct.member) + +# define INTEGRATOR_STATE_ARRAY(nested_struct, array_index, member) \ + (((const IntegratorStateCPU *)state)->nested_struct[array_index].member) +# define INTEGRATOR_STATE_ARRAY_WRITE(nested_struct, array_index, member) \ + ((state)->nested_struct[array_index].member) + +#else /* __KERNEL_CPU__ */ + +/* Array access on GPU with Structure-of-Arrays. 
*/ + +typedef int IntegratorState; + +# define INTEGRATOR_STATE_ARGS const KernelGlobals *ccl_restrict kg, const IntegratorState state +# define INTEGRATOR_STATE_CONST_ARGS \ + const KernelGlobals *ccl_restrict kg, const IntegratorState state +# define INTEGRATOR_STATE_PASS kg, state + +# define INTEGRATOR_STATE_PASS_NULL kg, -1 +# define INTEGRATOR_STATE_IS_NULL (state == -1) + +# define INTEGRATOR_STATE(nested_struct, member) \ + kernel_integrator_state.nested_struct.member[state] +# define INTEGRATOR_STATE_WRITE(nested_struct, member) INTEGRATOR_STATE(nested_struct, member) + +# define INTEGRATOR_STATE_ARRAY(nested_struct, array_index, member) \ + kernel_integrator_state.nested_struct[array_index].member[state] +# define INTEGRATOR_STATE_ARRAY_WRITE(nested_struct, array_index, member) \ + INTEGRATOR_STATE_ARRAY(nested_struct, array_index, member) + +#endif /* __KERNEL_CPU__ */ + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/integrator/integrator_state_flow.h b/intern/cycles/kernel/integrator/integrator_state_flow.h new file mode 100644 index 00000000000..8477efd7b66 --- /dev/null +++ b/intern/cycles/kernel/integrator/integrator_state_flow.h @@ -0,0 +1,144 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "kernel/kernel_types.h" +#include "util/util_atomic.h" + +CCL_NAMESPACE_BEGIN + +/* Control Flow + * + * Utilities for control flow between kernels. 
The implementation may differ per device + * or even be handled on the host side. To abstract such differences, experiment with + * different implementations and for debugging, this is abstracted using macros. + * + * There is a main path for regular path tracing camera for path tracing. Shadows for next + * event estimation branch off from this into their own path, that may be computed in + * parallel while the main path continues. + * + * Each kernel on the main path must call one of these functions. These may not be called + * multiple times from the same kernel. + * + * INTEGRATOR_PATH_INIT(next_kernel) + * INTEGRATOR_PATH_NEXT(current_kernel, next_kernel) + * INTEGRATOR_PATH_TERMINATE(current_kernel) + * + * For the shadow path similar functions are used, and again each shadow kernel must call + * one of them, and only once. + */ + +#define INTEGRATOR_PATH_IS_TERMINATED (INTEGRATOR_STATE(path, queued_kernel) == 0) +#define INTEGRATOR_SHADOW_PATH_IS_TERMINATED (INTEGRATOR_STATE(shadow_path, queued_kernel) == 0) + +#ifdef __KERNEL_GPU__ + +# define INTEGRATOR_PATH_INIT(next_kernel) \ + atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \ + 1); \ + INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel; +# define INTEGRATOR_PATH_NEXT(current_kernel, next_kernel) \ + atomic_fetch_and_sub_uint32( \ + &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \ + atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \ + 1); \ + INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel; +# define INTEGRATOR_PATH_TERMINATE(current_kernel) \ + atomic_fetch_and_sub_uint32( \ + &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \ + INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0; + +# define INTEGRATOR_SHADOW_PATH_INIT(next_kernel) \ + atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \ + 1); \ + 
INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = next_kernel; +# define INTEGRATOR_SHADOW_PATH_NEXT(current_kernel, next_kernel) \ + atomic_fetch_and_sub_uint32( \ + &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \ + atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \ + 1); \ + INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = next_kernel; +# define INTEGRATOR_SHADOW_PATH_TERMINATE(current_kernel) \ + atomic_fetch_and_sub_uint32( \ + &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \ + INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0; + +# define INTEGRATOR_PATH_INIT_SORTED(next_kernel, key) \ + { \ + const int key_ = key; \ + atomic_fetch_and_add_uint32( \ + &kernel_integrator_state.queue_counter->num_queued[next_kernel], 1); \ + INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel; \ + INTEGRATOR_STATE_WRITE(path, shader_sort_key) = key_; \ + atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], \ + 1); \ + } +# define INTEGRATOR_PATH_NEXT_SORTED(current_kernel, next_kernel, key) \ + { \ + const int key_ = key; \ + atomic_fetch_and_sub_uint32( \ + &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \ + atomic_fetch_and_add_uint32( \ + &kernel_integrator_state.queue_counter->num_queued[next_kernel], 1); \ + INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel; \ + INTEGRATOR_STATE_WRITE(path, shader_sort_key) = key_; \ + atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], \ + 1); \ + } + +#else + +# define INTEGRATOR_PATH_INIT(next_kernel) \ + INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel; +# define INTEGRATOR_PATH_INIT_SORTED(next_kernel, key) \ + { \ + INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel; \ + (void)key; \ + } +# define INTEGRATOR_PATH_NEXT(current_kernel, next_kernel) \ + { \ + INTEGRATOR_STATE_WRITE(path, queued_kernel) = 
next_kernel; \ + (void)current_kernel; \ + } +# define INTEGRATOR_PATH_TERMINATE(current_kernel) \ + { \ + INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0; \ + (void)current_kernel; \ + } +# define INTEGRATOR_PATH_NEXT_SORTED(current_kernel, next_kernel, key) \ + { \ + INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel; \ + (void)key; \ + (void)current_kernel; \ + } + +# define INTEGRATOR_SHADOW_PATH_INIT(next_kernel) \ + INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = next_kernel; +# define INTEGRATOR_SHADOW_PATH_NEXT(current_kernel, next_kernel) \ + { \ + INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = next_kernel; \ + (void)current_kernel; \ + } +# define INTEGRATOR_SHADOW_PATH_TERMINATE(current_kernel) \ + { \ + INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0; \ + (void)current_kernel; \ + } + +#endif + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/integrator/integrator_state_template.h b/intern/cycles/kernel/integrator/integrator_state_template.h new file mode 100644 index 00000000000..41dd1bfcdbf --- /dev/null +++ b/intern/cycles/kernel/integrator/integrator_state_template.h @@ -0,0 +1,163 @@ + +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/************************************ Path State *****************************/ + +KERNEL_STRUCT_BEGIN(path) +/* Index of a pixel within the device render buffer where this path will write its result. 
+ * To get an actual offset within the buffer the value needs to be multiplied by the + * `kernel_data.film.pass_stride`. + * + * The multiplication is delayed for later, so that state can use 32bit integer. */ +KERNEL_STRUCT_MEMBER(path, uint32_t, render_pixel_index, KERNEL_FEATURE_PATH_TRACING) +/* Current sample number. */ +KERNEL_STRUCT_MEMBER(path, uint16_t, sample, KERNEL_FEATURE_PATH_TRACING) +/* Current ray bounce depth. */ +KERNEL_STRUCT_MEMBER(path, uint16_t, bounce, KERNEL_FEATURE_PATH_TRACING) +/* Current diffuse ray bounce depth. */ +KERNEL_STRUCT_MEMBER(path, uint16_t, diffuse_bounce, KERNEL_FEATURE_PATH_TRACING) +/* Current glossy ray bounce depth. */ +KERNEL_STRUCT_MEMBER(path, uint16_t, glossy_bounce, KERNEL_FEATURE_PATH_TRACING) +/* Current transmission ray bounce depth. */ +KERNEL_STRUCT_MEMBER(path, uint16_t, transmission_bounce, KERNEL_FEATURE_PATH_TRACING) +/* Current volume ray bounce depth. */ +KERNEL_STRUCT_MEMBER(path, uint16_t, volume_bounce, KERNEL_FEATURE_PATH_TRACING) +/* Current volume bounds ray bounce depth. */ +KERNEL_STRUCT_MEMBER(path, uint16_t, volume_bounds_bounce, KERNEL_FEATURE_PATH_TRACING) +/* Current transparent ray bounce depth. */ +KERNEL_STRUCT_MEMBER(path, uint16_t, transparent_bounce, KERNEL_FEATURE_PATH_TRACING) +/* DeviceKernel bit indicating queued kernels. + * TODO: reduce size? */ +KERNEL_STRUCT_MEMBER(path, uint32_t, queued_kernel, KERNEL_FEATURE_PATH_TRACING) +/* Random number generator seed. */ +KERNEL_STRUCT_MEMBER(path, uint32_t, rng_hash, KERNEL_FEATURE_PATH_TRACING) +/* Random number dimension offset. */ +KERNEL_STRUCT_MEMBER(path, uint32_t, rng_offset, KERNEL_FEATURE_PATH_TRACING) +/* enum PathRayFlag */ +KERNEL_STRUCT_MEMBER(path, uint32_t, flag, KERNEL_FEATURE_PATH_TRACING) +/* Multiple importance sampling + * The PDF of BSDF sampling at the last scatter point, and distance to the + * last scatter point minus the last ray segment. 
This distance lets us + * compute the complete distance through transparent surfaces and volumes. */ +KERNEL_STRUCT_MEMBER(path, float, mis_ray_pdf, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(path, float, mis_ray_t, KERNEL_FEATURE_PATH_TRACING) +/* Filter glossy. */ +KERNEL_STRUCT_MEMBER(path, float, min_ray_pdf, KERNEL_FEATURE_PATH_TRACING) +/* Throughput. */ +KERNEL_STRUCT_MEMBER(path, float3, throughput, KERNEL_FEATURE_PATH_TRACING) +/* Ratio of throughput to distinguish diffuse and glossy render passes. */ +KERNEL_STRUCT_MEMBER(path, float3, diffuse_glossy_ratio, KERNEL_FEATURE_LIGHT_PASSES) +/* Denoising. */ +KERNEL_STRUCT_MEMBER(path, float3, denoising_feature_throughput, KERNEL_FEATURE_DENOISING) +/* Shader sorting. */ +/* TODO: compress as uint16? or leave out entirely and recompute key in sorting code? */ +KERNEL_STRUCT_MEMBER(path, uint32_t, shader_sort_key, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_END(path) + +/************************************** Ray ***********************************/ + +KERNEL_STRUCT_BEGIN(ray) +KERNEL_STRUCT_MEMBER(ray, float3, P, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(ray, float3, D, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(ray, float, t, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(ray, float, time, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(ray, float, dP, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(ray, float, dD, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_END(ray) + +/*************************** Intersection result ******************************/ + +/* Result from scene intersection. 
*/ +KERNEL_STRUCT_BEGIN(isect) +KERNEL_STRUCT_MEMBER(isect, float, t, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(isect, float, u, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(isect, float, v, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(isect, int, prim, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(isect, int, object, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(isect, int, type, KERNEL_FEATURE_PATH_TRACING) +/* TODO: exclude for GPU. */ +KERNEL_STRUCT_MEMBER(isect, float3, Ng, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_END(isect) + +/*************** Subsurface closure state for subsurface kernel ***************/ + +KERNEL_STRUCT_BEGIN(subsurface) +KERNEL_STRUCT_MEMBER(subsurface, float3, albedo, KERNEL_FEATURE_SUBSURFACE) +KERNEL_STRUCT_MEMBER(subsurface, float3, radius, KERNEL_FEATURE_SUBSURFACE) +KERNEL_STRUCT_MEMBER(subsurface, float, anisotropy, KERNEL_FEATURE_SUBSURFACE) +KERNEL_STRUCT_MEMBER(subsurface, float, roughness, KERNEL_FEATURE_SUBSURFACE) +KERNEL_STRUCT_END(subsurface) + +/********************************** Volume Stack ******************************/ + +KERNEL_STRUCT_BEGIN(volume_stack) +KERNEL_STRUCT_ARRAY_MEMBER(volume_stack, int, object, KERNEL_FEATURE_VOLUME) +KERNEL_STRUCT_ARRAY_MEMBER(volume_stack, int, shader, KERNEL_FEATURE_VOLUME) +KERNEL_STRUCT_END_ARRAY(volume_stack, INTEGRATOR_VOLUME_STACK_SIZE) + +/********************************* Shadow Path State **************************/ + +KERNEL_STRUCT_BEGIN(shadow_path) +/* Current ray bounce depth. */ +KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, bounce, KERNEL_FEATURE_PATH_TRACING) +/* Current transparent ray bounce depth. */ +KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, transparent_bounce, KERNEL_FEATURE_PATH_TRACING) +/* DeviceKernel bit indicating queued kernels. + * TODO: reduce size? 
*/ +KERNEL_STRUCT_MEMBER(shadow_path, uint32_t, queued_kernel, KERNEL_FEATURE_PATH_TRACING) +/* enum PathRayFlag */ +KERNEL_STRUCT_MEMBER(shadow_path, uint32_t, flag, KERNEL_FEATURE_PATH_TRACING) +/* Throughput. */ +KERNEL_STRUCT_MEMBER(shadow_path, float3, throughput, KERNEL_FEATURE_PATH_TRACING) +/* Throughput for shadow pass. */ +KERNEL_STRUCT_MEMBER(shadow_path, float3, unshadowed_throughput, KERNEL_FEATURE_SHADOW_PASS) +/* Ratio of throughput to distinguish diffuse and glossy render passes. */ +KERNEL_STRUCT_MEMBER(shadow_path, float3, diffuse_glossy_ratio, KERNEL_FEATURE_LIGHT_PASSES) +/* Number of intersections found by ray-tracing. */ +KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, num_hits, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_END(shadow_path) + +/********************************** Shadow Ray *******************************/ + +KERNEL_STRUCT_BEGIN(shadow_ray) +KERNEL_STRUCT_MEMBER(shadow_ray, float3, P, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(shadow_ray, float3, D, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(shadow_ray, float, t, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(shadow_ray, float, time, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(shadow_ray, float, dP, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_END(shadow_ray) + +/*********************** Shadow Intersection result **************************/ + +/* Result from scene intersection. */ +KERNEL_STRUCT_BEGIN(shadow_isect) +KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float, t, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float, u, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float, v, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, int, prim, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, int, object, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, int, type, KERNEL_FEATURE_PATH_TRACING) +/* TODO: exclude for GPU. 
*/ +KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float3, Ng, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_END_ARRAY(shadow_isect, INTEGRATOR_SHADOW_ISECT_SIZE) + +/**************************** Shadow Volume Stack *****************************/ + +KERNEL_STRUCT_BEGIN(shadow_volume_stack) +KERNEL_STRUCT_ARRAY_MEMBER(shadow_volume_stack, int, object, KERNEL_FEATURE_VOLUME) +KERNEL_STRUCT_ARRAY_MEMBER(shadow_volume_stack, int, shader, KERNEL_FEATURE_VOLUME) +KERNEL_STRUCT_END_ARRAY(shadow_volume_stack, INTEGRATOR_VOLUME_STACK_SIZE) diff --git a/intern/cycles/kernel/integrator/integrator_state_util.h b/intern/cycles/kernel/integrator/integrator_state_util.h new file mode 100644 index 00000000000..cdf412fe22f --- /dev/null +++ b/intern/cycles/kernel/integrator/integrator_state_util.h @@ -0,0 +1,273 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "kernel/integrator/integrator_state.h" +#include "kernel/kernel_differential.h" + +CCL_NAMESPACE_BEGIN + +/* Ray */ + +ccl_device_forceinline void integrator_state_write_ray(INTEGRATOR_STATE_ARGS, + const Ray *ccl_restrict ray) +{ + INTEGRATOR_STATE_WRITE(ray, P) = ray->P; + INTEGRATOR_STATE_WRITE(ray, D) = ray->D; + INTEGRATOR_STATE_WRITE(ray, t) = ray->t; + INTEGRATOR_STATE_WRITE(ray, time) = ray->time; + INTEGRATOR_STATE_WRITE(ray, dP) = ray->dP; + INTEGRATOR_STATE_WRITE(ray, dD) = ray->dD; +} + +ccl_device_forceinline void integrator_state_read_ray(INTEGRATOR_STATE_CONST_ARGS, + Ray *ccl_restrict ray) +{ + ray->P = INTEGRATOR_STATE(ray, P); + ray->D = INTEGRATOR_STATE(ray, D); + ray->t = INTEGRATOR_STATE(ray, t); + ray->time = INTEGRATOR_STATE(ray, time); + ray->dP = INTEGRATOR_STATE(ray, dP); + ray->dD = INTEGRATOR_STATE(ray, dD); +} + +/* Shadow Ray */ + +ccl_device_forceinline void integrator_state_write_shadow_ray(INTEGRATOR_STATE_ARGS, + const Ray *ccl_restrict ray) +{ + INTEGRATOR_STATE_WRITE(shadow_ray, P) = ray->P; + INTEGRATOR_STATE_WRITE(shadow_ray, D) = ray->D; + INTEGRATOR_STATE_WRITE(shadow_ray, t) = ray->t; + INTEGRATOR_STATE_WRITE(shadow_ray, time) = ray->time; + INTEGRATOR_STATE_WRITE(shadow_ray, dP) = ray->dP; +} + +ccl_device_forceinline void integrator_state_read_shadow_ray(INTEGRATOR_STATE_CONST_ARGS, + Ray *ccl_restrict ray) +{ + ray->P = INTEGRATOR_STATE(shadow_ray, P); + ray->D = INTEGRATOR_STATE(shadow_ray, D); + ray->t = INTEGRATOR_STATE(shadow_ray, t); + ray->time = INTEGRATOR_STATE(shadow_ray, time); + ray->dP = INTEGRATOR_STATE(shadow_ray, dP); + ray->dD = differential_zero_compact(); +} + +/* Intersection */ + +ccl_device_forceinline void integrator_state_write_isect(INTEGRATOR_STATE_ARGS, + const Intersection *ccl_restrict isect) +{ + INTEGRATOR_STATE_WRITE(isect, t) = isect->t; + INTEGRATOR_STATE_WRITE(isect, u) = isect->u; + INTEGRATOR_STATE_WRITE(isect, v) = isect->v; + 
INTEGRATOR_STATE_WRITE(isect, object) = isect->object; + INTEGRATOR_STATE_WRITE(isect, prim) = isect->prim; + INTEGRATOR_STATE_WRITE(isect, type) = isect->type; +#ifdef __EMBREE__ + INTEGRATOR_STATE_WRITE(isect, Ng) = isect->Ng; +#endif +} + +ccl_device_forceinline void integrator_state_read_isect(INTEGRATOR_STATE_CONST_ARGS, + Intersection *ccl_restrict isect) +{ + isect->prim = INTEGRATOR_STATE(isect, prim); + isect->object = INTEGRATOR_STATE(isect, object); + isect->type = INTEGRATOR_STATE(isect, type); + isect->u = INTEGRATOR_STATE(isect, u); + isect->v = INTEGRATOR_STATE(isect, v); + isect->t = INTEGRATOR_STATE(isect, t); +#ifdef __EMBREE__ + isect->Ng = INTEGRATOR_STATE(isect, Ng); +#endif +} + +ccl_device_forceinline VolumeStack integrator_state_read_volume_stack(INTEGRATOR_STATE_CONST_ARGS, + int i) +{ + VolumeStack entry = {INTEGRATOR_STATE_ARRAY(volume_stack, i, object), + INTEGRATOR_STATE_ARRAY(volume_stack, i, shader)}; + return entry; +} + +ccl_device_forceinline void integrator_state_write_volume_stack(INTEGRATOR_STATE_ARGS, + int i, + VolumeStack entry) +{ + INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, i, object) = entry.object; + INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, i, shader) = entry.shader; +} + +ccl_device_forceinline bool integrator_state_volume_stack_is_empty(INTEGRATOR_STATE_CONST_ARGS) +{ + return (kernel_data.kernel_features & KERNEL_FEATURE_VOLUME) ? 
+ INTEGRATOR_STATE_ARRAY(volume_stack, 0, shader) == SHADER_NONE : + true; +} + +/* Shadow Intersection */ + +ccl_device_forceinline void integrator_state_write_shadow_isect( + INTEGRATOR_STATE_ARGS, const Intersection *ccl_restrict isect, const int index) +{ + INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, t) = isect->t; + INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, u) = isect->u; + INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, v) = isect->v; + INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, object) = isect->object; + INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, prim) = isect->prim; + INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, type) = isect->type; +#ifdef __EMBREE__ + INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, Ng) = isect->Ng; +#endif +} + +ccl_device_forceinline void integrator_state_read_shadow_isect(INTEGRATOR_STATE_CONST_ARGS, + Intersection *ccl_restrict isect, + const int index) +{ + isect->prim = INTEGRATOR_STATE_ARRAY(shadow_isect, index, prim); + isect->object = INTEGRATOR_STATE_ARRAY(shadow_isect, index, object); + isect->type = INTEGRATOR_STATE_ARRAY(shadow_isect, index, type); + isect->u = INTEGRATOR_STATE_ARRAY(shadow_isect, index, u); + isect->v = INTEGRATOR_STATE_ARRAY(shadow_isect, index, v); + isect->t = INTEGRATOR_STATE_ARRAY(shadow_isect, index, t); +#ifdef __EMBREE__ + isect->Ng = INTEGRATOR_STATE_ARRAY(shadow_isect, index, Ng); +#endif +} + +ccl_device_forceinline void integrator_state_copy_volume_stack_to_shadow(INTEGRATOR_STATE_ARGS) +{ + if (kernel_data.kernel_features & KERNEL_FEATURE_VOLUME) { + for (int i = 0; i < INTEGRATOR_VOLUME_STACK_SIZE; i++) { + INTEGRATOR_STATE_ARRAY_WRITE(shadow_volume_stack, i, object) = INTEGRATOR_STATE_ARRAY( + volume_stack, i, object); + INTEGRATOR_STATE_ARRAY_WRITE(shadow_volume_stack, i, shader) = INTEGRATOR_STATE_ARRAY( + volume_stack, i, shader); + } + } +} + +ccl_device_forceinline VolumeStack +integrator_state_read_shadow_volume_stack(INTEGRATOR_STATE_CONST_ARGS, int 
i) +{ + VolumeStack entry = {INTEGRATOR_STATE_ARRAY(shadow_volume_stack, i, object), + INTEGRATOR_STATE_ARRAY(shadow_volume_stack, i, shader)}; + return entry; +} + +ccl_device_forceinline bool integrator_state_shadow_volume_stack_is_empty( + INTEGRATOR_STATE_CONST_ARGS) +{ + return (kernel_data.kernel_features & KERNEL_FEATURE_VOLUME) ? + INTEGRATOR_STATE_ARRAY(shadow_volume_stack, 0, shader) == SHADER_NONE : + true; +} + +ccl_device_forceinline void integrator_state_write_shadow_volume_stack(INTEGRATOR_STATE_ARGS, + int i, + VolumeStack entry) +{ + INTEGRATOR_STATE_ARRAY_WRITE(shadow_volume_stack, i, object) = entry.object; + INTEGRATOR_STATE_ARRAY_WRITE(shadow_volume_stack, i, shader) = entry.shader; +} + +#if defined(__KERNEL_GPU__) +ccl_device_inline void integrator_state_copy_only(const IntegratorState to_state, + const IntegratorState state) +{ + int index; + + /* Rely on the compiler to optimize out unused assignments and `while(false)`'s. */ + +# define KERNEL_STRUCT_BEGIN(name) \ + index = 0; \ + do { + +# define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \ + if (kernel_integrator_state.parent_struct.name != nullptr) { \ + kernel_integrator_state.parent_struct.name[to_state] = \ + kernel_integrator_state.parent_struct.name[state]; \ + } + +# define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \ + if (kernel_integrator_state.parent_struct[index].name != nullptr) { \ + kernel_integrator_state.parent_struct[index].name[to_state] = \ + kernel_integrator_state.parent_struct[index].name[state]; \ + } + +# define KERNEL_STRUCT_END(name) \ + } \ + while (false) \ + ; + +# define KERNEL_STRUCT_END_ARRAY(name, array_size) \ + ++index; \ + } \ + while (index < array_size) \ + ; + +# include "kernel/integrator/integrator_state_template.h" + +# undef KERNEL_STRUCT_BEGIN +# undef KERNEL_STRUCT_MEMBER +# undef KERNEL_STRUCT_ARRAY_MEMBER +# undef KERNEL_STRUCT_END +# undef KERNEL_STRUCT_END_ARRAY +} + +ccl_device_inline void 
integrator_state_move(const IntegratorState to_state, + const IntegratorState state) +{ + integrator_state_copy_only(to_state, state); + + INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0; + INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0; +} + +#endif + +/* NOTE: Leaves kernel scheduling information untouched. Use INIT semantic for one of the paths + * after this function. */ +ccl_device_inline void integrator_state_shadow_catcher_split(INTEGRATOR_STATE_ARGS) +{ +#if defined(__KERNEL_GPU__) + const IntegratorState to_state = atomic_fetch_and_add_uint32( + &kernel_integrator_state.next_shadow_catcher_path_index[0], 1); + + integrator_state_copy_only(to_state, state); + + kernel_integrator_state.path.flag[to_state] |= PATH_RAY_SHADOW_CATCHER_PASS; + + /* Sanity check: expect to split in the intersect-closest kernel, where there is no shadow ray + * and no sorting yet. */ + kernel_assert(INTEGRATOR_STATE(shadow_path, queued_kernel) == 0); + kernel_assert(kernel_integrator_state.sort_key_counter[INTEGRATOR_STATE(path, queued_kernel)] == + nullptr); +#else + + IntegratorStateCPU *ccl_restrict split_state = state + 1; + + *split_state = *state; + + split_state->path.flag |= PATH_RAY_SHADOW_CATCHER_PASS; +#endif +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/integrator/integrator_subsurface.h b/intern/cycles/kernel/integrator/integrator_subsurface.h new file mode 100644 index 00000000000..9490738404e --- /dev/null +++ b/intern/cycles/kernel/integrator/integrator_subsurface.h @@ -0,0 +1,623 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "kernel/kernel_path_state.h" +#include "kernel/kernel_projection.h" +#include "kernel/kernel_shader.h" + +#include "kernel/bvh/bvh.h" + +#include "kernel/closure/alloc.h" +#include "kernel/closure/bsdf_diffuse.h" +#include "kernel/closure/bsdf_principled_diffuse.h" +#include "kernel/closure/bssrdf.h" +#include "kernel/closure/volume.h" + +#include "kernel/integrator/integrator_intersect_volume_stack.h" + +CCL_NAMESPACE_BEGIN + +#ifdef __SUBSURFACE__ + +ccl_device int subsurface_bounce(INTEGRATOR_STATE_ARGS, ShaderData *sd, const ShaderClosure *sc) +{ + /* We should never have two consecutive BSSRDF bounces, the second one should + * be converted to a diffuse BSDF to avoid this. */ + kernel_assert(!(INTEGRATOR_STATE(path, flag) & PATH_RAY_DIFFUSE_ANCESTOR)); + + /* Setup path state for intersect_subsurface kernel. */ + const Bssrdf *bssrdf = (const Bssrdf *)sc; + + /* Setup ray into surface. */ + INTEGRATOR_STATE_WRITE(ray, P) = sd->P; + INTEGRATOR_STATE_WRITE(ray, D) = sd->N; + INTEGRATOR_STATE_WRITE(ray, t) = FLT_MAX; + INTEGRATOR_STATE_WRITE(ray, dP) = differential_make_compact(sd->dP); + INTEGRATOR_STATE_WRITE(ray, dD) = differential_zero_compact(); + + /* Pass along object info, reusing isect to save memory. */ + INTEGRATOR_STATE_WRITE(isect, Ng) = sd->Ng; + INTEGRATOR_STATE_WRITE(isect, object) = sd->object; + + /* Pass BSSRDF parameters. 
*/ + const uint32_t path_flag = INTEGRATOR_STATE_WRITE(path, flag); + INTEGRATOR_STATE_WRITE(path, flag) = (path_flag & ~PATH_RAY_CAMERA) | PATH_RAY_SUBSURFACE; + INTEGRATOR_STATE_WRITE(path, throughput) *= shader_bssrdf_sample_weight(sd, sc); + + if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) { + if (INTEGRATOR_STATE(path, bounce) == 0) { + INTEGRATOR_STATE_WRITE(path, diffuse_glossy_ratio) = one_float3(); + } + } + + INTEGRATOR_STATE_WRITE(subsurface, albedo) = bssrdf->albedo; + INTEGRATOR_STATE_WRITE(subsurface, radius) = bssrdf->radius; + INTEGRATOR_STATE_WRITE(subsurface, roughness) = bssrdf->roughness; + INTEGRATOR_STATE_WRITE(subsurface, anisotropy) = bssrdf->anisotropy; + + return LABEL_SUBSURFACE_SCATTER; +} + +ccl_device void subsurface_shader_data_setup(INTEGRATOR_STATE_ARGS, ShaderData *sd) +{ + /* Get bump mapped normal from shader evaluation at exit point. */ + float3 N = sd->N; + if (sd->flag & SD_HAS_BSSRDF_BUMP) { + N = shader_bssrdf_normal(sd); + } + + /* Setup diffuse BSDF at the exit point. This replaces shader_eval_surface. 
*/ + sd->flag &= ~SD_CLOSURE_FLAGS; + sd->num_closure = 0; + sd->num_closure_left = kernel_data.max_closures; + + const float3 weight = one_float3(); + const float roughness = INTEGRATOR_STATE(subsurface, roughness); + +# ifdef __PRINCIPLED__ + if (roughness != FLT_MAX) { + PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf *)bsdf_alloc( + sd, sizeof(PrincipledDiffuseBsdf), weight); + + if (bsdf) { + bsdf->N = N; + bsdf->roughness = roughness; + sd->flag |= bsdf_principled_diffuse_setup(bsdf); + + /* replace CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID with this special ID so render passes + * can recognize it as not being a regular Disney principled diffuse closure */ + bsdf->type = CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID; + } + } + else +# endif /* __PRINCIPLED__ */ + { + DiffuseBsdf *bsdf = (DiffuseBsdf *)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight); + + if (bsdf) { + bsdf->N = N; + sd->flag |= bsdf_diffuse_setup(bsdf); + + /* replace CLOSURE_BSDF_DIFFUSE_ID with this special ID so render passes + * can recognize it as not being a regular diffuse closure */ + bsdf->type = CLOSURE_BSDF_BSSRDF_ID; + } + } +} + +/* Random walk subsurface scattering. + * + * "Practical and Controllable Subsurface Scattering for Production Path + * Tracing". Matt Jen-Yuan Chiang, Peter Kutz, Brent Burley. SIGGRAPH 2016. */ + +/* Support for anisotropy from: + * "Path Traced Subsurface Scattering using Anisotropic Phase Functions + * and Non-Exponential Free Flights". + * Magnus Wrenninge, Ryusuke Villemin, Christophe Hery. + * https://graphics.pixar.com/library/PathTracedSubsurface/ */ + +ccl_device void subsurface_random_walk_remap( + const float albedo, const float d, float g, float *sigma_t, float *alpha) +{ + /* Compute attenuation and scattering coefficients from albedo. 
*/ + const float g2 = g * g; + const float g3 = g2 * g; + const float g4 = g3 * g; + const float g5 = g4 * g; + const float g6 = g5 * g; + const float g7 = g6 * g; + + const float A = 1.8260523782f + -1.28451056436f * g + -1.79904629312f * g2 + + 9.19393289202f * g3 + -22.8215585862f * g4 + 32.0234874259f * g5 + + -23.6264803333f * g6 + 7.21067002658f * g7; + const float B = 4.98511194385f + + 0.127355959438f * + expf(31.1491581433f * g + -201.847017512f * g2 + 841.576016723f * g3 + + -2018.09288505f * g4 + 2731.71560286f * g5 + -1935.41424244f * g6 + + 559.009054474f * g7); + const float C = 1.09686102424f + -0.394704063468f * g + 1.05258115941f * g2 + + -8.83963712726f * g3 + 28.8643230661f * g4 + -46.8802913581f * g5 + + 38.5402837518f * g6 + -12.7181042538f * g7; + const float D = 0.496310210422f + 0.360146581622f * g + -2.15139309747f * g2 + + 17.8896899217f * g3 + -55.2984010333f * g4 + 82.065982243f * g5 + + -58.5106008578f * g6 + 15.8478295021f * g7; + const float E = 4.23190299701f + + 0.00310603949088f * + expf(76.7316253952f * g + -594.356773233f * g2 + 2448.8834203f * g3 + + -5576.68528998f * g4 + 7116.60171912f * g5 + -4763.54467887f * g6 + + 1303.5318055f * g7); + const float F = 2.40602999408f + -2.51814844609f * g + 9.18494908356f * g2 + + -79.2191708682f * g3 + 259.082868209f * g4 + -403.613804597f * g5 + + 302.85712436f * g6 + -87.4370473567f * g7; + + const float blend = powf(albedo, 0.25f); + + *alpha = (1.0f - blend) * A * powf(atanf(B * albedo), C) + + blend * D * powf(atanf(E * albedo), F); + *alpha = clamp(*alpha, 0.0f, 0.999999f); // because of numerical precision + + float sigma_t_prime = 1.0f / fmaxf(d, 1e-16f); + *sigma_t = sigma_t_prime / (1.0f - g); +} + +ccl_device void subsurface_random_walk_coefficients(const float3 albedo, + const float3 radius, + const float anisotropy, + float3 *sigma_t, + float3 *alpha, + float3 *throughput) +{ + float sigma_t_x, sigma_t_y, sigma_t_z; + float alpha_x, alpha_y, alpha_z; + + 
subsurface_random_walk_remap(albedo.x, radius.x, anisotropy, &sigma_t_x, &alpha_x); + subsurface_random_walk_remap(albedo.y, radius.y, anisotropy, &sigma_t_y, &alpha_y); + subsurface_random_walk_remap(albedo.z, radius.z, anisotropy, &sigma_t_z, &alpha_z); + + /* Throughput already contains closure weight at this point, which includes the + * albedo, as well as closure mixing and Fresnel weights. Divide out the albedo + * which will be added through scattering. */ + *throughput = safe_divide_color(*throughput, albedo); + + /* With low albedo values (like 0.025) we get diffusion_length 1.0 and + * infinite phase functions. To avoid a sharp discontinuity as we go from + * such values to 0.0, increase alpha and reduce the throughput to compensate. */ + const float min_alpha = 0.2f; + if (alpha_x < min_alpha) { + (*throughput).x *= alpha_x / min_alpha; + alpha_x = min_alpha; + } + if (alpha_y < min_alpha) { + (*throughput).y *= alpha_y / min_alpha; + alpha_y = min_alpha; + } + if (alpha_z < min_alpha) { + (*throughput).z *= alpha_z / min_alpha; + alpha_z = min_alpha; + } + + *sigma_t = make_float3(sigma_t_x, sigma_t_y, sigma_t_z); + *alpha = make_float3(alpha_x, alpha_y, alpha_z); +} + +/* References for Dwivedi sampling: + * + * [1] "A Zero-variance-based Sampling Scheme for Monte Carlo Subsurface Scattering" + * by Jaroslav KÅ™ivánek and Eugene d'Eon (SIGGRAPH 2014) + * https://cgg.mff.cuni.cz/~jaroslav/papers/2014-zerovar/ + * + * [2] "Improving the Dwivedi Sampling Scheme" + * by Johannes Meng, Johannes Hanika, and Carsten Dachsbacher (EGSR 2016) + * https://cg.ivd.kit.edu/1951.php + * + * [3] "Zero-Variance Theory for Efficient Subsurface Scattering" + * by Eugene d'Eon and Jaroslav KÅ™ivánek (SIGGRAPH 2020) + * https://iliyan.com/publications/RenderingCourse2020 + */ + +ccl_device_forceinline float eval_phase_dwivedi(float v, float phase_log, float cos_theta) +{ + /* Eq. 
9 from [2] using precomputed log((v + 1) / (v - 1)) */ + return 1.0f / ((v - cos_theta) * phase_log); +} + +ccl_device_forceinline float sample_phase_dwivedi(float v, float phase_log, float rand) +{ + /* Based on Eq. 10 from [2]: `v - (v + 1) * pow((v - 1) / (v + 1), rand)` + * Since we're already pre-computing `phase_log = log((v + 1) / (v - 1))` for the evaluation, + * we can implement the power function like this. */ + return v - (v + 1.0f) * expf(-rand * phase_log); +} + +ccl_device_forceinline float diffusion_length_dwivedi(float alpha) +{ + /* Eq. 67 from [3] */ + return 1.0f / sqrtf(1.0f - powf(alpha, 2.44294f - 0.0215813f * alpha + 0.578637f / alpha)); +} + +ccl_device_forceinline float3 direction_from_cosine(float3 D, float cos_theta, float randv) +{ + float sin_theta = safe_sqrtf(1.0f - cos_theta * cos_theta); + float phi = M_2PI_F * randv; + float3 dir = make_float3(sin_theta * cosf(phi), sin_theta * sinf(phi), cos_theta); + + float3 T, B; + make_orthonormals(D, &T, &B); + return dir.x * T + dir.y * B + dir.z * D; +} + +ccl_device_forceinline float3 subsurface_random_walk_pdf(float3 sigma_t, + float t, + bool hit, + float3 *transmittance) +{ + float3 T = volume_color_transmittance(sigma_t, t); + if (transmittance) { + *transmittance = T; + } + return hit ? 
T : sigma_t * T; +} + +/* Define the below variable to get the similarity code active, + * and the value represents the cutoff level */ +# define SUBSURFACE_RANDOM_WALK_SIMILARITY_LEVEL 9 + +ccl_device_inline bool subsurface_random_walk(INTEGRATOR_STATE_ARGS, + RNGState rng_state, + Ray &ray, + LocalIntersection &ss_isect) +{ + float bssrdf_u, bssrdf_v; + path_state_rng_2D(kg, &rng_state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); + + const float3 P = INTEGRATOR_STATE(ray, P); + const float3 N = INTEGRATOR_STATE(ray, D); + const float ray_dP = INTEGRATOR_STATE(ray, dP); + const float time = INTEGRATOR_STATE(ray, time); + const float3 Ng = INTEGRATOR_STATE(isect, Ng); + const int object = INTEGRATOR_STATE(isect, object); + + /* Sample diffuse surface scatter into the object. */ + float3 D; + float pdf; + sample_cos_hemisphere(-N, bssrdf_u, bssrdf_v, &D, &pdf); + if (dot(-Ng, D) <= 0.0f) { + return false; + } + + /* Setup ray. */ + ray.P = ray_offset(P, -Ng); + ray.D = D; + ray.t = FLT_MAX; + ray.time = time; + ray.dP = ray_dP; + ray.dD = differential_zero_compact(); + +# ifndef __KERNEL_OPTIX__ + /* Compute or fetch object transforms. */ + Transform ob_itfm ccl_optional_struct_init; + Transform ob_tfm = object_fetch_transform_motion_test(kg, object, time, &ob_itfm); +# endif + + /* Convert subsurface to volume coefficients. + * The single-scattering albedo is named alpha to avoid confusion with the surface albedo. 
*/ + const float3 albedo = INTEGRATOR_STATE(subsurface, albedo); + const float3 radius = INTEGRATOR_STATE(subsurface, radius); + const float anisotropy = INTEGRATOR_STATE(subsurface, anisotropy); + + float3 sigma_t, alpha; + float3 throughput = INTEGRATOR_STATE_WRITE(path, throughput); + subsurface_random_walk_coefficients(albedo, radius, anisotropy, &sigma_t, &alpha, &throughput); + float3 sigma_s = sigma_t * alpha; + + /* Theoretically it should be better to use the exact alpha for the channel we're sampling at + * each bounce, but in practice there doesn't seem to be a noticeable difference in exchange + * for making the code significantly more complex and slower (if direction sampling depends on + * the sampled channel, we need to compute its PDF per-channel and consider it for MIS later on). + * + * Since the strength of the guided sampling increases as alpha gets lower, using a value that + * is too low results in fireflies while one that's too high just gives a bit more noise. + * Therefore, the code here uses the highest of the three albedos to be safe. */ + const float diffusion_length = diffusion_length_dwivedi(max3(alpha)); + + if (diffusion_length == 1.0f) { + /* With specific values of alpha the length might become 1, which in asymptotic makes phase to + * be infinite. After first bounce it will cause throughput to be 0. Do early output, avoiding + * numerical issues and extra unneeded work. */ + return false; + } + + /* Precompute term for phase sampling. */ + const float phase_log = logf((diffusion_length + 1.0f) / (diffusion_length - 1.0f)); + + /* Modify state for RNGs, decorrelated from other paths. */ + rng_state.rng_hash = cmj_hash(rng_state.rng_hash + rng_state.rng_offset, 0xdeadbeef); + + /* Random walk until we hit the surface again. */ + bool hit = false; + bool have_opposite_interface = false; + float opposite_distance = 0.0f; + + /* Todo: Disable for alpha>0.999 or so? */ + /* Our heuristic, a compromise between guiding and classic. 
*/ + const float guided_fraction = 1.0f - fmaxf(0.5f, powf(fabsf(anisotropy), 0.125f)); + +# ifdef SUBSURFACE_RANDOM_WALK_SIMILARITY_LEVEL + float3 sigma_s_star = sigma_s * (1.0f - anisotropy); + float3 sigma_t_star = sigma_t - sigma_s + sigma_s_star; + float3 sigma_t_org = sigma_t; + float3 sigma_s_org = sigma_s; + const float anisotropy_org = anisotropy; + const float guided_fraction_org = guided_fraction; +# endif + + for (int bounce = 0; bounce < BSSRDF_MAX_BOUNCES; bounce++) { + /* Advance random number offset. */ + rng_state.rng_offset += PRNG_BOUNCE_NUM; + +# ifdef SUBSURFACE_RANDOM_WALK_SIMILARITY_LEVEL + // shadow with local variables according to depth + float anisotropy, guided_fraction; + float3 sigma_s, sigma_t; + if (bounce <= SUBSURFACE_RANDOM_WALK_SIMILARITY_LEVEL) { + anisotropy = anisotropy_org; + guided_fraction = guided_fraction_org; + sigma_t = sigma_t_org; + sigma_s = sigma_s_org; + } + else { + anisotropy = 0.0f; + guided_fraction = 0.75f; // back to isotropic heuristic from Blender + sigma_t = sigma_t_star; + sigma_s = sigma_s_star; + } +# endif + + /* Sample color channel, use MIS with balance heuristic. */ + float rphase = path_state_rng_1D(kg, &rng_state, PRNG_PHASE_CHANNEL); + float3 channel_pdf; + int channel = volume_sample_channel(alpha, throughput, rphase, &channel_pdf); + float sample_sigma_t = volume_channel_get(sigma_t, channel); + float randt = path_state_rng_1D(kg, &rng_state, PRNG_SCATTER_DISTANCE); + + /* We need the result of the raycast to compute the full guided PDF, so just remember the + * relevant terms to avoid recomputing them later. */ + float backward_fraction = 0.0f; + float forward_pdf_factor = 0.0f; + float forward_stretching = 1.0f; + float backward_pdf_factor = 0.0f; + float backward_stretching = 1.0f; + + /* For the initial ray, we already know the direction, so just do classic distance sampling. */ + if (bounce > 0) { + /* Decide whether we should use guided or classic sampling. 
*/ + bool guided = (path_state_rng_1D(kg, &rng_state, PRNG_LIGHT_TERMINATE) < guided_fraction); + + /* Determine if we want to sample away from the incoming interface. + * This only happens if we found a nearby opposite interface, and the probability for it + * depends on how close we are to it already. + * This probability term comes from the recorded presentation of [3]. */ + bool guide_backward = false; + if (have_opposite_interface) { + /* Compute distance of the random walk between the tangent plane at the starting point + * and the assumed opposite interface (the parallel plane that contains the point we + * found in our ray query for the opposite side). */ + float x = clamp(dot(ray.P - P, -N), 0.0f, opposite_distance); + backward_fraction = 1.0f / + (1.0f + expf((opposite_distance - 2.0f * x) / diffusion_length)); + guide_backward = path_state_rng_1D(kg, &rng_state, PRNG_TERMINATE) < backward_fraction; + } + + /* Sample scattering direction. */ + float scatter_u, scatter_v; + path_state_rng_2D(kg, &rng_state, PRNG_BSDF_U, &scatter_u, &scatter_v); + float cos_theta; + float hg_pdf; + if (guided) { + cos_theta = sample_phase_dwivedi(diffusion_length, phase_log, scatter_u); + /* The backwards guiding distribution is just mirrored along sd->N, so swapping the + * sign here is enough to sample from that instead. */ + if (guide_backward) { + cos_theta = -cos_theta; + } + float3 newD = direction_from_cosine(N, cos_theta, scatter_v); + hg_pdf = single_peaked_henyey_greenstein(dot(ray.D, newD), anisotropy); + ray.D = newD; + } + else { + float3 newD = henyey_greenstrein_sample(ray.D, anisotropy, scatter_u, scatter_v, &hg_pdf); + cos_theta = dot(newD, N); + ray.D = newD; + } + + /* Compute PDF factor caused by phase sampling (as the ratio of guided / classic). 
+ * Since phase sampling is channel-independent, we can get away with applying a factor + * to the guided PDF, which implicitly means pulling out the classic PDF term and letting + * it cancel with an equivalent term in the numerator of the full estimator. + * For the backward PDF, we again reuse the same probability distribution with a sign swap. + */ + forward_pdf_factor = M_1_2PI_F * eval_phase_dwivedi(diffusion_length, phase_log, cos_theta) / + hg_pdf; + backward_pdf_factor = M_1_2PI_F * + eval_phase_dwivedi(diffusion_length, phase_log, -cos_theta) / hg_pdf; + + /* Prepare distance sampling. + * For the backwards case, this also needs the sign swapped since now directions against + * sd->N (and therefore with negative cos_theta) are preferred. */ + forward_stretching = (1.0f - cos_theta / diffusion_length); + backward_stretching = (1.0f + cos_theta / diffusion_length); + if (guided) { + sample_sigma_t *= guide_backward ? backward_stretching : forward_stretching; + } + } + + /* Sample direction along ray. */ + float t = -logf(1.0f - randt) / sample_sigma_t; + + /* On the first bounce, we use the raycast to check if the opposite side is nearby. + * If yes, we will later use backwards guided sampling in order to have a decent + * chance of connecting to it. + * Todo: Maybe use less than 10 times the mean free path? */ + ray.t = (bounce == 0) ? max(t, 10.0f / (min3(sigma_t))) : t; + scene_intersect_local(kg, &ray, &ss_isect, object, NULL, 1); + hit = (ss_isect.num_hits > 0); + + if (hit) { +# ifdef __KERNEL_OPTIX__ + /* t is always in world space with OptiX. */ + ray.t = ss_isect.hits[0].t; +# else + /* Compute world space distance to surface hit. */ + float3 D = transform_direction(&ob_itfm, ray.D); + D = normalize(D) * ss_isect.hits[0].t; + ray.t = len(transform_direction(&ob_tfm, D)); +# endif + } + + if (bounce == 0) { + /* Check if we hit the opposite side. 
*/ + if (hit) { + have_opposite_interface = true; + opposite_distance = dot(ray.P + ray.t * ray.D - P, -N); + } + /* Apart from the opposite side check, we were supposed to only trace up to distance t, + * so check if there would have been a hit in that case. */ + hit = ray.t < t; + } + + /* Use the distance to the exit point for the throughput update if we found one. */ + if (hit) { + t = ray.t; + } + else if (bounce == 0) { + /* Restore original position if nothing was hit after the first bounce, + * without the ray_offset() that was added to avoid self-intersection. + * Otherwise if that offset is relatively large compared to the scattering + * radius, we never go back up high enough to exit the surface. */ + ray.P = P; + } + + /* Advance to new scatter location. */ + ray.P += t * ray.D; + + float3 transmittance; + float3 pdf = subsurface_random_walk_pdf(sigma_t, t, hit, &transmittance); + if (bounce > 0) { + /* Compute PDF just like we do for classic sampling, but with the stretched sigma_t. */ + float3 guided_pdf = subsurface_random_walk_pdf(forward_stretching * sigma_t, t, hit, NULL); + + if (have_opposite_interface) { + /* First step of MIS: Depending on geometry we might have two methods for guided + * sampling, so perform MIS between them. */ + float3 back_pdf = subsurface_random_walk_pdf(backward_stretching * sigma_t, t, hit, NULL); + guided_pdf = mix( + guided_pdf * forward_pdf_factor, back_pdf * backward_pdf_factor, backward_fraction); + } + else { + /* Just include phase sampling factor otherwise. */ + guided_pdf *= forward_pdf_factor; + } + + /* Now we apply the MIS balance heuristic between the classic and guided sampling. */ + pdf = mix(pdf, guided_pdf, guided_fraction); + } + + /* Finally, we're applying MIS again to combine the three color channels. + * Altogether, the MIS computation combines up to nine different estimators: + * {classic, guided, backward_guided} x {r, g, b} */ + throughput *= (hit ? 
transmittance : sigma_s * transmittance) / dot(channel_pdf, pdf); + + if (hit) { + /* If we hit the surface, we are done. */ + break; + } + else if (throughput.x < VOLUME_THROUGHPUT_EPSILON && + throughput.y < VOLUME_THROUGHPUT_EPSILON && + throughput.z < VOLUME_THROUGHPUT_EPSILON) { + /* Avoid unnecessary work and precision issue when throughput gets really small. */ + break; + } + } + + if (hit) { + kernel_assert(isfinite3_safe(throughput)); + INTEGRATOR_STATE_WRITE(path, throughput) = throughput; + } + + return hit; +} + +ccl_device_inline bool subsurface_scatter(INTEGRATOR_STATE_ARGS) +{ + RNGState rng_state; + path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state); + + Ray ray ccl_optional_struct_init; + LocalIntersection ss_isect ccl_optional_struct_init; + + if (!subsurface_random_walk(INTEGRATOR_STATE_PASS, rng_state, ray, ss_isect)) { + return false; + } + +# ifdef __VOLUME__ + /* Update volume stack if needed. */ + if (kernel_data.integrator.use_volumes) { + const int object = intersection_get_object(kg, &ss_isect.hits[0]); + const int object_flag = kernel_tex_fetch(__object_flag, object); + + if (object_flag & SD_OBJECT_INTERSECTS_VOLUME) { + float3 P = INTEGRATOR_STATE(ray, P); + const float3 Ng = INTEGRATOR_STATE(isect, Ng); + const float3 offset_P = ray_offset(P, -Ng); + + integrator_volume_stack_update_for_subsurface(INTEGRATOR_STATE_PASS, offset_P, ray.P); + } + } +# endif /* __VOLUME__ */ + + /* Pretend ray is coming from the outside towards the exit point. This ensures + * correct front/back facing normals. + * TODO: find a more elegant solution? */ + ray.P += ray.D * ray.t * 2.0f; + ray.D = -ray.D; + + integrator_state_write_isect(INTEGRATOR_STATE_PASS, &ss_isect.hits[0]); + integrator_state_write_ray(INTEGRATOR_STATE_PASS, &ray); + + /* Advanced random number offset for bounce. 
*/ + INTEGRATOR_STATE_WRITE(path, rng_offset) += PRNG_BOUNCE_NUM; + + const int shader = intersection_get_shader(kg, &ss_isect.hits[0]); + const int shader_flags = kernel_tex_fetch(__shaders, shader).flags; + if ((shader_flags & SD_HAS_RAYTRACE) || (kernel_data.film.pass_ao != PASS_UNUSED)) { + INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE, + DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, + shader); + } + else { + INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE, + DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, + shader); + } + + return true; +} + +#endif /* __SUBSURFACE__ */ + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/integrator/integrator_volume_stack.h b/intern/cycles/kernel/integrator/integrator_volume_stack.h new file mode 100644 index 00000000000..d53070095f0 --- /dev/null +++ b/intern/cycles/kernel/integrator/integrator_volume_stack.h @@ -0,0 +1,223 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +CCL_NAMESPACE_BEGIN + +/* Volume Stack + * + * This is an array of object/shared ID's that the current segment of the path + * is inside of. 
*/ + +template<typename StackReadOp, typename StackWriteOp> +ccl_device void volume_stack_enter_exit(INTEGRATOR_STATE_ARGS, + const ShaderData *sd, + StackReadOp stack_read, + StackWriteOp stack_write) +{ + /* todo: we should have some way for objects to indicate if they want the + * world shader to work inside them. excluding it by default is problematic + * because non-volume objects can't be assumed to be closed manifolds */ + if (!(sd->flag & SD_HAS_VOLUME)) { + return; + } + + if (sd->flag & SD_BACKFACING) { + /* Exit volume object: remove from stack. */ + for (int i = 0;; i++) { + VolumeStack entry = stack_read(i); + if (entry.shader == SHADER_NONE) { + break; + } + + if (entry.object == sd->object) { + /* Shift back next stack entries. */ + do { + entry = stack_read(i + 1); + stack_write(i, entry); + i++; + } while (entry.shader != SHADER_NONE); + + return; + } + } + } + else { + /* Enter volume object: add to stack. */ + int i; + for (i = 0;; i++) { + VolumeStack entry = stack_read(i); + if (entry.shader == SHADER_NONE) { + break; + } + + /* Already in the stack? then we have nothing to do. */ + if (entry.object == sd->object) { + return; + } + } + + /* If we exceed the stack limit, ignore. */ + if (i >= VOLUME_STACK_SIZE - 1) { + return; + } + + /* Add to the end of the stack. 
*/ + const VolumeStack new_entry = {sd->object, sd->shader}; + const VolumeStack empty_entry = {OBJECT_NONE, SHADER_NONE}; + stack_write(i, new_entry); + stack_write(i + 1, empty_entry); + } +} + +ccl_device void volume_stack_enter_exit(INTEGRATOR_STATE_ARGS, const ShaderData *sd) +{ + volume_stack_enter_exit( + INTEGRATOR_STATE_PASS, + sd, + [=](const int i) { return integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i); }, + [=](const int i, const VolumeStack entry) { + integrator_state_write_volume_stack(INTEGRATOR_STATE_PASS, i, entry); + }); +} + +ccl_device void shadow_volume_stack_enter_exit(INTEGRATOR_STATE_ARGS, const ShaderData *sd) +{ + volume_stack_enter_exit( + INTEGRATOR_STATE_PASS, + sd, + [=](const int i) { + return integrator_state_read_shadow_volume_stack(INTEGRATOR_STATE_PASS, i); + }, + [=](const int i, const VolumeStack entry) { + integrator_state_write_shadow_volume_stack(INTEGRATOR_STATE_PASS, i, entry); + }); +} + +/* Clean stack after the last bounce. + * + * It is expected that all volumes are closed manifolds, so at the time when ray + * hits nothing (for example, it is a last bounce which goes to environment) the + * only expected volume in the stack is the world's one. All the rest volume + * entries should have been exited already. + * + * This isn't always true because of ray intersection precision issues, which + * could lead us to an infinite non-world volume in the stack, causing render + * artifacts. + * + * Use this function after the last bounce to get rid of all volumes apart from + * the world's one after the last bounce to avoid render artifacts. + */ +ccl_device_inline void volume_stack_clean(INTEGRATOR_STATE_ARGS) +{ + if (kernel_data.background.volume_shader != SHADER_NONE) { + /* Keep the world's volume in stack. 
*/ + INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, 1, shader) = SHADER_NONE; + } + else { + INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, 0, shader) = SHADER_NONE; + } +} + +template<typename StackReadOp> +ccl_device float volume_stack_step_size(INTEGRATOR_STATE_ARGS, StackReadOp stack_read) +{ + float step_size = FLT_MAX; + + for (int i = 0;; i++) { + VolumeStack entry = stack_read(i); + if (entry.shader == SHADER_NONE) { + break; + } + + int shader_flag = kernel_tex_fetch(__shaders, (entry.shader & SHADER_MASK)).flags; + + bool heterogeneous = false; + + if (shader_flag & SD_HETEROGENEOUS_VOLUME) { + heterogeneous = true; + } + else if (shader_flag & SD_NEED_VOLUME_ATTRIBUTES) { + /* We want to render world or objects without any volume grids + * as homogeneous, but can only verify this at run-time since other + * heterogeneous volume objects may be using the same shader. */ + int object = entry.object; + if (object != OBJECT_NONE) { + int object_flag = kernel_tex_fetch(__object_flag, object); + if (object_flag & SD_OBJECT_HAS_VOLUME_ATTRIBUTES) { + heterogeneous = true; + } + } + } + + if (heterogeneous) { + float object_step_size = object_volume_step_size(kg, entry.object); + object_step_size *= kernel_data.integrator.volume_step_rate; + step_size = fminf(object_step_size, step_size); + } + } + + return step_size; +} + +typedef enum VolumeSampleMethod { + VOLUME_SAMPLE_NONE = 0, + VOLUME_SAMPLE_DISTANCE = (1 << 0), + VOLUME_SAMPLE_EQUIANGULAR = (1 << 1), + VOLUME_SAMPLE_MIS = (VOLUME_SAMPLE_DISTANCE | VOLUME_SAMPLE_EQUIANGULAR), +} VolumeSampleMethod; + +ccl_device VolumeSampleMethod volume_stack_sample_method(INTEGRATOR_STATE_ARGS) +{ + VolumeSampleMethod method = VOLUME_SAMPLE_NONE; + + for (int i = 0;; i++) { + VolumeStack entry = integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i); + if (entry.shader == SHADER_NONE) { + break; + } + + int shader_flag = kernel_tex_fetch(__shaders, (entry.shader & SHADER_MASK)).flags; + + if (shader_flag & SD_VOLUME_MIS) { 
+ /* Multiple importance sampling. */ + return VOLUME_SAMPLE_MIS; + } + else if (shader_flag & SD_VOLUME_EQUIANGULAR) { + /* Distance + equiangular sampling -> multiple importance sampling. */ + if (method == VOLUME_SAMPLE_DISTANCE) { + return VOLUME_SAMPLE_MIS; + } + + /* Only equiangular sampling. */ + method = VOLUME_SAMPLE_EQUIANGULAR; + } + else { + /* Distance + equiangular sampling -> multiple importance sampling. */ + if (method == VOLUME_SAMPLE_EQUIANGULAR) { + return VOLUME_SAMPLE_MIS; + } + + /* Distance sampling only. */ + method = VOLUME_SAMPLE_DISTANCE; + } + } + + return method; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h index 61653d328f1..9e12d24dcf4 100644 --- a/intern/cycles/kernel/kernel_accumulate.h +++ b/intern/cycles/kernel/kernel_accumulate.h @@ -14,751 +14,501 @@ * limitations under the License. */ +#pragma once + +#include "kernel_adaptive_sampling.h" +#include "kernel_random.h" +#include "kernel_shadow_catcher.h" +#include "kernel_write_passes.h" + CCL_NAMESPACE_BEGIN -/* BSDF Eval +/* -------------------------------------------------------------------- + * BSDF Evaluation * - * BSDF evaluation result, split per BSDF type. This is used to accumulate - * render passes separately. */ - -ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, const ShaderData *sd); + * BSDF evaluation result, split between diffuse and glossy. This is used to + * accumulate render passes separately. Note that reflection, transmission + * and volume scattering are written to different render passes, but we assume + * that only one of those can happen at a bounce, and so do not need to accumulate + * them separately. 
*/ -ccl_device_inline void bsdf_eval_init(BsdfEval *eval, - ClosureType type, - float3 value, - int use_light_pass) +ccl_device_inline void bsdf_eval_init(BsdfEval *eval, const bool is_diffuse, float3 value) { -#ifdef __PASSES__ - eval->use_light_pass = use_light_pass; - - if (eval->use_light_pass) { - eval->diffuse = zero_float3(); - eval->glossy = zero_float3(); - eval->transmission = zero_float3(); - eval->transparent = zero_float3(); - eval->volume = zero_float3(); - - if (type == CLOSURE_BSDF_TRANSPARENT_ID) - eval->transparent = value; - else if (CLOSURE_IS_BSDF_DIFFUSE(type) || CLOSURE_IS_BSDF_BSSRDF(type)) - eval->diffuse = value; - else if (CLOSURE_IS_BSDF_GLOSSY(type)) - eval->glossy = value; - else if (CLOSURE_IS_BSDF_TRANSMISSION(type)) - eval->transmission = value; - else if (CLOSURE_IS_PHASE(type)) - eval->volume = value; - } - else -#endif - { + eval->diffuse = zero_float3(); + eval->glossy = zero_float3(); + + if (is_diffuse) { eval->diffuse = value; } -#ifdef __SHADOW_TRICKS__ - eval->sum_no_mis = zero_float3(); -#endif + else { + eval->glossy = value; + } } ccl_device_inline void bsdf_eval_accum(BsdfEval *eval, - ClosureType type, + const bool is_diffuse, float3 value, float mis_weight) { -#ifdef __SHADOW_TRICKS__ - eval->sum_no_mis += value; -#endif value *= mis_weight; -#ifdef __PASSES__ - if (eval->use_light_pass) { - if (CLOSURE_IS_BSDF_DIFFUSE(type) || CLOSURE_IS_BSDF_BSSRDF(type)) - eval->diffuse += value; - else if (CLOSURE_IS_BSDF_GLOSSY(type)) - eval->glossy += value; - else if (CLOSURE_IS_BSDF_TRANSMISSION(type)) - eval->transmission += value; - else if (CLOSURE_IS_PHASE(type)) - eval->volume += value; - - /* skipping transparent, this function is used by for eval(), will be zero then */ - } - else -#endif - { - eval->diffuse += value; - } -} -ccl_device_inline bool bsdf_eval_is_zero(BsdfEval *eval) -{ -#ifdef __PASSES__ - if (eval->use_light_pass) { - return is_zero(eval->diffuse) && is_zero(eval->glossy) && is_zero(eval->transmission) 
&& - is_zero(eval->transparent) && is_zero(eval->volume); + if (is_diffuse) { + eval->diffuse += value; } - else -#endif - { - return is_zero(eval->diffuse); + else { + eval->glossy += value; } } -ccl_device_inline void bsdf_eval_mis(BsdfEval *eval, float value) +ccl_device_inline bool bsdf_eval_is_zero(BsdfEval *eval) { -#ifdef __PASSES__ - if (eval->use_light_pass) { - eval->diffuse *= value; - eval->glossy *= value; - eval->transmission *= value; - eval->volume *= value; - - /* skipping transparent, this function is used by for eval(), will be zero then */ - } - else -#endif - { - eval->diffuse *= value; - } + return is_zero(eval->diffuse) && is_zero(eval->glossy); } ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float value) { -#ifdef __SHADOW_TRICKS__ - eval->sum_no_mis *= value; -#endif - bsdf_eval_mis(eval, value); + eval->diffuse *= value; + eval->glossy *= value; } ccl_device_inline void bsdf_eval_mul3(BsdfEval *eval, float3 value) { -#ifdef __SHADOW_TRICKS__ - eval->sum_no_mis *= value; -#endif -#ifdef __PASSES__ - if (eval->use_light_pass) { - eval->diffuse *= value; - eval->glossy *= value; - eval->transmission *= value; - eval->volume *= value; - - /* skipping transparent, this function is used by for eval(), will be zero then */ - } - else - eval->diffuse *= value; -#else eval->diffuse *= value; -#endif + eval->glossy *= value; } ccl_device_inline float3 bsdf_eval_sum(const BsdfEval *eval) { -#ifdef __PASSES__ - if (eval->use_light_pass) { - return eval->diffuse + eval->glossy + eval->transmission + eval->volume; - } - else -#endif - return eval->diffuse; + return eval->diffuse + eval->glossy; } -/* Path Radiance - * - * We accumulate different render passes separately. After summing at the end - * to get the combined result, it should be identical. We definite directly - * visible as the first non-transparent hit, while indirectly visible are the - * bounces after that. 
*/ - -ccl_device_inline void path_radiance_init(KernelGlobals *kg, PathRadiance *L) +ccl_device_inline float3 bsdf_eval_diffuse_glossy_ratio(const BsdfEval *eval) { - /* clear all */ -#ifdef __PASSES__ - L->use_light_pass = kernel_data.film.use_light_pass; - - if (kernel_data.film.use_light_pass) { - L->indirect = zero_float3(); - L->direct_emission = zero_float3(); - - L->color_diffuse = zero_float3(); - L->color_glossy = zero_float3(); - L->color_transmission = zero_float3(); - - L->direct_diffuse = zero_float3(); - L->direct_glossy = zero_float3(); - L->direct_transmission = zero_float3(); - L->direct_volume = zero_float3(); - - L->indirect_diffuse = zero_float3(); - L->indirect_glossy = zero_float3(); - L->indirect_transmission = zero_float3(); - L->indirect_volume = zero_float3(); - - L->transparent = 0.0f; - L->emission = zero_float3(); - L->background = zero_float3(); - L->ao = zero_float3(); - L->shadow = zero_float3(); - L->mist = 0.0f; - - L->state.diffuse = zero_float3(); - L->state.glossy = zero_float3(); - L->state.transmission = zero_float3(); - L->state.volume = zero_float3(); - L->state.direct = zero_float3(); - } - else -#endif - { - L->transparent = 0.0f; - L->emission = zero_float3(); - } - -#ifdef __SHADOW_TRICKS__ - L->path_total = zero_float3(); - L->path_total_shaded = zero_float3(); - L->shadow_background_color = zero_float3(); - L->shadow_throughput = 0.0f; - L->shadow_transparency = 1.0f; - L->has_shadow_catcher = 0; -#endif - -#ifdef __DENOISING_FEATURES__ - L->denoising_normal = zero_float3(); - L->denoising_albedo = zero_float3(); - L->denoising_depth = 0.0f; -#endif + /* Ratio of diffuse and glossy to recover proportions for writing to render pass. + * We assume reflection, transmission and volume scatter to be exclusive. 
*/ + return safe_divide_float3_float3(eval->diffuse, eval->diffuse + eval->glossy); } -ccl_device_inline void path_radiance_bsdf_bounce(KernelGlobals *kg, - PathRadianceState *L_state, - ccl_addr_space float3 *throughput, - BsdfEval *bsdf_eval, - float bsdf_pdf, - int bounce, - int bsdf_label) -{ - float inverse_pdf = 1.0f / bsdf_pdf; - -#ifdef __PASSES__ - if (kernel_data.film.use_light_pass) { - if (bounce == 0 && !(bsdf_label & LABEL_TRANSPARENT)) { - /* first on directly visible surface */ - float3 value = *throughput * inverse_pdf; - - L_state->diffuse = bsdf_eval->diffuse * value; - L_state->glossy = bsdf_eval->glossy * value; - L_state->transmission = bsdf_eval->transmission * value; - L_state->volume = bsdf_eval->volume * value; - - *throughput = L_state->diffuse + L_state->glossy + L_state->transmission + L_state->volume; +/* -------------------------------------------------------------------- + * Clamping + * + * Clamping is done on a per-contribution basis so that we can write directly + * to render buffers instead of using per-thread memory, and to avoid the + * impact of clamping on other contributions. */ - L_state->direct = *throughput; - } - else { - /* transparent bounce before first hit, or indirectly visible through BSDF */ - float3 sum = (bsdf_eval_sum(bsdf_eval) + bsdf_eval->transparent) * inverse_pdf; - *throughput *= sum; - } +ccl_device_forceinline void kernel_accum_clamp(const KernelGlobals *kg, float3 *L, int bounce) +{ +#ifdef __KERNEL_DEBUG_NAN__ + if (!isfinite3_safe(*L)) { + kernel_assert(!"Cycles sample with non-finite value detected"); } - else #endif - { - *throughput *= bsdf_eval->diffuse * inverse_pdf; - } -} + /* Make sure all components are finite, allowing the contribution to be usable by adaptive + * sampling convergence check, but also to make it so render result never causes issues with + * post-processing. 
*/ + *L = ensure_finite3(*L); #ifdef __CLAMP_SAMPLE__ -ccl_device_forceinline void path_radiance_clamp(KernelGlobals *kg, float3 *L, int bounce) -{ float limit = (bounce > 0) ? kernel_data.integrator.sample_clamp_indirect : kernel_data.integrator.sample_clamp_direct; float sum = reduce_add(fabs(*L)); if (sum > limit) { *L *= limit / sum; } +#endif } -ccl_device_forceinline void path_radiance_clamp_throughput(KernelGlobals *kg, - float3 *L, - float3 *throughput, - int bounce) -{ - float limit = (bounce > 0) ? kernel_data.integrator.sample_clamp_indirect : - kernel_data.integrator.sample_clamp_direct; +/* -------------------------------------------------------------------- + * Pass accumulation utilities. + */ - float sum = reduce_add(fabs(*L)); - if (sum > limit) { - float clamp_factor = limit / sum; - *L *= clamp_factor; - *throughput *= clamp_factor; - } +/* Get pointer to pixel in render buffer. */ +ccl_device_forceinline ccl_global float *kernel_accum_pixel_render_buffer( + INTEGRATOR_STATE_CONST_ARGS, ccl_global float *ccl_restrict render_buffer) +{ + const uint32_t render_pixel_index = INTEGRATOR_STATE(path, render_pixel_index); + const uint64_t render_buffer_offset = (uint64_t)render_pixel_index * + kernel_data.film.pass_stride; + return render_buffer + render_buffer_offset; } -#endif +/* -------------------------------------------------------------------- + * Adaptive sampling. 
+ */ -ccl_device_inline void path_radiance_accum_emission(KernelGlobals *kg, - PathRadiance *L, - ccl_addr_space PathState *state, - float3 throughput, - float3 value) +ccl_device_inline int kernel_accum_sample(INTEGRATOR_STATE_CONST_ARGS, + ccl_global float *ccl_restrict render_buffer, + int sample) { -#ifdef __SHADOW_TRICKS__ - if (state->flag & PATH_RAY_SHADOW_CATCHER) { - return; + if (kernel_data.film.pass_sample_count == PASS_UNUSED) { + return sample; } -#endif - float3 contribution = throughput * value; -#ifdef __CLAMP_SAMPLE__ - path_radiance_clamp(kg, &contribution, state->bounce - 1); -#endif + ccl_global float *buffer = kernel_accum_pixel_render_buffer(INTEGRATOR_STATE_PASS, + render_buffer); -#ifdef __PASSES__ - if (L->use_light_pass) { - if (state->bounce == 0) - L->emission += contribution; - else if (state->bounce == 1) - L->direct_emission += contribution; - else - L->indirect += contribution; - } - else -#endif - { - L->emission += contribution; - } + return atomic_fetch_and_add_uint32((uint *)(buffer) + kernel_data.film.pass_sample_count, 1); } -ccl_device_inline void path_radiance_accum_ao(KernelGlobals *kg, - PathRadiance *L, - ccl_addr_space PathState *state, - float3 throughput, - float3 alpha, - float3 bsdf, - float3 ao) +ccl_device void kernel_accum_adaptive_buffer(INTEGRATOR_STATE_CONST_ARGS, + const float3 contribution, + ccl_global float *ccl_restrict buffer) { -#ifdef __PASSES__ - /* Store AO pass. */ - if (L->use_light_pass && state->bounce == 0) { - L->ao += alpha * throughput * ao; - } -#endif - -#ifdef __SHADOW_TRICKS__ - /* For shadow catcher, accumulate ratio. */ - if (state->flag & PATH_RAY_STORE_SHADOW_INFO) { - float3 light = throughput * bsdf; - L->path_total += light; - L->path_total_shaded += ao * light; + /* Adaptive Sampling. Fill the additional buffer with the odd samples and calculate our stopping + * criteria. 
This is the heuristic from "A hierarchical automatic stopping condition for Monte + * Carlo global illumination" except that here it is applied per pixel and not in hierarchical + * tiles. */ - if (state->flag & PATH_RAY_SHADOW_CATCHER) { - return; - } + if (kernel_data.film.pass_adaptive_aux_buffer == PASS_UNUSED) { + return; } -#endif - - float3 contribution = throughput * bsdf * ao; -#ifdef __PASSES__ - if (L->use_light_pass) { - if (state->bounce == 0) { - /* Directly visible lighting. */ - L->direct_diffuse += contribution; - } - else { - /* Indirectly visible lighting after BSDF bounce. */ - L->indirect += contribution; - } - } - else -#endif - { - L->emission += contribution; + const int sample = INTEGRATOR_STATE(path, sample); + if (sample_is_even(kernel_data.integrator.sampling_pattern, sample)) { + kernel_write_pass_float4( + buffer + kernel_data.film.pass_adaptive_aux_buffer, + make_float4(contribution.x * 2.0f, contribution.y * 2.0f, contribution.z * 2.0f, 0.0f)); } } -ccl_device_inline void path_radiance_accum_total_ao(PathRadiance *L, - ccl_addr_space PathState *state, - float3 throughput, - float3 bsdf) -{ -#ifdef __SHADOW_TRICKS__ - if (state->flag & PATH_RAY_STORE_SHADOW_INFO) { - L->path_total += throughput * bsdf; - } -#else - (void)L; - (void)state; - (void)throughput; - (void)bsdf; -#endif -} +/* -------------------------------------------------------------------- + * Shadow catcher. + */ + +#ifdef __SHADOW_CATCHER__ -ccl_device_inline void path_radiance_accum_light(KernelGlobals *kg, - PathRadiance *L, - ccl_addr_space PathState *state, - float3 throughput, - BsdfEval *bsdf_eval, - float3 shadow, - float shadow_fac, - bool is_lamp) +/* Accumulate contribution to the Shadow Catcher pass. + * + * Returns truth if the contribution is fully handled here and is not to be added to the other + * passes (like combined, adaptive sampling). 
*/ + +ccl_device bool kernel_accum_shadow_catcher(INTEGRATOR_STATE_CONST_ARGS, + const float3 contribution, + ccl_global float *ccl_restrict buffer) { -#ifdef __SHADOW_TRICKS__ - if (state->flag & PATH_RAY_STORE_SHADOW_INFO) { - float3 light = throughput * bsdf_eval->sum_no_mis; - L->path_total += light; - L->path_total_shaded += shadow * light; - - if (state->flag & PATH_RAY_SHADOW_CATCHER) { - return; - } + if (!kernel_data.integrator.has_shadow_catcher) { + return false; } -#endif - float3 shaded_throughput = throughput * shadow; + kernel_assert(kernel_data.film.pass_shadow_catcher != PASS_UNUSED); + kernel_assert(kernel_data.film.pass_shadow_catcher_matte != PASS_UNUSED); -#ifdef __PASSES__ - if (L->use_light_pass) { - /* Compute the clamping based on the total contribution. - * The resulting scale is then be applied to all individual components. */ - float3 full_contribution = shaded_throughput * bsdf_eval_sum(bsdf_eval); -# ifdef __CLAMP_SAMPLE__ - path_radiance_clamp_throughput(kg, &full_contribution, &shaded_throughput, state->bounce); -# endif - - if (state->bounce == 0) { - /* directly visible lighting */ - L->direct_diffuse += shaded_throughput * bsdf_eval->diffuse; - L->direct_glossy += shaded_throughput * bsdf_eval->glossy; - L->direct_transmission += shaded_throughput * bsdf_eval->transmission; - L->direct_volume += shaded_throughput * bsdf_eval->volume; - - if (is_lamp) { - L->shadow += shadow * shadow_fac; - } - } - else { - /* indirectly visible lighting after BSDF bounce */ - L->indirect += full_contribution; - } + /* Matte pass. */ + if (kernel_shadow_catcher_is_matte_path(INTEGRATOR_STATE_PASS)) { + kernel_write_pass_float3(buffer + kernel_data.film.pass_shadow_catcher_matte, contribution); + /* NOTE: Accumulate the combined pass and to the samples count pass, so that the adaptive + * sampling is based on how noisy the combined pass is as if there were no catchers in the + * scene. 
*/ } - else -#endif - { - float3 contribution = shaded_throughput * bsdf_eval->diffuse; - path_radiance_clamp(kg, &contribution, state->bounce); - L->emission += contribution; + + /* Shadow catcher pass. */ + if (kernel_shadow_catcher_is_object_pass(INTEGRATOR_STATE_PASS)) { + kernel_write_pass_float3(buffer + kernel_data.film.pass_shadow_catcher, contribution); + return true; } -} -ccl_device_inline void path_radiance_accum_total_light(PathRadiance *L, - ccl_addr_space PathState *state, - float3 throughput, - const BsdfEval *bsdf_eval) -{ -#ifdef __SHADOW_TRICKS__ - if (state->flag & PATH_RAY_STORE_SHADOW_INFO) { - L->path_total += throughput * bsdf_eval->sum_no_mis; - } -#else - (void)L; - (void)state; - (void)throughput; - (void)bsdf_eval; -#endif + return false; } -ccl_device_inline void path_radiance_accum_background(KernelGlobals *kg, - PathRadiance *L, - ccl_addr_space PathState *state, - float3 throughput, - float3 value) +ccl_device bool kernel_accum_shadow_catcher_transparent(INTEGRATOR_STATE_CONST_ARGS, + const float3 contribution, + const float transparent, + ccl_global float *ccl_restrict buffer) { + if (!kernel_data.integrator.has_shadow_catcher) { + return false; + } -#ifdef __SHADOW_TRICKS__ - if (state->flag & PATH_RAY_STORE_SHADOW_INFO) { - L->path_total += throughput * value; - L->path_total_shaded += throughput * value * L->shadow_transparency; + kernel_assert(kernel_data.film.pass_shadow_catcher != PASS_UNUSED); + kernel_assert(kernel_data.film.pass_shadow_catcher_matte != PASS_UNUSED); - if (state->flag & PATH_RAY_SHADOW_CATCHER) { - return; - } + if (INTEGRATOR_STATE(path, flag) & PATH_RAY_SHADOW_CATCHER_BACKGROUND) { + return true; } -#endif - float3 contribution = throughput * value; -#ifdef __CLAMP_SAMPLE__ - path_radiance_clamp(kg, &contribution, state->bounce - 1); -#endif + /* Matte pass. 
*/ + if (kernel_shadow_catcher_is_matte_path(INTEGRATOR_STATE_PASS)) { + kernel_write_pass_float4( + buffer + kernel_data.film.pass_shadow_catcher_matte, + make_float4(contribution.x, contribution.y, contribution.z, transparent)); + /* NOTE: Accumulate the combined pass and to the samples count pass, so that the adaptive + * sampling is based on how noisy the combined pass is as if there were no catchers in the + * scene. */ + } -#ifdef __PASSES__ - if (L->use_light_pass) { - if (state->flag & PATH_RAY_TRANSPARENT_BACKGROUND) - L->background += contribution; - else if (state->bounce == 1) - L->direct_emission += contribution; - else - L->indirect += contribution; - } - else -#endif - { - L->emission += contribution; + /* Shadow catcher pass. */ + if (kernel_shadow_catcher_is_object_pass(INTEGRATOR_STATE_PASS)) { + /* NOTE: The transparency of the shadow catcher pass is ignored. It is not needed for the + * calculation and the alpha channel of the pass contains numbers of samples contributed to a + * pixel of the pass. 
*/ + kernel_write_pass_float3(buffer + kernel_data.film.pass_shadow_catcher, contribution); + return true; } -#ifdef __DENOISING_FEATURES__ - L->denoising_albedo += state->denoising_feature_weight * state->denoising_feature_throughput * - value; -#endif /* __DENOISING_FEATURES__ */ + return false; } -ccl_device_inline void path_radiance_accum_transparent(PathRadiance *L, - ccl_addr_space PathState *state, - float3 throughput) +ccl_device void kernel_accum_shadow_catcher_transparent_only(INTEGRATOR_STATE_CONST_ARGS, + const float transparent, + ccl_global float *ccl_restrict buffer) { - L->transparent += average(throughput); -} + if (!kernel_data.integrator.has_shadow_catcher) { + return; + } -#ifdef __SHADOW_TRICKS__ -ccl_device_inline void path_radiance_accum_shadowcatcher(PathRadiance *L, - float3 throughput, - float3 background) -{ - L->shadow_throughput += average(throughput); - L->shadow_background_color += throughput * background; - L->has_shadow_catcher = 1; -} -#endif + kernel_assert(kernel_data.film.pass_shadow_catcher_matte != PASS_UNUSED); -ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L) -{ -#ifdef __PASSES__ - /* this division is a bit ugly, but means we only have to keep track of - * only a single throughput further along the path, here we recover just - * the indirect path that is not influenced by any particular BSDF type */ - if (L->use_light_pass) { - L->direct_emission = safe_divide_color(L->direct_emission, L->state.direct); - L->direct_diffuse += L->state.diffuse * L->direct_emission; - L->direct_glossy += L->state.glossy * L->direct_emission; - L->direct_transmission += L->state.transmission * L->direct_emission; - L->direct_volume += L->state.volume * L->direct_emission; - - L->indirect = safe_divide_color(L->indirect, L->state.direct); - L->indirect_diffuse += L->state.diffuse * L->indirect; - L->indirect_glossy += L->state.glossy * L->indirect; - L->indirect_transmission += L->state.transmission * L->indirect; - 
L->indirect_volume += L->state.volume * L->indirect; + /* Matte pass. */ + if (kernel_shadow_catcher_is_matte_path(INTEGRATOR_STATE_PASS)) { + kernel_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_matte + 3, transparent); } -#endif } -ccl_device_inline void path_radiance_reset_indirect(PathRadiance *L) -{ -#ifdef __PASSES__ - if (L->use_light_pass) { - L->state.diffuse = zero_float3(); - L->state.glossy = zero_float3(); - L->state.transmission = zero_float3(); - L->state.volume = zero_float3(); +#endif /* __SHADOW_CATCHER__ */ + +/* -------------------------------------------------------------------- + * Render passes. + */ - L->direct_emission = zero_float3(); - L->indirect = zero_float3(); +/* Write combined pass. */ +ccl_device_inline void kernel_accum_combined_pass(INTEGRATOR_STATE_CONST_ARGS, + const float3 contribution, + ccl_global float *ccl_restrict buffer) +{ +#ifdef __SHADOW_CATCHER__ + if (kernel_accum_shadow_catcher(INTEGRATOR_STATE_PASS, contribution, buffer)) { + return; } #endif + + if (kernel_data.film.light_pass_flag & PASSMASK(COMBINED)) { + kernel_write_pass_float3(buffer + kernel_data.film.pass_combined, contribution); + } + + kernel_accum_adaptive_buffer(INTEGRATOR_STATE_PASS, contribution, buffer); } -ccl_device_inline void path_radiance_copy_indirect(PathRadiance *L, const PathRadiance *L_src) +/* Write combined pass with transparency. 
*/ +ccl_device_inline void kernel_accum_combined_transparent_pass(INTEGRATOR_STATE_CONST_ARGS, + const float3 contribution, + const float transparent, + ccl_global float *ccl_restrict + buffer) { -#ifdef __PASSES__ - if (L->use_light_pass) { - L->state = L_src->state; - - L->direct_emission = L_src->direct_emission; - L->indirect = L_src->indirect; +#ifdef __SHADOW_CATCHER__ + if (kernel_accum_shadow_catcher_transparent( + INTEGRATOR_STATE_PASS, contribution, transparent, buffer)) { + return; } #endif + + if (kernel_data.film.light_pass_flag & PASSMASK(COMBINED)) { + kernel_write_pass_float4( + buffer + kernel_data.film.pass_combined, + make_float4(contribution.x, contribution.y, contribution.z, transparent)); + } + + kernel_accum_adaptive_buffer(INTEGRATOR_STATE_PASS, contribution, buffer); } -#ifdef __SHADOW_TRICKS__ -ccl_device_inline void path_radiance_sum_shadowcatcher(KernelGlobals *kg, - PathRadiance *L, - float3 *L_sum, - float *alpha) +/* Write background or emission to appropriate pass. */ +ccl_device_inline void kernel_accum_emission_or_background_pass(INTEGRATOR_STATE_CONST_ARGS, + float3 contribution, + ccl_global float *ccl_restrict + buffer, + const int pass) { - /* Calculate current shadow of the path. */ - float path_total = average(L->path_total); - float shadow; + if (!(kernel_data.film.light_pass_flag & PASS_ANY)) { + return; + } - if (UNLIKELY(!isfinite_safe(path_total))) { -# ifdef __KERNEL_DEBUG_NAN__ - kernel_assert(!"Non-finite total radiance along the path"); -# endif - shadow = 0.0f; +#ifdef __PASSES__ + const int path_flag = INTEGRATOR_STATE(path, flag); + int pass_offset = PASS_UNUSED; + + /* Denoising albedo. 
*/ +# ifdef __DENOISING_FEATURES__ + if (path_flag & PATH_RAY_DENOISING_FEATURES) { + if (kernel_data.film.pass_denoising_albedo != PASS_UNUSED) { + const float3 denoising_feature_throughput = INTEGRATOR_STATE(path, + denoising_feature_throughput); + const float3 denoising_albedo = denoising_feature_throughput * contribution; + kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_albedo, denoising_albedo); + } } - else if (path_total == 0.0f) { - shadow = L->shadow_transparency; +# endif /* __DENOISING_FEATURES__ */ + + if (!(path_flag & PATH_RAY_ANY_PASS)) { + /* Directly visible, write to emission or background pass. */ + pass_offset = pass; + } + else if (path_flag & (PATH_RAY_REFLECT_PASS | PATH_RAY_TRANSMISSION_PASS)) { + /* Indirectly visible through reflection. */ + const int glossy_pass_offset = (path_flag & PATH_RAY_REFLECT_PASS) ? + ((INTEGRATOR_STATE(path, bounce) == 1) ? + kernel_data.film.pass_glossy_direct : + kernel_data.film.pass_glossy_indirect) : + ((INTEGRATOR_STATE(path, bounce) == 1) ? + kernel_data.film.pass_transmission_direct : + kernel_data.film.pass_transmission_indirect); + + if (glossy_pass_offset != PASS_UNUSED) { + /* Glossy is a subset of the throughput, reconstruct it here using the + * diffuse-glossy ratio. */ + const float3 ratio = INTEGRATOR_STATE(path, diffuse_glossy_ratio); + const float3 glossy_contribution = (one_float3() - ratio) * contribution; + kernel_write_pass_float3(buffer + glossy_pass_offset, glossy_contribution); + } + + /* Reconstruct diffuse subset of throughput. */ + pass_offset = (INTEGRATOR_STATE(path, bounce) == 1) ? kernel_data.film.pass_diffuse_direct : + kernel_data.film.pass_diffuse_indirect; + if (pass_offset != PASS_UNUSED) { + contribution *= INTEGRATOR_STATE(path, diffuse_glossy_ratio); + } } - else { - float path_total_shaded = average(L->path_total_shaded); - shadow = path_total_shaded / path_total; + else if (path_flag & PATH_RAY_VOLUME_PASS) { + /* Indirectly visible through volume. 
*/ + pass_offset = (INTEGRATOR_STATE(path, bounce) == 1) ? kernel_data.film.pass_volume_direct : + kernel_data.film.pass_volume_indirect; } - /* Calculate final light sum and transparency for shadow catcher object. */ - if (kernel_data.background.transparent) { - *alpha -= L->shadow_throughput * shadow; - } - else { - L->shadow_background_color *= shadow; - *L_sum += L->shadow_background_color; + /* Single write call for GPU coherence. */ + if (pass_offset != PASS_UNUSED) { + kernel_write_pass_float3(buffer + pass_offset, contribution); } +#endif /* __PASSES__ */ } -#endif -ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, - PathRadiance *L, - float *alpha) +/* Write light contribution to render buffer. */ +ccl_device_inline void kernel_accum_light(INTEGRATOR_STATE_CONST_ARGS, + ccl_global float *ccl_restrict render_buffer) { - float3 L_sum; - /* Light Passes are used */ + /* The throughput for shadow paths already contains the light shader evaluation. */ + float3 contribution = INTEGRATOR_STATE(shadow_path, throughput); + kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(shadow_path, bounce) - 1); + + ccl_global float *buffer = kernel_accum_pixel_render_buffer(INTEGRATOR_STATE_PASS, + render_buffer); + + kernel_accum_combined_pass(INTEGRATOR_STATE_PASS, contribution, buffer); + #ifdef __PASSES__ - float3 L_direct, L_indirect; - if (L->use_light_pass) { - path_radiance_sum_indirect(L); - - L_direct = L->direct_diffuse + L->direct_glossy + L->direct_transmission + L->direct_volume + - L->emission; - L_indirect = L->indirect_diffuse + L->indirect_glossy + L->indirect_transmission + - L->indirect_volume; - - if (!kernel_data.background.transparent) - L_direct += L->background; - - L_sum = L_direct + L_indirect; - float sum = fabsf((L_sum).x) + fabsf((L_sum).y) + fabsf((L_sum).z); - - /* Reject invalid value */ - if (!isfinite_safe(sum)) { -# ifdef __KERNEL_DEBUG_NAN__ - kernel_assert(!"Non-finite sum in path_radiance_clamp_and_sum!"); -# 
endif - L_sum = zero_float3(); - - L->direct_diffuse = zero_float3(); - L->direct_glossy = zero_float3(); - L->direct_transmission = zero_float3(); - L->direct_volume = zero_float3(); - - L->indirect_diffuse = zero_float3(); - L->indirect_glossy = zero_float3(); - L->indirect_transmission = zero_float3(); - L->indirect_volume = zero_float3(); - - L->emission = zero_float3(); + if (kernel_data.film.light_pass_flag & PASS_ANY) { + const int path_flag = INTEGRATOR_STATE(shadow_path, flag); + int pass_offset = PASS_UNUSED; + + if (path_flag & (PATH_RAY_REFLECT_PASS | PATH_RAY_TRANSMISSION_PASS)) { + /* Indirectly visible through reflection. */ + const int glossy_pass_offset = (path_flag & PATH_RAY_REFLECT_PASS) ? + ((INTEGRATOR_STATE(shadow_path, bounce) == 0) ? + kernel_data.film.pass_glossy_direct : + kernel_data.film.pass_glossy_indirect) : + ((INTEGRATOR_STATE(shadow_path, bounce) == 0) ? + kernel_data.film.pass_transmission_direct : + kernel_data.film.pass_transmission_indirect); + + if (glossy_pass_offset != PASS_UNUSED) { + /* Glossy is a subset of the throughput, reconstruct it here using the + * diffuse-glossy ratio. */ + const float3 ratio = INTEGRATOR_STATE(shadow_path, diffuse_glossy_ratio); + const float3 glossy_contribution = (one_float3() - ratio) * contribution; + kernel_write_pass_float3(buffer + glossy_pass_offset, glossy_contribution); + } + + /* Reconstruct diffuse subset of throughput. */ + pass_offset = (INTEGRATOR_STATE(shadow_path, bounce) == 0) ? + kernel_data.film.pass_diffuse_direct : + kernel_data.film.pass_diffuse_indirect; + if (pass_offset != PASS_UNUSED) { + contribution *= INTEGRATOR_STATE(shadow_path, diffuse_glossy_ratio); + } + } + else if (path_flag & PATH_RAY_VOLUME_PASS) { + /* Indirectly visible through volume. */ + pass_offset = (INTEGRATOR_STATE(shadow_path, bounce) == 0) ? 
+ kernel_data.film.pass_volume_direct : + kernel_data.film.pass_volume_indirect; } - } - /* No Light Passes */ - else -#endif - { - L_sum = L->emission; + /* Single write call for GPU coherence. */ + if (pass_offset != PASS_UNUSED) { + kernel_write_pass_float3(buffer + pass_offset, contribution); + } - /* Reject invalid value */ - float sum = fabsf((L_sum).x) + fabsf((L_sum).y) + fabsf((L_sum).z); - if (!isfinite_safe(sum)) { -#ifdef __KERNEL_DEBUG_NAN__ - kernel_assert(!"Non-finite final sum in path_radiance_clamp_and_sum!"); -#endif - L_sum = zero_float3(); + /* Write shadow pass. */ + if (kernel_data.film.pass_shadow != PASS_UNUSED && (path_flag & PATH_RAY_SHADOW_FOR_LIGHT) && + (path_flag & PATH_RAY_CAMERA)) { + const float3 unshadowed_throughput = INTEGRATOR_STATE(shadow_path, unshadowed_throughput); + const float3 shadowed_throughput = INTEGRATOR_STATE(shadow_path, throughput); + const float3 shadow = safe_divide_float3_float3(shadowed_throughput, unshadowed_throughput) * + kernel_data.film.pass_shadow_scale; + kernel_write_pass_float3(buffer + kernel_data.film.pass_shadow, shadow); } } +#endif +} - /* Compute alpha. */ - *alpha = 1.0f - L->transparent; +/* Write transparency to render buffer. + * + * Note that we accumulate transparency = 1 - alpha in the render buffer. + * Otherwise we'd have to write alpha on path termination, which happens + * in many places. */ +ccl_device_inline void kernel_accum_transparent(INTEGRATOR_STATE_CONST_ARGS, + const float transparent, + ccl_global float *ccl_restrict render_buffer) +{ + ccl_global float *buffer = kernel_accum_pixel_render_buffer(INTEGRATOR_STATE_PASS, + render_buffer); - /* Add shadow catcher contributions. 
*/ -#ifdef __SHADOW_TRICKS__ - if (L->has_shadow_catcher) { - path_radiance_sum_shadowcatcher(kg, L, &L_sum, alpha); + if (kernel_data.film.light_pass_flag & PASSMASK(COMBINED)) { + kernel_write_pass_float(buffer + kernel_data.film.pass_combined + 3, transparent); } -#endif /* __SHADOW_TRICKS__ */ - return L_sum; + kernel_accum_shadow_catcher_transparent_only(INTEGRATOR_STATE_PASS, transparent, buffer); } -ccl_device_inline void path_radiance_split_denoising(KernelGlobals *kg, - PathRadiance *L, - float3 *noisy, - float3 *clean) +/* Write background contribution to render buffer. + * + * Includes transparency, matching kernel_accum_transparent. */ +ccl_device_inline void kernel_accum_background(INTEGRATOR_STATE_CONST_ARGS, + const float3 L, + const float transparent, + const bool is_transparent_background_ray, + ccl_global float *ccl_restrict render_buffer) { -#ifdef __PASSES__ - kernel_assert(L->use_light_pass); - - *clean = L->emission + L->background; - *noisy = L->direct_volume + L->indirect_volume; - -# define ADD_COMPONENT(flag, component) \ - if (kernel_data.film.denoising_flags & flag) \ - *clean += component; \ - else \ - *noisy += component; - - ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_DIR, L->direct_diffuse); - ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_IND, L->indirect_diffuse); - ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_DIR, L->direct_glossy); - ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_IND, L->indirect_glossy); - ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_DIR, L->direct_transmission); - ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_IND, L->indirect_transmission); -# undef ADD_COMPONENT -#else - *noisy = L->emission; - *clean = zero_float3(); -#endif + float3 contribution = INTEGRATOR_STATE(path, throughput) * L; + kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(path, bounce) - 1); -#ifdef __SHADOW_TRICKS__ - if (L->has_shadow_catcher) { - *noisy += L->shadow_background_color; - } -#endif + ccl_global float *buffer = 
kernel_accum_pixel_render_buffer(INTEGRATOR_STATE_PASS, + render_buffer); - *noisy = ensure_finite3(*noisy); - *clean = ensure_finite3(*clean); + if (is_transparent_background_ray) { + kernel_accum_transparent(INTEGRATOR_STATE_PASS, transparent, render_buffer); + } + else { + kernel_accum_combined_transparent_pass( + INTEGRATOR_STATE_PASS, contribution, transparent, buffer); + } + kernel_accum_emission_or_background_pass( + INTEGRATOR_STATE_PASS, contribution, buffer, kernel_data.film.pass_background); } -ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance *L_sample) +/* Write emission to render buffer. */ +ccl_device_inline void kernel_accum_emission(INTEGRATOR_STATE_CONST_ARGS, + const float3 throughput, + const float3 L, + ccl_global float *ccl_restrict render_buffer) { -#ifdef __SPLIT_KERNEL__ -# define safe_float3_add(f, v) \ - do { \ - ccl_global float *p = (ccl_global float *)(&(f)); \ - atomic_add_and_fetch_float(p + 0, (v).x); \ - atomic_add_and_fetch_float(p + 1, (v).y); \ - atomic_add_and_fetch_float(p + 2, (v).z); \ - } while (0) -# define safe_float_add(f, v) atomic_add_and_fetch_float(&(f), (v)) -#else -# define safe_float3_add(f, v) (f) += (v) -# define safe_float_add(f, v) (f) += (v) -#endif /* __SPLIT_KERNEL__ */ + float3 contribution = throughput * L; + kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(path, bounce) - 1); -#ifdef __PASSES__ - safe_float3_add(L->direct_diffuse, L_sample->direct_diffuse); - safe_float3_add(L->direct_glossy, L_sample->direct_glossy); - safe_float3_add(L->direct_transmission, L_sample->direct_transmission); - safe_float3_add(L->direct_volume, L_sample->direct_volume); - - safe_float3_add(L->indirect_diffuse, L_sample->indirect_diffuse); - safe_float3_add(L->indirect_glossy, L_sample->indirect_glossy); - safe_float3_add(L->indirect_transmission, L_sample->indirect_transmission); - safe_float3_add(L->indirect_volume, L_sample->indirect_volume); - - safe_float3_add(L->background, 
L_sample->background); - safe_float3_add(L->ao, L_sample->ao); - safe_float3_add(L->shadow, L_sample->shadow); - safe_float_add(L->mist, L_sample->mist); -#endif /* __PASSES__ */ - safe_float3_add(L->emission, L_sample->emission); + ccl_global float *buffer = kernel_accum_pixel_render_buffer(INTEGRATOR_STATE_PASS, + render_buffer); -#undef safe_float_add -#undef safe_float3_add + kernel_accum_combined_pass(INTEGRATOR_STATE_PASS, contribution, buffer); + kernel_accum_emission_or_background_pass( + INTEGRATOR_STATE_PASS, contribution, buffer, kernel_data.film.pass_emission); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_adaptive_sampling.h b/intern/cycles/kernel/kernel_adaptive_sampling.h index 98b7bf7e7dc..2bee12f0473 100644 --- a/intern/cycles/kernel/kernel_adaptive_sampling.h +++ b/intern/cycles/kernel/kernel_adaptive_sampling.h @@ -14,226 +14,146 @@ * limitations under the License. */ -#ifndef __KERNEL_ADAPTIVE_SAMPLING_H__ -#define __KERNEL_ADAPTIVE_SAMPLING_H__ +#pragma once + +#include "kernel/kernel_write_passes.h" CCL_NAMESPACE_BEGIN -/* Determines whether to continue sampling a given pixel or if it has sufficiently converged. */ +/* Check whether the pixel has converged and should not be sampled anymore. */ -ccl_device void kernel_do_adaptive_stopping(KernelGlobals *kg, - ccl_global float *buffer, - int sample) +ccl_device_forceinline bool kernel_need_sample_pixel(INTEGRATOR_STATE_CONST_ARGS, + ccl_global float *render_buffer) { - /* TODO Stefan: Is this better in linear, sRGB or something else? */ - float4 I = *((ccl_global float4 *)buffer); - float4 A = *(ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer); - /* The per pixel error as seen in section 2.1 of - * "A hierarchical automatic stopping condition for Monte Carlo global illumination" - * A small epsilon is added to the divisor to prevent division by zero. 
*/ - float error = (fabsf(I.x - A.x) + fabsf(I.y - A.y) + fabsf(I.z - A.z)) / - (sample * 0.0001f + sqrtf(I.x + I.y + I.z)); - if (error < kernel_data.integrator.adaptive_threshold * (float)sample) { - /* Set the fourth component to non-zero value to indicate that this pixel has converged. */ - buffer[kernel_data.film.pass_adaptive_aux_buffer + 3] += 1.0f; + if (kernel_data.film.pass_adaptive_aux_buffer == PASS_UNUSED) { + return true; } -} - -/* Adjust the values of an adaptively sampled pixel. */ - -ccl_device void kernel_adaptive_post_adjust(KernelGlobals *kg, - ccl_global float *buffer, - float sample_multiplier) -{ - *(ccl_global float4 *)(buffer) *= sample_multiplier; - /* Scale the aux pass too, this is necessary for progressive rendering to work properly. */ - kernel_assert(kernel_data.film.pass_adaptive_aux_buffer); - *(ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer) *= sample_multiplier; + const uint32_t render_pixel_index = INTEGRATOR_STATE(path, render_pixel_index); + const uint64_t render_buffer_offset = (uint64_t)render_pixel_index * + kernel_data.film.pass_stride; + ccl_global float *buffer = render_buffer + render_buffer_offset; -#ifdef __PASSES__ - int flag = kernel_data.film.pass_flag; - - if (flag & PASSMASK(NORMAL)) - *(ccl_global float3 *)(buffer + kernel_data.film.pass_normal) *= sample_multiplier; + const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3; + return buffer[aux_w_offset] == 0.0f; +} - if (flag & PASSMASK(UV)) - *(ccl_global float3 *)(buffer + kernel_data.film.pass_uv) *= sample_multiplier; +/* Determines whether to continue sampling a given pixel or if it has sufficiently converged. 
 */
 
-  if (flag & PASSMASK(MOTION)) {
-    *(ccl_global float4 *)(buffer + kernel_data.film.pass_motion) *= sample_multiplier;
-    *(ccl_global float *)(buffer + kernel_data.film.pass_motion_weight) *= sample_multiplier;
+ccl_device bool kernel_adaptive_sampling_convergence_check(const KernelGlobals *kg,
+                                                           ccl_global float *render_buffer,
+                                                           int x,
+                                                           int y,
+                                                           float threshold,
+                                                           bool reset,
+                                                           int offset,
+                                                           int stride)
+{
+  kernel_assert(kernel_data.film.pass_adaptive_aux_buffer != PASS_UNUSED);
+  kernel_assert(kernel_data.film.pass_sample_count != PASS_UNUSED);
+
+  const int render_pixel_index = offset + x + y * stride;
+  ccl_global float *buffer = render_buffer +
+                             (uint64_t)render_pixel_index * kernel_data.film.pass_stride;
+
+  /* TODO(Stefan): Is this better in linear, sRGB or something else? */
+
+  const float4 A = kernel_read_pass_float4(buffer + kernel_data.film.pass_adaptive_aux_buffer);
+  if (!reset && A.w != 0.0f) {
+    /* If the pixel was considered converged, its state will not change in this kernel. Early
+     * output before doing any math.
+     *
+     * TODO(sergey): On a GPU it might be better to keep thread alive for better coherency? */
+    return true;
+  }
 
-  if (kernel_data.film.use_light_pass) {
-    int light_flag = kernel_data.film.light_pass_flag;
-
-    if (light_flag & PASSMASK(MIST))
-      *(ccl_global float *)(buffer + kernel_data.film.pass_mist) *= sample_multiplier;
-
-    /* Shadow pass omitted on purpose. It has its own scale parameter. 
*/ - - if (light_flag & PASSMASK(DIFFUSE_INDIRECT)) - *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_indirect) *= sample_multiplier; - if (light_flag & PASSMASK(GLOSSY_INDIRECT)) - *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_indirect) *= sample_multiplier; - if (light_flag & PASSMASK(TRANSMISSION_INDIRECT)) - *(ccl_global float3 *)(buffer + - kernel_data.film.pass_transmission_indirect) *= sample_multiplier; - if (light_flag & PASSMASK(VOLUME_INDIRECT)) - *(ccl_global float3 *)(buffer + kernel_data.film.pass_volume_indirect) *= sample_multiplier; - if (light_flag & PASSMASK(DIFFUSE_DIRECT)) - *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_direct) *= sample_multiplier; - if (light_flag & PASSMASK(GLOSSY_DIRECT)) - *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_direct) *= sample_multiplier; - if (light_flag & PASSMASK(TRANSMISSION_DIRECT)) - *(ccl_global float3 *)(buffer + - kernel_data.film.pass_transmission_direct) *= sample_multiplier; - if (light_flag & PASSMASK(VOLUME_DIRECT)) - *(ccl_global float3 *)(buffer + kernel_data.film.pass_volume_direct) *= sample_multiplier; - - if (light_flag & PASSMASK(EMISSION)) - *(ccl_global float3 *)(buffer + kernel_data.film.pass_emission) *= sample_multiplier; - if (light_flag & PASSMASK(BACKGROUND)) - *(ccl_global float3 *)(buffer + kernel_data.film.pass_background) *= sample_multiplier; - if (light_flag & PASSMASK(AO)) - *(ccl_global float3 *)(buffer + kernel_data.film.pass_ao) *= sample_multiplier; - - if (light_flag & PASSMASK(DIFFUSE_COLOR)) - *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_color) *= sample_multiplier; - if (light_flag & PASSMASK(GLOSSY_COLOR)) - *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_color) *= sample_multiplier; - if (light_flag & PASSMASK(TRANSMISSION_COLOR)) - *(ccl_global float3 *)(buffer + - kernel_data.film.pass_transmission_color) *= sample_multiplier; - } -#endif - -#ifdef __DENOISING_FEATURES__ - -# 
define scale_float3_variance(buffer, offset, scale) \ - *(buffer + offset) *= scale; \ - *(buffer + offset + 1) *= scale; \ - *(buffer + offset + 2) *= scale; \ - *(buffer + offset + 3) *= scale * scale; \ - *(buffer + offset + 4) *= scale * scale; \ - *(buffer + offset + 5) *= scale * scale; - -# define scale_shadow_variance(buffer, offset, scale) \ - *(buffer + offset) *= scale; \ - *(buffer + offset + 1) *= scale; \ - *(buffer + offset + 2) *= scale * scale; - - if (kernel_data.film.pass_denoising_data) { - scale_shadow_variance( - buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_SHADOW_A, sample_multiplier); - scale_shadow_variance( - buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_SHADOW_B, sample_multiplier); - if (kernel_data.film.pass_denoising_clean) { - scale_float3_variance( - buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, sample_multiplier); - *(buffer + kernel_data.film.pass_denoising_clean) *= sample_multiplier; - *(buffer + kernel_data.film.pass_denoising_clean + 1) *= sample_multiplier; - *(buffer + kernel_data.film.pass_denoising_clean + 2) *= sample_multiplier; - } - else { - scale_float3_variance( - buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, sample_multiplier); - } - scale_float3_variance( - buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL, sample_multiplier); - scale_float3_variance( - buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO, sample_multiplier); - *(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH) *= sample_multiplier; - *(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH + - 1) *= sample_multiplier * sample_multiplier; - } -#endif /* __DENOISING_FEATURES__ */ - - /* Cryptomatte. */ - if (kernel_data.film.cryptomatte_passes) { - int num_slots = 0; - num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) ? 
1 : 0; - num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) ? 1 : 0; - num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_ASSET) ? 1 : 0; - num_slots = num_slots * 2 * kernel_data.film.cryptomatte_depth; - ccl_global float2 *id_buffer = (ccl_global float2 *)(buffer + - kernel_data.film.pass_cryptomatte); - for (int slot = 0; slot < num_slots; slot++) { - id_buffer[slot].y *= sample_multiplier; - } - } + const float4 I = kernel_read_pass_float4(buffer + kernel_data.film.pass_combined); - /* AOVs. */ - for (int i = 0; i < kernel_data.film.pass_aov_value_num; i++) { - *(buffer + kernel_data.film.pass_aov_value + i) *= sample_multiplier; - } - for (int i = 0; i < kernel_data.film.pass_aov_color_num; i++) { - *((ccl_global float4 *)(buffer + kernel_data.film.pass_aov_color) + i) *= sample_multiplier; - } + const float sample = __float_as_uint(buffer[kernel_data.film.pass_sample_count]); + const float inv_sample = 1.0f / sample; + + /* The per pixel error as seen in section 2.1 of + * "A hierarchical automatic stopping condition for Monte Carlo global illumination" */ + const float error_difference = (fabsf(I.x - A.x) + fabsf(I.y - A.y) + fabsf(I.z - A.z)) * + inv_sample; + const float error_normalize = sqrtf((I.x + I.y + I.z) * inv_sample); + /* A small epsilon is added to the divisor to prevent division by zero. */ + const float error = error_difference / (0.0001f + error_normalize); + const bool did_converge = (error < threshold); + + const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3; + buffer[aux_w_offset] = did_converge; + + return did_converge; } /* This is a simple box filter in two passes. * When a pixel demands more adaptive samples, let its neighboring pixels draw more samples too. 
*/ -ccl_device bool kernel_do_adaptive_filter_x(KernelGlobals *kg, int y, ccl_global WorkTile *tile) +ccl_device void kernel_adaptive_sampling_filter_x(const KernelGlobals *kg, + ccl_global float *render_buffer, + int y, + int start_x, + int width, + int offset, + int stride) { - bool any = false; + kernel_assert(kernel_data.film.pass_adaptive_aux_buffer != PASS_UNUSED); + bool prev = false; - for (int x = tile->x; x < tile->x + tile->w; ++x) { - int index = tile->offset + x + y * tile->stride; - ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride; - ccl_global float4 *aux = (ccl_global float4 *)(buffer + - kernel_data.film.pass_adaptive_aux_buffer); - if ((*aux).w == 0.0f) { - any = true; - if (x > tile->x && !prev) { + for (int x = start_x; x < start_x + width; ++x) { + int index = offset + x + y * stride; + ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride; + const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3; + + if (buffer[aux_w_offset] == 0.0f) { + if (x > start_x && !prev) { index = index - 1; - buffer = tile->buffer + index * kernel_data.film.pass_stride; - aux = (ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer); - (*aux).w = 0.0f; + buffer = render_buffer + index * kernel_data.film.pass_stride; + buffer[aux_w_offset] = 0.0f; } prev = true; } else { if (prev) { - (*aux).w = 0.0f; + buffer[aux_w_offset] = 0.0f; } prev = false; } } - return any; } -ccl_device bool kernel_do_adaptive_filter_y(KernelGlobals *kg, int x, ccl_global WorkTile *tile) +ccl_device void kernel_adaptive_sampling_filter_y(const KernelGlobals *kg, + ccl_global float *render_buffer, + int x, + int start_y, + int height, + int offset, + int stride) { + kernel_assert(kernel_data.film.pass_adaptive_aux_buffer != PASS_UNUSED); + bool prev = false; - bool any = false; - for (int y = tile->y; y < tile->y + tile->h; ++y) { - int index = tile->offset + x + y * tile->stride; - ccl_global float 
*buffer = tile->buffer + index * kernel_data.film.pass_stride; - ccl_global float4 *aux = (ccl_global float4 *)(buffer + - kernel_data.film.pass_adaptive_aux_buffer); - if ((*aux).w == 0.0f) { - any = true; - if (y > tile->y && !prev) { - index = index - tile->stride; - buffer = tile->buffer + index * kernel_data.film.pass_stride; - aux = (ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer); - (*aux).w = 0.0f; + for (int y = start_y; y < start_y + height; ++y) { + int index = offset + x + y * stride; + ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride; + const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3; + + if (buffer[aux_w_offset] == 0.0f) { + if (y > start_y && !prev) { + index = index - stride; + buffer = render_buffer + index * kernel_data.film.pass_stride; + buffer[aux_w_offset] = 0.0f; } prev = true; } else { if (prev) { - (*aux).w = 0.0f; + buffer[aux_w_offset] = 0.0f; } prev = false; } } - return any; } CCL_NAMESPACE_END - -#endif /* __KERNEL_ADAPTIVE_SAMPLING_H__ */ diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h index 7da890b908d..e025bcd6674 100644 --- a/intern/cycles/kernel/kernel_bake.h +++ b/intern/cycles/kernel/kernel_bake.h @@ -14,502 +14,62 @@ * limitations under the License. */ -CCL_NAMESPACE_BEGIN - -#ifdef __BAKING__ - -ccl_device_noinline void compute_light_pass( - KernelGlobals *kg, ShaderData *sd, PathRadiance *L, uint rng_hash, int pass_filter, int sample) -{ - kernel_assert(kernel_data.film.use_light_pass); - - float3 throughput = one_float3(); - - /* Emission and indirect shader data memory used by various functions. */ - ShaderDataTinyStorage emission_sd_storage; - ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage); - ShaderData indirect_sd; - - /* Init radiance. */ - path_radiance_init(kg, L); - - /* Init path state. 
*/ - PathState state; - path_state_init(kg, emission_sd, &state, rng_hash, sample, NULL); - - /* Evaluate surface shader. */ - shader_eval_surface(kg, sd, &state, NULL, state.flag); - - /* TODO: disable more closures we don't need besides transparent. */ - shader_bsdf_disable_transparency(kg, sd); - - /* Init ray. */ - Ray ray; - ray.P = sd->P + sd->Ng; - ray.D = -sd->Ng; - ray.t = FLT_MAX; -# ifdef __CAMERA_MOTION__ - ray.time = 0.5f; -# endif - -# ifdef __BRANCHED_PATH__ - if (!kernel_data.integrator.branched) { - /* regular path tracer */ -# endif - - /* sample ambient occlusion */ - if (pass_filter & BAKE_FILTER_AO) { - kernel_path_ao(kg, sd, emission_sd, L, &state, throughput, shader_bsdf_alpha(kg, sd)); - } - - /* sample emission */ - if ((pass_filter & BAKE_FILTER_EMISSION) && (sd->flag & SD_EMISSION)) { - float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf); - path_radiance_accum_emission(kg, L, &state, throughput, emission); - } - - bool is_sss_sample = false; - -# ifdef __SUBSURFACE__ - /* sample subsurface scattering */ - if ((pass_filter & BAKE_FILTER_DIFFUSE) && (sd->flag & SD_BSSRDF)) { - /* When mixing BSSRDF and BSDF closures we should skip BSDF lighting - * if scattering was successful. 
*/ - SubsurfaceIndirectRays ss_indirect; - kernel_path_subsurface_init_indirect(&ss_indirect); - if (kernel_path_subsurface_scatter( - kg, sd, emission_sd, L, &state, &ray, &throughput, &ss_indirect)) { - while (ss_indirect.num_rays) { - kernel_path_subsurface_setup_indirect(kg, &ss_indirect, &state, &ray, L, &throughput); - kernel_path_indirect( - kg, &indirect_sd, emission_sd, &ray, throughput, &state, L, sd->object); - } - is_sss_sample = true; - } - } -# endif - - /* sample light and BSDF */ - if (!is_sss_sample && (pass_filter & (BAKE_FILTER_DIRECT | BAKE_FILTER_INDIRECT))) { - kernel_path_surface_connect_light(kg, sd, emission_sd, throughput, &state, L); - - if (kernel_path_surface_bounce(kg, sd, &throughput, &state, &L->state, &ray)) { -# ifdef __LAMP_MIS__ - state.ray_t = 0.0f; -# endif - /* compute indirect light */ - kernel_path_indirect( - kg, &indirect_sd, emission_sd, &ray, throughput, &state, L, sd->object); - - /* sum and reset indirect light pass variables for the next samples */ - path_radiance_sum_indirect(L); - path_radiance_reset_indirect(L); - } - } -# ifdef __BRANCHED_PATH__ - } - else { - /* branched path tracer */ - - /* sample ambient occlusion */ - if (pass_filter & BAKE_FILTER_AO) { - kernel_branched_path_ao(kg, sd, emission_sd, L, &state, throughput); - } - - /* sample emission */ - if ((pass_filter & BAKE_FILTER_EMISSION) && (sd->flag & SD_EMISSION)) { - float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf); - path_radiance_accum_emission(kg, L, &state, throughput, emission); - } - -# ifdef __SUBSURFACE__ - /* sample subsurface scattering */ - if ((pass_filter & BAKE_FILTER_DIFFUSE) && (sd->flag & SD_BSSRDF)) { - /* When mixing BSSRDF and BSDF closures we should skip BSDF lighting - * if scattering was successful. 
*/ - kernel_branched_path_subsurface_scatter( - kg, sd, &indirect_sd, emission_sd, L, &state, &ray, throughput); - } -# endif - - /* sample light and BSDF */ - if (pass_filter & (BAKE_FILTER_DIRECT | BAKE_FILTER_INDIRECT)) { -# if defined(__EMISSION__) - /* direct light */ - if (kernel_data.integrator.use_direct_light) { - int all = kernel_data.integrator.sample_all_lights_direct; - kernel_branched_path_surface_connect_light( - kg, sd, emission_sd, &state, throughput, 1.0f, L, all); - } -# endif - - /* indirect light */ - kernel_branched_path_surface_indirect_light( - kg, sd, &indirect_sd, emission_sd, throughput, 1.0f, &state, L); - } - } -# endif -} - -/* this helps with AA but it's not the real solution as it does not AA the geometry - * but it's better than nothing, thus committed */ -ccl_device_inline float bake_clamp_mirror_repeat(float u, float max) -{ - /* use mirror repeat (like opengl texture) so that if the barycentric - * coordinate goes past the end of the triangle it is not always clamped - * to the same value, gives ugly patterns */ - u /= max; - float fu = floorf(u); - u = u - fu; - - return ((((int)fu) & 1) ? 
1.0f - u : u) * max; -} - -ccl_device_inline float3 kernel_bake_shader_bsdf(KernelGlobals *kg, - ShaderData *sd, - const ShaderEvalType type) -{ - switch (type) { - case SHADER_EVAL_DIFFUSE: - return shader_bsdf_diffuse(kg, sd); - case SHADER_EVAL_GLOSSY: - return shader_bsdf_glossy(kg, sd); - case SHADER_EVAL_TRANSMISSION: - return shader_bsdf_transmission(kg, sd); - default: - kernel_assert(!"Unknown bake type passed to BSDF evaluate"); - return zero_float3(); - } -} - -ccl_device float3 kernel_bake_evaluate_direct_indirect(KernelGlobals *kg, - ShaderData *sd, - PathState *state, - float3 direct, - float3 indirect, - const ShaderEvalType type, - const int pass_filter) -{ - float3 color; - const bool is_color = (pass_filter & BAKE_FILTER_COLOR) != 0; - const bool is_direct = (pass_filter & BAKE_FILTER_DIRECT) != 0; - const bool is_indirect = (pass_filter & BAKE_FILTER_INDIRECT) != 0; - float3 out = zero_float3(); - - if (is_color) { - if (is_direct || is_indirect) { - /* Leave direct and diffuse channel colored. */ - color = one_float3(); - } - else { - /* surface color of the pass only */ - shader_eval_surface(kg, sd, state, NULL, 0); - return kernel_bake_shader_bsdf(kg, sd, type); - } - } - else { - shader_eval_surface(kg, sd, state, NULL, 0); - color = kernel_bake_shader_bsdf(kg, sd, type); - } - - if (is_direct) { - out += safe_divide_even_color(direct, color); - } - - if (is_indirect) { - out += safe_divide_even_color(indirect, color); - } - - return out; -} - -ccl_device void kernel_bake_evaluate( - KernelGlobals *kg, ccl_global float *buffer, int sample, int x, int y, int offset, int stride) -{ - /* Setup render buffers. 
*/ - const int index = offset + x + y * stride; - const int pass_stride = kernel_data.film.pass_stride; - buffer += index * pass_stride; - - ccl_global float *primitive = buffer + kernel_data.film.pass_bake_primitive; - ccl_global float *differential = buffer + kernel_data.film.pass_bake_differential; - ccl_global float *output = buffer + kernel_data.film.pass_combined; - - int seed = __float_as_uint(primitive[0]); - int prim = __float_as_uint(primitive[1]); - if (prim == -1) - return; - - prim += kernel_data.bake.tri_offset; - - /* Random number generator. */ - uint rng_hash = hash_uint(seed) ^ kernel_data.integrator.seed; - int num_samples = kernel_data.integrator.aa_samples; - - float filter_x, filter_y; - if (sample == 0) { - filter_x = filter_y = 0.5f; - } - else { - path_rng_2D(kg, rng_hash, sample, num_samples, PRNG_FILTER_U, &filter_x, &filter_y); - } - - /* Barycentric UV with sub-pixel offset. */ - float u = primitive[2]; - float v = primitive[3]; - - float dudx = differential[0]; - float dudy = differential[1]; - float dvdx = differential[2]; - float dvdy = differential[3]; - - if (sample > 0) { - u = bake_clamp_mirror_repeat(u + dudx * (filter_x - 0.5f) + dudy * (filter_y - 0.5f), 1.0f); - v = bake_clamp_mirror_repeat(v + dvdx * (filter_x - 0.5f) + dvdy * (filter_y - 0.5f), - 1.0f - u); - } - - /* Shader data setup. */ - int object = kernel_data.bake.object_index; - int shader; - float3 P, Ng; - - triangle_point_normal(kg, object, prim, u, v, &P, &Ng, &shader); - - ShaderData sd; - shader_setup_from_sample( - kg, - &sd, - P, - Ng, - Ng, - shader, - object, - prim, - u, - v, - 1.0f, - 0.5f, - !(kernel_tex_fetch(__object_flag, object) & SD_OBJECT_TRANSFORM_APPLIED), - LAMP_NONE); - sd.I = sd.N; - - /* Setup differentials. */ - sd.dP.dx = sd.dPdu * dudx + sd.dPdv * dvdx; - sd.dP.dy = sd.dPdu * dudy + sd.dPdv * dvdy; - sd.du.dx = dudx; - sd.du.dy = dudy; - sd.dv.dx = dvdx; - sd.dv.dy = dvdy; - - /* Set RNG state for shaders that use sampling. 
*/ - PathState state = {0}; - state.rng_hash = rng_hash; - state.rng_offset = 0; - state.sample = sample; - state.num_samples = num_samples; - state.min_ray_pdf = FLT_MAX; - - /* Light passes if we need more than color. */ - PathRadiance L; - int pass_filter = kernel_data.bake.pass_filter; - - if (kernel_data.bake.pass_filter & ~BAKE_FILTER_COLOR) - compute_light_pass(kg, &sd, &L, rng_hash, pass_filter, sample); - - float3 out = zero_float3(); - - ShaderEvalType type = (ShaderEvalType)kernel_data.bake.type; - switch (type) { - /* data passes */ - case SHADER_EVAL_NORMAL: - case SHADER_EVAL_ROUGHNESS: - case SHADER_EVAL_EMISSION: { - if (type != SHADER_EVAL_NORMAL || (sd.flag & SD_HAS_BUMP)) { - int path_flag = (type == SHADER_EVAL_EMISSION) ? PATH_RAY_EMISSION : 0; - shader_eval_surface(kg, &sd, &state, NULL, path_flag); - } - - if (type == SHADER_EVAL_NORMAL) { - float3 N = sd.N; - if (sd.flag & SD_HAS_BUMP) { - N = shader_bsdf_average_normal(kg, &sd); - } +#pragma once - /* encoding: normal = (2 * color) - 1 */ - out = N * 0.5f + make_float3(0.5f, 0.5f, 0.5f); - } - else if (type == SHADER_EVAL_ROUGHNESS) { - float roughness = shader_bsdf_average_roughness(&sd); - out = make_float3(roughness, roughness, roughness); - } - else { - out = shader_emissive_eval(&sd); - } - break; - } - case SHADER_EVAL_UV: { - out = primitive_uv(kg, &sd); - break; - } -# ifdef __PASSES__ - /* light passes */ - case SHADER_EVAL_AO: { - out = L.ao; - break; - } - case SHADER_EVAL_COMBINED: { - if ((pass_filter & BAKE_FILTER_COMBINED) == BAKE_FILTER_COMBINED) { - float alpha; - out = path_radiance_clamp_and_sum(kg, &L, &alpha); - break; - } +#include "kernel/kernel_differential.h" +#include "kernel/kernel_projection.h" +#include "kernel/kernel_shader.h" - if ((pass_filter & BAKE_FILTER_DIFFUSE_DIRECT) == BAKE_FILTER_DIFFUSE_DIRECT) - out += L.direct_diffuse; - if ((pass_filter & BAKE_FILTER_DIFFUSE_INDIRECT) == BAKE_FILTER_DIFFUSE_INDIRECT) - out += L.indirect_diffuse; +#include 
"kernel/geom/geom.h" - if ((pass_filter & BAKE_FILTER_GLOSSY_DIRECT) == BAKE_FILTER_GLOSSY_DIRECT) - out += L.direct_glossy; - if ((pass_filter & BAKE_FILTER_GLOSSY_INDIRECT) == BAKE_FILTER_GLOSSY_INDIRECT) - out += L.indirect_glossy; - - if ((pass_filter & BAKE_FILTER_TRANSMISSION_DIRECT) == BAKE_FILTER_TRANSMISSION_DIRECT) - out += L.direct_transmission; - if ((pass_filter & BAKE_FILTER_TRANSMISSION_INDIRECT) == BAKE_FILTER_TRANSMISSION_INDIRECT) - out += L.indirect_transmission; - - if ((pass_filter & BAKE_FILTER_EMISSION) != 0) - out += L.emission; - - break; - } - case SHADER_EVAL_SHADOW: { - out = L.shadow; - break; - } - case SHADER_EVAL_DIFFUSE: { - out = kernel_bake_evaluate_direct_indirect( - kg, &sd, &state, L.direct_diffuse, L.indirect_diffuse, type, pass_filter); - break; - } - case SHADER_EVAL_GLOSSY: { - out = kernel_bake_evaluate_direct_indirect( - kg, &sd, &state, L.direct_glossy, L.indirect_glossy, type, pass_filter); - break; - } - case SHADER_EVAL_TRANSMISSION: { - out = kernel_bake_evaluate_direct_indirect( - kg, &sd, &state, L.direct_transmission, L.indirect_transmission, type, pass_filter); - break; - } -# endif - - /* extra */ - case SHADER_EVAL_ENVIRONMENT: { - /* setup ray */ - Ray ray; - - ray.P = zero_float3(); - ray.D = normalize(P); - ray.t = 0.0f; -# ifdef __CAMERA_MOTION__ - ray.time = 0.5f; -# endif - -# ifdef __RAY_DIFFERENTIALS__ - ray.dD = differential3_zero(); - ray.dP = differential3_zero(); -# endif - - /* setup shader data */ - shader_setup_from_background(kg, &sd, &ray); - - /* evaluate */ - int path_flag = 0; /* we can't know which type of BSDF this is for */ - shader_eval_surface(kg, &sd, &state, NULL, path_flag | PATH_RAY_EMISSION); - out = shader_background_eval(&sd); - break; - } - default: { - /* no real shader, returning the position of the verts for debugging */ - out = normalize(P); - break; - } - } - - /* write output */ - const float4 result = make_float4(out.x, out.y, out.z, 1.0f); - 
kernel_write_pass_float4(output, result); -} - -#endif /* __BAKING__ */ +CCL_NAMESPACE_BEGIN -ccl_device void kernel_displace_evaluate(KernelGlobals *kg, - ccl_global uint4 *input, +ccl_device void kernel_displace_evaluate(const KernelGlobals *kg, + ccl_global const KernelShaderEvalInput *input, ccl_global float4 *output, - int i) + const int offset) { - ShaderData sd; - PathState state = {0}; - uint4 in = input[i]; + /* Setup shader data. */ + const KernelShaderEvalInput in = input[offset]; - /* setup shader data */ - int object = in.x; - int prim = in.y; - float u = __uint_as_float(in.z); - float v = __uint_as_float(in.w); - - shader_setup_from_displace(kg, &sd, object, prim, u, v); + ShaderData sd; + shader_setup_from_displace(kg, &sd, in.object, in.prim, in.u, in.v); - /* evaluate */ - float3 P = sd.P; - shader_eval_displacement(kg, &sd, &state); + /* Evaluate displacement shader. */ + const float3 P = sd.P; + shader_eval_displacement(INTEGRATOR_STATE_PASS_NULL, &sd); float3 D = sd.P - P; object_inverse_dir_transform(kg, &sd, &D); - /* write output */ - output[i] += make_float4(D.x, D.y, D.z, 0.0f); + /* Write output. 
*/ + output[offset] += make_float4(D.x, D.y, D.z, 0.0f); } -ccl_device void kernel_background_evaluate(KernelGlobals *kg, - ccl_global uint4 *input, +ccl_device void kernel_background_evaluate(const KernelGlobals *kg, + ccl_global const KernelShaderEvalInput *input, ccl_global float4 *output, - int i) + const int offset) { - ShaderData sd; - PathState state = {0}; - uint4 in = input[i]; - - /* setup ray */ - Ray ray; - float u = __uint_as_float(in.x); - float v = __uint_as_float(in.y); - - ray.P = zero_float3(); - ray.D = equirectangular_to_direction(u, v); - ray.t = 0.0f; -#ifdef __CAMERA_MOTION__ - ray.time = 0.5f; -#endif + /* Setup ray */ + const KernelShaderEvalInput in = input[offset]; + const float3 ray_P = zero_float3(); + const float3 ray_D = equirectangular_to_direction(in.u, in.v); + const float ray_time = 0.5f; -#ifdef __RAY_DIFFERENTIALS__ - ray.dD = differential3_zero(); - ray.dP = differential3_zero(); -#endif - - /* setup shader data */ - shader_setup_from_background(kg, &sd, &ray); + /* Setup shader data. */ + ShaderData sd; + shader_setup_from_background(kg, &sd, ray_P, ray_D, ray_time); - /* evaluate */ - int path_flag = 0; /* we can't know which type of BSDF this is for */ - shader_eval_surface(kg, &sd, &state, NULL, path_flag | PATH_RAY_EMISSION); - float3 color = shader_background_eval(&sd); + /* Evaluate shader. + * This is being evaluated for all BSDFs, so path flag does not contain a specific type. */ + const int path_flag = PATH_RAY_EMISSION; + shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_LIGHT>( + INTEGRATOR_STATE_PASS_NULL, &sd, NULL, path_flag); + const float3 color = shader_background_eval(&sd); - /* write output */ - output[i] += make_float4(color.x, color.y, color.z, 0.0f); + /* Write output. 
*/ + output[offset] += make_float4(color.x, color.y, color.z, 0.0f); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_camera.h b/intern/cycles/kernel/kernel_camera.h index 1bfac37158d..7be5da8fe6d 100644 --- a/intern/cycles/kernel/kernel_camera.h +++ b/intern/cycles/kernel/kernel_camera.h @@ -14,6 +14,13 @@ * limitations under the License. */ +#pragma once + +#include "kernel_differential.h" +#include "kernel_lookup_table.h" +#include "kernel_montecarlo.h" +#include "kernel_projection.h" + CCL_NAMESPACE_BEGIN /* Perspective Camera */ @@ -39,7 +46,7 @@ ccl_device float2 camera_sample_aperture(ccl_constant KernelCamera *cam, float u return bokeh; } -ccl_device void camera_sample_perspective(KernelGlobals *kg, +ccl_device void camera_sample_perspective(const KernelGlobals *ccl_restrict kg, float raster_x, float raster_y, float lens_u, @@ -113,10 +120,14 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg, #ifdef __RAY_DIFFERENTIALS__ float3 Dcenter = transform_direction(&cameratoworld, Pcamera); - - ray->dP = differential3_zero(); - ray->dD.dx = normalize(Dcenter + float4_to_float3(kernel_data.cam.dx)) - normalize(Dcenter); - ray->dD.dy = normalize(Dcenter + float4_to_float3(kernel_data.cam.dy)) - normalize(Dcenter); + float3 Dcenter_normalized = normalize(Dcenter); + + /* TODO: can this be optimized to give compact differentials directly? 
*/ + ray->dP = differential_zero_compact(); + differential3 dD; + dD.dx = normalize(Dcenter + float4_to_float3(kernel_data.cam.dx)) - Dcenter_normalized; + dD.dy = normalize(Dcenter + float4_to_float3(kernel_data.cam.dy)) - Dcenter_normalized; + ray->dD = differential_make_compact(dD); #endif } else { @@ -143,8 +154,10 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg, Dx = normalize(transform_direction(&cameratoworld, Dx)); spherical_stereo_transform(&kernel_data.cam, &Px, &Dx); - ray->dP.dx = Px - Pcenter; - ray->dD.dx = Dx - Dcenter; + differential3 dP, dD; + + dP.dx = Px - Pcenter; + dD.dx = Dx - Dcenter; float3 Py = Pnostereo; float3 Dy = transform_perspective(&rastertocamera, @@ -152,8 +165,10 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg, Dy = normalize(transform_direction(&cameratoworld, Dy)); spherical_stereo_transform(&kernel_data.cam, &Py, &Dy); - ray->dP.dy = Py - Pcenter; - ray->dD.dy = Dy - Dcenter; + dP.dy = Py - Pcenter; + dD.dy = Dy - Dcenter; + ray->dD = differential_make_compact(dD); + ray->dP = differential_make_compact(dP); #endif } @@ -162,8 +177,7 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg, float z_inv = 1.0f / normalize(Pcamera).z; float nearclip = kernel_data.cam.nearclip * z_inv; ray->P += nearclip * ray->D; - ray->dP.dx += nearclip * ray->dD.dx; - ray->dP.dy += nearclip * ray->dD.dy; + ray->dP += nearclip * ray->dD; ray->t = kernel_data.cam.cliplength * z_inv; #else ray->t = FLT_MAX; @@ -171,7 +185,7 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg, } /* Orthographic Camera */ -ccl_device void camera_sample_orthographic(KernelGlobals *kg, +ccl_device void camera_sample_orthographic(const KernelGlobals *ccl_restrict kg, float raster_x, float raster_y, float lens_u, @@ -220,10 +234,12 @@ ccl_device void camera_sample_orthographic(KernelGlobals *kg, #ifdef __RAY_DIFFERENTIALS__ /* ray differential */ - ray->dP.dx = float4_to_float3(kernel_data.cam.dx); - ray->dP.dy = 
float4_to_float3(kernel_data.cam.dy); + differential3 dP; + dP.dx = float4_to_float3(kernel_data.cam.dx); + dP.dy = float4_to_float3(kernel_data.cam.dy); - ray->dD = differential3_zero(); + ray->dP = differential_make_compact(dP); + ray->dD = differential_zero_compact(); #endif #ifdef __CAMERA_CLIPPING__ @@ -323,8 +339,9 @@ ccl_device_inline void camera_sample_panorama(ccl_constant KernelCamera *cam, spherical_stereo_transform(cam, &Px, &Dx); } - ray->dP.dx = Px - Pcenter; - ray->dD.dx = Dx - Dcenter; + differential3 dP, dD; + dP.dx = Px - Pcenter; + dD.dx = Dx - Dcenter; float3 Py = transform_perspective(&rastertocamera, make_float3(raster_x, raster_y + 1.0f, 0.0f)); float3 Dy = panorama_to_direction(cam, Py.x, Py.y); @@ -334,16 +351,17 @@ ccl_device_inline void camera_sample_panorama(ccl_constant KernelCamera *cam, spherical_stereo_transform(cam, &Py, &Dy); } - ray->dP.dy = Py - Pcenter; - ray->dD.dy = Dy - Dcenter; + dP.dy = Py - Pcenter; + dD.dy = Dy - Dcenter; + ray->dD = differential_make_compact(dD); + ray->dP = differential_make_compact(dP); #endif #ifdef __CAMERA_CLIPPING__ /* clipping */ float nearclip = cam->nearclip; ray->P += nearclip * ray->D; - ray->dP.dx += nearclip * ray->dD.dx; - ray->dP.dy += nearclip * ray->dD.dy; + ray->dP += nearclip * ray->dD; ray->t = cam->cliplength; #else ray->t = FLT_MAX; @@ -352,7 +370,7 @@ ccl_device_inline void camera_sample_panorama(ccl_constant KernelCamera *cam, /* Common */ -ccl_device_inline void camera_sample(KernelGlobals *kg, +ccl_device_inline void camera_sample(const KernelGlobals *ccl_restrict kg, int x, int y, float filter_u, @@ -426,13 +444,13 @@ ccl_device_inline void camera_sample(KernelGlobals *kg, /* Utilities */ -ccl_device_inline float3 camera_position(KernelGlobals *kg) +ccl_device_inline float3 camera_position(const KernelGlobals *kg) { Transform cameratoworld = kernel_data.cam.cameratoworld; return make_float3(cameratoworld.x.w, cameratoworld.y.w, cameratoworld.z.w); } -ccl_device_inline float
camera_distance(KernelGlobals *kg, float3 P) +ccl_device_inline float camera_distance(const KernelGlobals *kg, float3 P) { Transform cameratoworld = kernel_data.cam.cameratoworld; float3 camP = make_float3(cameratoworld.x.w, cameratoworld.y.w, cameratoworld.z.w); @@ -446,7 +464,7 @@ ccl_device_inline float camera_distance(KernelGlobals *kg, float3 P) } } -ccl_device_inline float camera_z_depth(KernelGlobals *kg, float3 P) +ccl_device_inline float camera_z_depth(const KernelGlobals *kg, float3 P) { if (kernel_data.cam.type != CAMERA_PANORAMA) { Transform worldtocamera = kernel_data.cam.worldtocamera; @@ -459,7 +477,7 @@ ccl_device_inline float camera_z_depth(KernelGlobals *kg, float3 P) } } -ccl_device_inline float3 camera_direction_from_point(KernelGlobals *kg, float3 P) +ccl_device_inline float3 camera_direction_from_point(const KernelGlobals *kg, float3 P) { Transform cameratoworld = kernel_data.cam.cameratoworld; @@ -473,7 +491,7 @@ ccl_device_inline float3 camera_direction_from_point(KernelGlobals *kg, float3 P } } -ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd, float3 P) +ccl_device_inline float3 camera_world_to_ndc(const KernelGlobals *kg, ShaderData *sd, float3 P) { if (kernel_data.cam.type != CAMERA_PANORAMA) { /* perspective / ortho */ diff --git a/intern/cycles/kernel/kernel_color.h b/intern/cycles/kernel/kernel_color.h index 5eb1bdad02e..960774e0741 100644 --- a/intern/cycles/kernel/kernel_color.h +++ b/intern/cycles/kernel/kernel_color.h @@ -14,25 +14,22 @@ * limitations under the License. 
*/ -#ifndef __KERNEL_COLOR_H__ -#define __KERNEL_COLOR_H__ +#pragma once #include "util/util_color.h" CCL_NAMESPACE_BEGIN -ccl_device float3 xyz_to_rgb(KernelGlobals *kg, float3 xyz) +ccl_device float3 xyz_to_rgb(const KernelGlobals *kg, float3 xyz) { return make_float3(dot(float4_to_float3(kernel_data.film.xyz_to_r), xyz), dot(float4_to_float3(kernel_data.film.xyz_to_g), xyz), dot(float4_to_float3(kernel_data.film.xyz_to_b), xyz)); } -ccl_device float linear_rgb_to_gray(KernelGlobals *kg, float3 c) +ccl_device float linear_rgb_to_gray(const KernelGlobals *kg, float3 c) { return dot(c, float4_to_float3(kernel_data.film.rgb_to_y)); } CCL_NAMESPACE_END - -#endif /* __KERNEL_COLOR_H__ */ diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h deleted file mode 100644 index 4a9304a134c..00000000000 --- a/intern/cycles/kernel/kernel_compat_opencl.h +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __KERNEL_COMPAT_OPENCL_H__ -#define __KERNEL_COMPAT_OPENCL_H__ - -#define __KERNEL_GPU__ -#define __KERNEL_OPENCL__ - -/* no namespaces in opencl */ -#define CCL_NAMESPACE_BEGIN -#define CCL_NAMESPACE_END - -#ifdef __CL_NOINLINE__ -# define ccl_noinline __attribute__((noinline)) -#else -# define ccl_noinline -#endif - -/* in opencl all functions are device functions, so leave this empty */ -#define ccl_device -#define ccl_device_inline ccl_device -#define ccl_device_forceinline ccl_device -#define ccl_device_noinline ccl_device ccl_noinline -#define ccl_device_noinline_cpu ccl_device -#define ccl_may_alias -#define ccl_static_constant static __constant -#define ccl_constant __constant -#define ccl_global __global -#define ccl_local __local -#define ccl_local_param __local -#define ccl_private __private -#define ccl_restrict restrict -#define ccl_ref -#define ccl_align(n) __attribute__((aligned(n))) -#define ccl_optional_struct_init - -#if __OPENCL_VERSION__ >= 200 && !defined(__NV_CL_C_VERSION) -# define ccl_loop_no_unroll __attribute__((opencl_unroll_hint(1))) -#else -# define ccl_loop_no_unroll -#endif - -#ifdef __SPLIT_KERNEL__ -# define ccl_addr_space __global -#else -# define ccl_addr_space -#endif - -#define ATTR_FALLTHROUGH - -#define ccl_local_id(d) get_local_id(d) -#define ccl_global_id(d) get_global_id(d) - -#define ccl_local_size(d) get_local_size(d) -#define ccl_global_size(d) get_global_size(d) - -#define ccl_group_id(d) get_group_id(d) -#define ccl_num_groups(d) get_num_groups(d) - -/* Selective nodes compilation. 
*/ -#ifndef __NODES_MAX_GROUP__ -# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX -#endif -#ifndef __NODES_FEATURES__ -# define __NODES_FEATURES__ NODE_FEATURE_ALL -#endif - -/* no assert in opencl */ -#define kernel_assert(cond) - -/* make_type definitions with opencl style element initializers */ -#ifdef make_float2 -# undef make_float2 -#endif -#ifdef make_float3 -# undef make_float3 -#endif -#ifdef make_float4 -# undef make_float4 -#endif -#ifdef make_int2 -# undef make_int2 -#endif -#ifdef make_int3 -# undef make_int3 -#endif -#ifdef make_int4 -# undef make_int4 -#endif -#ifdef make_uchar4 -# undef make_uchar4 -#endif - -#define make_float2(x, y) ((float2)(x, y)) -#define make_float3(x, y, z) ((float3)(x, y, z)) -#define make_float4(x, y, z, w) ((float4)(x, y, z, w)) -#define make_int2(x, y) ((int2)(x, y)) -#define make_int3(x, y, z) ((int3)(x, y, z)) -#define make_int4(x, y, z, w) ((int4)(x, y, z, w)) -#define make_uchar4(x, y, z, w) ((uchar4)(x, y, z, w)) - -/* math functions */ -#define __uint_as_float(x) as_float(x) -#define __float_as_uint(x) as_uint(x) -#define __int_as_float(x) as_float(x) -#define __float_as_int(x) as_int(x) -#define powf(x, y) pow(((float)(x)), ((float)(y))) -#define fabsf(x) fabs(((float)(x))) -#define copysignf(x, y) copysign(((float)(x)), ((float)(y))) -#define asinf(x) asin(((float)(x))) -#define acosf(x) acos(((float)(x))) -#define atanf(x) atan(((float)(x))) -#define floorf(x) floor(((float)(x))) -#define ceilf(x) ceil(((float)(x))) -#define hypotf(x, y) hypot(((float)(x)), ((float)(y))) -#define atan2f(x, y) atan2(((float)(x)), ((float)(y))) -#define fmaxf(x, y) fmax(((float)(x)), ((float)(y))) -#define fminf(x, y) fmin(((float)(x)), ((float)(y))) -#define fmodf(x, y) fmod((float)(x), (float)(y)) -#define sinhf(x) sinh(((float)(x))) -#define coshf(x) cosh(((float)(x))) -#define tanhf(x) tanh(((float)(x))) - -/* Use native functions with possibly lower precision for performance, - * no issues found so far. 
*/ -#if 1 -# define sinf(x) native_sin(((float)(x))) -# define cosf(x) native_cos(((float)(x))) -# define tanf(x) native_tan(((float)(x))) -# define expf(x) native_exp(((float)(x))) -# define sqrtf(x) native_sqrt(((float)(x))) -# define logf(x) native_log(((float)(x))) -# define rcp(x) native_recip(x) -#else -# define sinf(x) sin(((float)(x))) -# define cosf(x) cos(((float)(x))) -# define tanf(x) tan(((float)(x))) -# define expf(x) exp(((float)(x))) -# define sqrtf(x) sqrt(((float)(x))) -# define logf(x) log(((float)(x))) -# define rcp(x) recip(x) -#endif - -/* data lookup defines */ -#define kernel_data (*kg->data) -#define kernel_tex_array(tex) \ - ((const ccl_global tex##_t *)(kg->buffers[kg->tex.cl_buffer] + kg->tex.data)) -#define kernel_tex_fetch(tex, index) kernel_tex_array(tex)[(index)] - -/* define NULL */ -#ifndef NULL -# define NULL ((void *)0) -#endif - -/* enable extensions */ -#ifdef __KERNEL_CL_KHR_FP16__ -# pragma OPENCL EXTENSION cl_khr_fp16 : enable -#endif - -#include "util/util_half.h" -#include "util/util_types.h" - -#endif /* __KERNEL_COMPAT_OPENCL_H__ */ diff --git a/intern/cycles/kernel/kernel_differential.h b/intern/cycles/kernel/kernel_differential.h index 3ec0cdbaccc..db4e110bd10 100644 --- a/intern/cycles/kernel/kernel_differential.h +++ b/intern/cycles/kernel/kernel_differential.h @@ -14,26 +14,28 @@ * limitations under the License. */ +#pragma once + CCL_NAMESPACE_BEGIN /* See "Tracing Ray Differentials", Homan Igehy, 1999. 
*/ -ccl_device void differential_transfer(ccl_addr_space differential3 *dP_, - const differential3 dP, - float3 D, - const differential3 dD, - float3 Ng, - float t) +ccl_device void differential_transfer(ccl_addr_space differential3 *surface_dP, + const differential3 ray_dP, + float3 ray_D, + const differential3 ray_dD, + float3 surface_Ng, + float ray_t) { /* ray differential transfer through homogeneous medium, to * compute dPdx/dy at a shading point from the incoming ray */ - float3 tmp = D / dot(D, Ng); - float3 tmpx = dP.dx + t * dD.dx; - float3 tmpy = dP.dy + t * dD.dy; + float3 tmp = ray_D / dot(ray_D, surface_Ng); + float3 tmpx = ray_dP.dx + ray_t * ray_dD.dx; + float3 tmpy = ray_dP.dy + ray_t * ray_dD.dy; - dP_->dx = tmpx - dot(tmpx, Ng) * tmp; - dP_->dy = tmpy - dot(tmpy, Ng) * tmp; + surface_dP->dx = tmpx - dot(tmpx, surface_Ng) * tmp; + surface_dP->dy = tmpy - dot(tmpy, surface_Ng) * tmp; } ccl_device void differential_incoming(ccl_addr_space differential3 *dI, const differential3 dD) @@ -112,4 +114,53 @@ ccl_device differential3 differential3_zero() return d; } +/* Compact ray differentials that are just a scale to reduce memory usage and + * access cost in GPU. + * + * See above for more accurate reference implementations. + * + * TODO: also store the more compact version in ShaderData and recompute where + * needed? 
*/ + +ccl_device_forceinline float differential_zero_compact() +{ + return 0.0f; +} + +ccl_device_forceinline float differential_make_compact(const differential3 D) +{ + return 0.5f * (len(D.dx) + len(D.dy)); +} + +ccl_device_forceinline void differential_transfer_compact(ccl_addr_space differential3 *surface_dP, + const float ray_dP, + const float3 /* ray_D */, + const float ray_dD, + const float3 surface_Ng, + const float ray_t) +{ + /* ray differential transfer through homogeneous medium, to + * compute dPdx/dy at a shading point from the incoming ray */ + float scale = ray_dP + ray_t * ray_dD; + + float3 dx, dy; + make_orthonormals(surface_Ng, &dx, &dy); + surface_dP->dx = dx * scale; + surface_dP->dy = dy * scale; +} + +ccl_device_forceinline void differential_incoming_compact(ccl_addr_space differential3 *dI, + const float3 D, + const float dD) +{ + /* compute dIdx/dy at a shading point, we just need to negate the + * differential of the ray direction */ + + float3 dx, dy; + make_orthonormals(D, &dx, &dy); + + dI->dx = dD * dx; + dI->dy = dD * dy; +} + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h index aebf2ec8e28..d62285d173d 100644 --- a/intern/cycles/kernel/kernel_emission.h +++ b/intern/cycles/kernel/kernel_emission.h @@ -14,40 +14,36 @@ * limitations under the License. */ +#pragma once + +#include "kernel/kernel_light.h" +#include "kernel/kernel_montecarlo.h" +#include "kernel/kernel_path_state.h" +#include "kernel/kernel_shader.h" + CCL_NAMESPACE_BEGIN -/* Direction Emission */ -ccl_device_noinline_cpu float3 direct_emissive_eval(KernelGlobals *kg, - ShaderData *emission_sd, - LightSample *ls, - ccl_addr_space PathState *state, - float3 I, - differential3 dI, - float t, - float time) +/* Evaluate shader on light. 
*/ +ccl_device_noinline_cpu float3 light_sample_shader_eval(INTEGRATOR_STATE_ARGS, + ShaderData *ccl_restrict emission_sd, + LightSample *ccl_restrict ls, + float time) { /* setup shading at emitter */ float3 eval = zero_float3(); if (shader_constant_emission_eval(kg, ls->shader, &eval)) { - if ((ls->prim != PRIM_NONE) && dot(ls->Ng, I) < 0.0f) { + if ((ls->prim != PRIM_NONE) && dot(ls->Ng, ls->D) > 0.0f) { ls->Ng = -ls->Ng; } } else { /* Setup shader data and call shader_eval_surface once, better * for GPU coherence and compile times. */ + PROFILING_INIT_FOR_SHADER(kg, PROFILING_SHADE_LIGHT_SETUP); #ifdef __BACKGROUND_MIS__ if (ls->type == LIGHT_BACKGROUND) { - Ray ray; - ray.D = ls->D; - ray.P = ls->P; - ray.t = 1.0f; - ray.time = time; - ray.dP = differential3_zero(); - ray.dD = dI; - - shader_setup_from_background(kg, emission_sd, &ray); + shader_setup_from_background(kg, emission_sd, ls->P, ls->D, time); } else #endif @@ -56,13 +52,13 @@ ccl_device_noinline_cpu float3 direct_emissive_eval(KernelGlobals *kg, emission_sd, ls->P, ls->Ng, - I, + -ls->D, ls->shader, ls->object, ls->prim, ls->u, ls->v, - t, + ls->t, time, false, ls->lamp); @@ -70,11 +66,13 @@ ccl_device_noinline_cpu float3 direct_emissive_eval(KernelGlobals *kg, ls->Ng = emission_sd->Ng; } + PROFILING_SHADER(emission_sd->object, emission_sd->shader); + PROFILING_EVENT(PROFILING_SHADE_LIGHT_EVAL); + /* No proper path flag, we're evaluating this for all closures. that's * weak but we'd have to do multiple evaluations otherwise. */ - path_state_modify_bounce(state, true); - shader_eval_surface(kg, emission_sd, state, NULL, PATH_RAY_EMISSION); - path_state_modify_bounce(state, false); + shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_LIGHT>( + INTEGRATOR_STATE_PASS, emission_sd, NULL, PATH_RAY_EMISSION); /* Evaluate closures. 
*/ #ifdef __BACKGROUND_MIS__ @@ -98,85 +96,129 @@ ccl_device_noinline_cpu float3 direct_emissive_eval(KernelGlobals *kg, return eval; } -ccl_device_noinline_cpu bool direct_emission(KernelGlobals *kg, - ShaderData *sd, - ShaderData *emission_sd, - LightSample *ls, - ccl_addr_space PathState *state, - Ray *ray, - BsdfEval *eval, - bool *is_lamp, - float rand_terminate) +/* Test if light sample is from a light or emission from geometry. */ +ccl_device_inline bool light_sample_is_light(const LightSample *ccl_restrict ls) { - if (ls->pdf == 0.0f) - return false; - - /* todo: implement */ - differential3 dD = differential3_zero(); + /* return if it's a lamp for shadow pass */ + return (ls->prim == PRIM_NONE && ls->type != LIGHT_BACKGROUND); +} - /* evaluate closure */ +/* Early path termination of shadow rays. */ +ccl_device_inline bool light_sample_terminate(const KernelGlobals *ccl_restrict kg, + const LightSample *ccl_restrict ls, + BsdfEval *ccl_restrict eval, + const float rand_terminate) +{ + if (bsdf_eval_is_zero(eval)) { + return true; + } - float3 light_eval = direct_emissive_eval( - kg, emission_sd, ls, state, -ls->D, dD, ls->t, sd->time); + if (kernel_data.integrator.light_inv_rr_threshold > 0.0f) { + float probability = max3(fabs(bsdf_eval_sum(eval))) * + kernel_data.integrator.light_inv_rr_threshold; + if (probability < 1.0f) { + if (rand_terminate >= probability) { + return true; + } + bsdf_eval_mul(eval, 1.0f / probability); + } + } - if (is_zero(light_eval)) - return false; + return false; +} - /* evaluate BSDF at shading point */ +/* This function should be used to compute a modified ray start position for + * rays leaving from a surface. The algorithm slightly distorts flat surface + * of a triangle. Surface is lifted by amount h along normal n in the incident + * point. 
*/ -#ifdef __VOLUME__ - if (sd->prim != PRIM_NONE) - shader_bsdf_eval(kg, sd, ls->D, eval, ls->pdf, ls->shader & SHADER_USE_MIS); +ccl_device_inline float3 shadow_ray_smooth_surface_offset(const KernelGlobals *ccl_restrict kg, + const ShaderData *ccl_restrict sd, + float3 Ng) +{ + float3 V[3], N[3]; + triangle_vertices_and_normals(kg, sd->prim, V, N); + + const float u = sd->u, v = sd->v; + const float w = 1 - u - v; + float3 P = V[0] * u + V[1] * v + V[2] * w; /* Local space */ + float3 n = N[0] * u + N[1] * v + N[2] * w; /* We get away without normalization */ + + object_normal_transform(kg, sd, &n); /* Normal x scale, world space */ + + /* Parabolic approximation */ + float a = dot(N[2] - N[0], V[0] - V[2]); + float b = dot(N[2] - N[1], V[1] - V[2]); + float c = dot(N[1] - N[0], V[1] - V[0]); + float h = a * u * (u - 1) + (a + b + c) * u * v + b * v * (v - 1); + + /* Check flipped normals */ + if (dot(n, Ng) > 0) { + /* Local linear envelope */ + float h0 = max(max(dot(V[1] - V[0], N[0]), dot(V[2] - V[0], N[0])), 0.0f); + float h1 = max(max(dot(V[0] - V[1], N[1]), dot(V[2] - V[1], N[1])), 0.0f); + float h2 = max(max(dot(V[0] - V[2], N[2]), dot(V[1] - V[2], N[2])), 0.0f); + h0 = max(dot(V[0] - P, N[0]) + h0, 0.0f); + h1 = max(dot(V[1] - P, N[1]) + h1, 0.0f); + h2 = max(dot(V[2] - P, N[2]) + h2, 0.0f); + h = max(min(min(h0, h1), h2), h * 0.5f); + } else { - float bsdf_pdf; - shader_volume_phase_eval(kg, sd, ls->D, eval, &bsdf_pdf); - if (ls->shader & SHADER_USE_MIS) { - /* Multiple importance sampling. 
*/ - float mis_weight = power_heuristic(ls->pdf, bsdf_pdf); - light_eval *= mis_weight; - } + float h0 = max(max(dot(V[0] - V[1], N[0]), dot(V[0] - V[2], N[0])), 0.0f); + float h1 = max(max(dot(V[1] - V[0], N[1]), dot(V[1] - V[2], N[1])), 0.0f); + float h2 = max(max(dot(V[2] - V[0], N[2]), dot(V[2] - V[1], N[2])), 0.0f); + h0 = max(dot(P - V[0], N[0]) + h0, 0.0f); + h1 = max(dot(P - V[1], N[1]) + h1, 0.0f); + h2 = max(dot(P - V[2], N[2]) + h2, 0.0f); + h = min(-min(min(h0, h1), h2), h * 0.5f); } -#else - shader_bsdf_eval(kg, sd, ls->D, eval, ls->pdf, ls->shader & SHADER_USE_MIS); -#endif - bsdf_eval_mul3(eval, light_eval / ls->pdf); - -#ifdef __PASSES__ - /* use visibility flag to skip lights */ - if (ls->shader & SHADER_EXCLUDE_ANY) { - if (ls->shader & SHADER_EXCLUDE_DIFFUSE) - eval->diffuse = zero_float3(); - if (ls->shader & SHADER_EXCLUDE_GLOSSY) - eval->glossy = zero_float3(); - if (ls->shader & SHADER_EXCLUDE_TRANSMIT) - eval->transmission = zero_float3(); - if (ls->shader & SHADER_EXCLUDE_SCATTER) - eval->volume = zero_float3(); - } -#endif + return n * h; +} - if (bsdf_eval_is_zero(eval)) - return false; +/* Ray offset to avoid shadow terminator artifact. */ - if (kernel_data.integrator.light_inv_rr_threshold > 0.0f -#ifdef __SHADOW_TRICKS__ - && (state->flag & PATH_RAY_SHADOW_CATCHER) == 0 -#endif - ) { - float probability = max3(fabs(bsdf_eval_sum(eval))) * - kernel_data.integrator.light_inv_rr_threshold; - if (probability < 1.0f) { - if (rand_terminate >= probability) { - return false; +ccl_device_inline float3 shadow_ray_offset(const KernelGlobals *ccl_restrict kg, + const ShaderData *ccl_restrict sd, + float3 L) +{ + float NL = dot(sd->N, L); + bool transmit = (NL < 0.0f); + float3 Ng = (transmit ? 
-sd->Ng : sd->Ng); + float3 P = ray_offset(sd->P, Ng); + + if ((sd->type & PRIMITIVE_ALL_TRIANGLE) && (sd->shader & SHADER_SMOOTH_NORMAL)) { + const float offset_cutoff = + kernel_tex_fetch(__objects, sd->object).shadow_terminator_geometry_offset; + /* Do ray offset (heavy stuff) only for close to be terminated triangles: + * offset_cutoff = 0.1f means that 10-20% of rays will be affected. Also + * make a smooth transition near the threshold. */ + if (offset_cutoff > 0.0f) { + float NgL = dot(Ng, L); + float offset_amount = 0.0f; + if (NL < offset_cutoff) { + offset_amount = clamp(2.0f - (NgL + NL) / offset_cutoff, 0.0f, 1.0f); + } + else { + offset_amount = clamp(1.0f - NgL / offset_cutoff, 0.0f, 1.0f); + } + if (offset_amount > 0.0f) { + P += shadow_ray_smooth_surface_offset(kg, sd, Ng) * offset_amount; } - bsdf_eval_mul(eval, 1.0f / probability); } } + return P; +} + +ccl_device_inline void shadow_ray_setup(const ShaderData *ccl_restrict sd, + const LightSample *ccl_restrict ls, + const float3 P, + Ray *ray) +{ if (ls->shader & SHADER_CAST_SHADOW) { /* setup ray */ - ray->P = ray_offset_shadow(kg, sd, ls->D); + ray->P = P; if (ls->t == FLT_MAX) { /* distant light */ @@ -185,160 +227,40 @@ ccl_device_noinline_cpu bool direct_emission(KernelGlobals *kg, } else { /* other lights, avoid self-intersection */ - ray->D = ray_offset(ls->P, ls->Ng) - ray->P; + ray->D = ray_offset(ls->P, ls->Ng) - P; ray->D = normalize_len(ray->D, &ray->t); } - - ray->dP = sd->dP; - ray->dD = differential3_zero(); } else { /* signal to not cast shadow ray */ + ray->P = zero_float3(); + ray->D = zero_float3(); ray->t = 0.0f; } - /* return if it's a lamp for shadow pass */ - *is_lamp = (ls->prim == PRIM_NONE && ls->type != LIGHT_BACKGROUND); - - return true; + ray->dP = differential_make_compact(sd->dP); + ray->dD = differential_zero_compact(); + ray->time = sd->time; } -/* Indirect Primitive Emission */ - -ccl_device_noinline_cpu float3 indirect_primitive_emission( - KernelGlobals *kg, 
ShaderData *sd, float t, int path_flag, float bsdf_pdf) +/* Create shadow ray towards light sample. */ +ccl_device_inline void light_sample_to_surface_shadow_ray(const KernelGlobals *ccl_restrict kg, + const ShaderData *ccl_restrict sd, + const LightSample *ccl_restrict ls, + Ray *ray) { - /* evaluate emissive closure */ - float3 L = shader_emissive_eval(sd); - -#ifdef __HAIR__ - if (!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS) && - (sd->type & PRIMITIVE_ALL_TRIANGLE)) -#else - if (!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS)) -#endif - { - /* multiple importance sampling, get triangle light pdf, - * and compute weight with respect to BSDF pdf */ - float pdf = triangle_light_pdf(kg, sd, t); - float mis_weight = power_heuristic(bsdf_pdf, pdf); - - return L * mis_weight; - } - - return L; + const float3 P = shadow_ray_offset(kg, sd, ls->D); + shadow_ray_setup(sd, ls, P, ray); } -/* Indirect Lamp Emission */ - -ccl_device_noinline_cpu void indirect_lamp_emission(KernelGlobals *kg, - ShaderData *emission_sd, - ccl_addr_space PathState *state, - PathRadiance *L, - Ray *ray, - float3 throughput) +/* Create shadow ray towards light sample. 
*/ +ccl_device_inline void light_sample_to_volume_shadow_ray(const KernelGlobals *ccl_restrict kg, + const ShaderData *ccl_restrict sd, + const LightSample *ccl_restrict ls, + const float3 P, + Ray *ray) { - for (int lamp = 0; lamp < kernel_data.integrator.num_all_lights; lamp++) { - LightSample ls ccl_optional_struct_init; - - if (!lamp_light_eval(kg, lamp, ray->P, ray->D, ray->t, &ls)) - continue; - -#ifdef __PASSES__ - /* use visibility flag to skip lights */ - if (ls.shader & SHADER_EXCLUDE_ANY) { - if (((ls.shader & SHADER_EXCLUDE_DIFFUSE) && (state->flag & PATH_RAY_DIFFUSE)) || - ((ls.shader & SHADER_EXCLUDE_GLOSSY) && - ((state->flag & (PATH_RAY_GLOSSY | PATH_RAY_REFLECT)) == - (PATH_RAY_GLOSSY | PATH_RAY_REFLECT))) || - ((ls.shader & SHADER_EXCLUDE_TRANSMIT) && (state->flag & PATH_RAY_TRANSMIT)) || - ((ls.shader & SHADER_EXCLUDE_SCATTER) && (state->flag & PATH_RAY_VOLUME_SCATTER))) - continue; - } -#endif - - float3 lamp_L = direct_emissive_eval( - kg, emission_sd, &ls, state, -ray->D, ray->dD, ls.t, ray->time); - -#ifdef __VOLUME__ - if (state->volume_stack[0].shader != SHADER_NONE) { - /* shadow attenuation */ - Ray volume_ray = *ray; - volume_ray.t = ls.t; - float3 volume_tp = one_float3(); - kernel_volume_shadow(kg, emission_sd, state, &volume_ray, &volume_tp); - lamp_L *= volume_tp; - } -#endif - - if (!(state->flag & PATH_RAY_MIS_SKIP)) { - /* multiple importance sampling, get regular light pdf, - * and compute weight with respect to BSDF pdf */ - float mis_weight = power_heuristic(state->ray_pdf, ls.pdf); - lamp_L *= mis_weight; - } - - path_radiance_accum_emission(kg, L, state, throughput, lamp_L); - } -} - -/* Indirect Background */ - -ccl_device_noinline_cpu float3 indirect_background(KernelGlobals *kg, - ShaderData *emission_sd, - ccl_addr_space PathState *state, - ccl_global float *buffer, - ccl_addr_space Ray *ray) -{ -#ifdef __BACKGROUND__ - int shader = kernel_data.background.surface_shader; - - /* Use visibility flag to skip lights. 
*/ - if (shader & SHADER_EXCLUDE_ANY) { - if (((shader & SHADER_EXCLUDE_DIFFUSE) && (state->flag & PATH_RAY_DIFFUSE)) || - ((shader & SHADER_EXCLUDE_GLOSSY) && - ((state->flag & (PATH_RAY_GLOSSY | PATH_RAY_REFLECT)) == - (PATH_RAY_GLOSSY | PATH_RAY_REFLECT))) || - ((shader & SHADER_EXCLUDE_TRANSMIT) && (state->flag & PATH_RAY_TRANSMIT)) || - ((shader & SHADER_EXCLUDE_CAMERA) && (state->flag & PATH_RAY_CAMERA)) || - ((shader & SHADER_EXCLUDE_SCATTER) && (state->flag & PATH_RAY_VOLUME_SCATTER))) - return zero_float3(); - } - - /* Evaluate background shader. */ - float3 L = zero_float3(); - if (!shader_constant_emission_eval(kg, shader, &L)) { -# ifdef __SPLIT_KERNEL__ - Ray priv_ray = *ray; - shader_setup_from_background(kg, emission_sd, &priv_ray); -# else - shader_setup_from_background(kg, emission_sd, ray); -# endif - - path_state_modify_bounce(state, true); - shader_eval_surface(kg, emission_sd, state, buffer, state->flag | PATH_RAY_EMISSION); - path_state_modify_bounce(state, false); - - L = shader_background_eval(emission_sd); - } - - /* Background MIS weights. */ -# ifdef __BACKGROUND_MIS__ - /* Check if background light exists or if we should skip pdf. */ - if (!(state->flag & PATH_RAY_MIS_SKIP) && kernel_data.background.use_mis) { - /* multiple importance sampling, get background light pdf for ray - * direction, and compute weight with respect to BSDF pdf */ - float pdf = background_light_pdf(kg, ray->P, ray->D); - float mis_weight = power_heuristic(state->ray_pdf, pdf); - - return L * mis_weight; - } -# endif - - return L; -#else - return make_float3(0.8f, 0.8f, 0.8f); -#endif + shadow_ray_setup(sd, ls, P, ray); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_film.h b/intern/cycles/kernel/kernel_film.h index a6fd4f1dc7e..fa93f4830d1 100644 --- a/intern/cycles/kernel/kernel_film.h +++ b/intern/cycles/kernel/kernel_film.h @@ -14,119 +14,516 @@ * limitations under the License. 
*/ +#pragma once + CCL_NAMESPACE_BEGIN -ccl_device float4 film_get_pass_result(KernelGlobals *kg, - ccl_global float *buffer, - float sample_scale, - int index, - bool use_display_sample_scale) -{ - float4 pass_result; - - int display_pass_stride = kernel_data.film.display_pass_stride; - int display_pass_components = kernel_data.film.display_pass_components; - - if (display_pass_components == 4) { - float4 in = *(ccl_global float4 *)(buffer + display_pass_stride + - index * kernel_data.film.pass_stride); - float alpha = use_display_sample_scale ? - (kernel_data.film.use_display_pass_alpha ? in.w : 1.0f / sample_scale) : - 1.0f; - - pass_result = make_float4(in.x, in.y, in.z, alpha); - - int display_divide_pass_stride = kernel_data.film.display_divide_pass_stride; - if (display_divide_pass_stride != -1) { - ccl_global float4 *divide_in = (ccl_global float4 *)(buffer + display_divide_pass_stride + - index * kernel_data.film.pass_stride); - float3 divided = safe_divide_even_color(float4_to_float3(pass_result), - float4_to_float3(*divide_in)); - pass_result = make_float4(divided.x, divided.y, divided.z, pass_result.w); - } +/* -------------------------------------------------------------------- + * Common utilities. + */ - if (kernel_data.film.use_display_exposure) { - float exposure = kernel_data.film.exposure; - pass_result *= make_float4(exposure, exposure, exposure, 1.0f); - } +/* The input buffer contains transparency = 1 - alpha, this converts it to + * alpha. Also clamp since alpha might end up outside of 0..1 due to Russian + * roulette. 
*/ +ccl_device_forceinline float film_transparency_to_alpha(float transparency) +{ + return saturate(1.0f - transparency); +} + +ccl_device_inline float film_get_scale(const KernelFilmConvert *ccl_restrict kfilm_convert, + ccl_global const float *ccl_restrict buffer) +{ + if (kfilm_convert->pass_sample_count == PASS_UNUSED) { + return kfilm_convert->scale; + } + + if (kfilm_convert->pass_use_filter) { + const uint sample_count = *((const uint *)(buffer + kfilm_convert->pass_sample_count)); + return 1.0f / sample_count; + } + + return 1.0f; +} + +ccl_device_inline float film_get_scale_exposure(const KernelFilmConvert *ccl_restrict + kfilm_convert, + ccl_global const float *ccl_restrict buffer) +{ + if (kfilm_convert->pass_sample_count == PASS_UNUSED) { + return kfilm_convert->scale_exposure; + } + + const float scale = film_get_scale(kfilm_convert, buffer); + + if (kfilm_convert->pass_use_exposure) { + return scale * kfilm_convert->exposure; + } + + return scale; +} + +ccl_device_inline bool film_get_scale_and_scale_exposure( + const KernelFilmConvert *ccl_restrict kfilm_convert, + ccl_global const float *ccl_restrict buffer, + float *ccl_restrict scale, + float *ccl_restrict scale_exposure) +{ + if (kfilm_convert->pass_sample_count == PASS_UNUSED) { + *scale = kfilm_convert->scale; + *scale_exposure = kfilm_convert->scale_exposure; + return true; + } + + const uint sample_count = *((const uint *)(buffer + kfilm_convert->pass_sample_count)); + if (!sample_count) { + *scale = 0.0f; + *scale_exposure = 0.0f; + return false; + } + + if (kfilm_convert->pass_use_filter) { + *scale = 1.0f / sample_count; } - else if (display_pass_components == 1) { - ccl_global float *in = (ccl_global float *)(buffer + display_pass_stride + - index * kernel_data.film.pass_stride); - pass_result = make_float4(*in, *in, *in, 1.0f / sample_scale); + else { + *scale = 1.0f; + } + + if (kfilm_convert->pass_use_exposure) { + *scale_exposure = *scale * kfilm_convert->exposure; + } + else { + 
*scale_exposure = *scale; + } + + return true; +} + +/* -------------------------------------------------------------------- + * Float (scalar) passes. + */ + +ccl_device_inline void film_get_pass_pixel_depth(const KernelFilmConvert *ccl_restrict + kfilm_convert, + ccl_global const float *ccl_restrict buffer, + float *ccl_restrict pixel) +{ + kernel_assert(kfilm_convert->num_components >= 1); + kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED); + + const float scale_exposure = film_get_scale_exposure(kfilm_convert, buffer); + + const float *in = buffer + kfilm_convert->pass_offset; + const float f = *in; + + pixel[0] = (f == 0.0f) ? 1e10f : f * scale_exposure; +} + +ccl_device_inline void film_get_pass_pixel_mist(const KernelFilmConvert *ccl_restrict + kfilm_convert, + ccl_global const float *ccl_restrict buffer, + float *ccl_restrict pixel) +{ + kernel_assert(kfilm_convert->num_components >= 1); + kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED); + + const float scale_exposure = film_get_scale_exposure(kfilm_convert, buffer); + + const float *in = buffer + kfilm_convert->pass_offset; + const float f = *in; + + /* Note that we accumulate 1 - mist in the kernel to avoid having to + * track the mist values in the integrator state. */ + pixel[0] = saturate(1.0f - f * scale_exposure); +} + +ccl_device_inline void film_get_pass_pixel_sample_count( + const KernelFilmConvert *ccl_restrict kfilm_convert, + ccl_global const float *ccl_restrict buffer, + float *ccl_restrict pixel) +{ + /* TODO(sergey): Consider normalizing into the [0..1] range, so that it is possible to see + * meaningful value when adaptive sampler stopped rendering image way before the maximum + * number of samples was reached (for examples when number of samples is set to 0 in + * viewport). 
*/ + + kernel_assert(kfilm_convert->num_components >= 1); + kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED); + + const float *in = buffer + kfilm_convert->pass_offset; + const float f = *in; + + pixel[0] = __float_as_uint(f) * kfilm_convert->scale; +} + +ccl_device_inline void film_get_pass_pixel_float(const KernelFilmConvert *ccl_restrict + kfilm_convert, + ccl_global const float *ccl_restrict buffer, + float *ccl_restrict pixel) +{ + kernel_assert(kfilm_convert->num_components >= 1); + kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED); + + const float scale_exposure = film_get_scale_exposure(kfilm_convert, buffer); + + const float *in = buffer + kfilm_convert->pass_offset; + const float f = *in; + + pixel[0] = f * scale_exposure; +} + +/* -------------------------------------------------------------------- + * Float 3 passes. + */ + +ccl_device_inline void film_get_pass_pixel_light_path(const KernelFilmConvert *ccl_restrict + kfilm_convert, + ccl_global const float *ccl_restrict buffer, + float *ccl_restrict pixel) +{ + kernel_assert(kfilm_convert->num_components >= 3); + kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED); + + /* Read light pass. */ + const float *in = buffer + kfilm_convert->pass_offset; + float3 f = make_float3(in[0], in[1], in[2]); + + /* Optionally add indirect light pass. */ + if (kfilm_convert->pass_indirect != PASS_UNUSED) { + const float *in_indirect = buffer + kfilm_convert->pass_indirect; + const float3 f_indirect = make_float3(in_indirect[0], in_indirect[1], in_indirect[2]); + f += f_indirect; + } + + /* Optionally divide out color. */ + if (kfilm_convert->pass_divide != PASS_UNUSED) { + const float *in_divide = buffer + kfilm_convert->pass_divide; + const float3 f_divide = make_float3(in_divide[0], in_divide[1], in_divide[2]); + f = safe_divide_even_color(f, f_divide); + + /* Exposure only, sample scale cancels out. */ + f *= kfilm_convert->exposure; + } + else { + /* Sample scale and exposure. 
*/ + f *= film_get_scale_exposure(kfilm_convert, buffer); + } + + pixel[0] = f.x; + pixel[1] = f.y; + pixel[2] = f.z; +} + +ccl_device_inline void film_get_pass_pixel_float3(const KernelFilmConvert *ccl_restrict + kfilm_convert, + ccl_global const float *ccl_restrict buffer, + float *ccl_restrict pixel) +{ + kernel_assert(kfilm_convert->num_components >= 3); + kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED); + + const float scale_exposure = film_get_scale_exposure(kfilm_convert, buffer); + + const float *in = buffer + kfilm_convert->pass_offset; + + const float3 f = make_float3(in[0], in[1], in[2]) * scale_exposure; + + pixel[0] = f.x; + pixel[1] = f.y; + pixel[2] = f.z; +} + +/* -------------------------------------------------------------------- + * Float4 passes. + */ + +ccl_device_inline void film_get_pass_pixel_motion(const KernelFilmConvert *ccl_restrict + kfilm_convert, + ccl_global const float *ccl_restrict buffer, + float *ccl_restrict pixel) +{ + kernel_assert(kfilm_convert->num_components == 4); + kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED); + kernel_assert(kfilm_convert->pass_motion_weight != PASS_UNUSED); + + const float *in = buffer + kfilm_convert->pass_offset; + const float *in_weight = buffer + kfilm_convert->pass_motion_weight; + + const float weight = in_weight[0]; + const float weight_inv = (weight > 0.0f) ? 
1.0f / weight : 0.0f; + + const float4 motion = make_float4(in[0], in[1], in[2], in[3]) * weight_inv; + + pixel[0] = motion.x; + pixel[1] = motion.y; + pixel[2] = motion.z; + pixel[3] = motion.w; +} + +ccl_device_inline void film_get_pass_pixel_cryptomatte(const KernelFilmConvert *ccl_restrict + kfilm_convert, + ccl_global const float *ccl_restrict buffer, + float *ccl_restrict pixel) +{ + kernel_assert(kfilm_convert->num_components == 4); + kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED); + + const float scale = film_get_scale(kfilm_convert, buffer); + + const float *in = buffer + kfilm_convert->pass_offset; + + const float4 f = make_float4(in[0], in[1], in[2], in[3]); + + /* x and z contain integer IDs, don't rescale them. + * y and w contain matte weights, they get scaled. */ + pixel[0] = f.x; + pixel[1] = f.y * scale; + pixel[2] = f.z; + pixel[3] = f.w * scale; +} + +ccl_device_inline void film_get_pass_pixel_float4(const KernelFilmConvert *ccl_restrict + kfilm_convert, + ccl_global const float *ccl_restrict buffer, + float *ccl_restrict pixel) +{ + kernel_assert(kfilm_convert->num_components == 4); + kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED); + + float scale, scale_exposure; + film_get_scale_and_scale_exposure(kfilm_convert, buffer, &scale, &scale_exposure); + + const float *in = buffer + kfilm_convert->pass_offset; + + const float3 color = make_float3(in[0], in[1], in[2]) * scale_exposure; + const float alpha = in[3] * scale; + + pixel[0] = color.x; + pixel[1] = color.y; + pixel[2] = color.z; + pixel[3] = alpha; +} + +ccl_device_inline void film_get_pass_pixel_combined(const KernelFilmConvert *ccl_restrict + kfilm_convert, + ccl_global const float *ccl_restrict buffer, + float *ccl_restrict pixel) +{ + kernel_assert(kfilm_convert->num_components == 4); + + /* 3rd channel contains transparency = 1 - alpha for the combined pass. 
*/ + + kernel_assert(kfilm_convert->num_components == 4); + kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED); + + float scale, scale_exposure; + if (!film_get_scale_and_scale_exposure(kfilm_convert, buffer, &scale, &scale_exposure)) { + pixel[0] = 0.0f; + pixel[1] = 0.0f; + pixel[2] = 0.0f; + pixel[3] = 0.0f; + return; } - return pass_result; + const float *in = buffer + kfilm_convert->pass_offset; + + const float3 color = make_float3(in[0], in[1], in[2]) * scale_exposure; + const float alpha = in[3] * scale; + + pixel[0] = color.x; + pixel[1] = color.y; + pixel[2] = color.z; + pixel[3] = film_transparency_to_alpha(alpha); } -ccl_device float4 film_map(KernelGlobals *kg, float4 rgba_in, float scale) +/* -------------------------------------------------------------------- + * Shadow catcher. + */ + +ccl_device_inline float3 +film_calculate_shadow_catcher_denoised(const KernelFilmConvert *ccl_restrict kfilm_convert, + ccl_global const float *ccl_restrict buffer) { - float4 result; + kernel_assert(kfilm_convert->pass_shadow_catcher != PASS_UNUSED); - /* Conversion to SRGB. */ - result.x = color_linear_to_srgb(rgba_in.x * scale); - result.y = color_linear_to_srgb(rgba_in.y * scale); - result.z = color_linear_to_srgb(rgba_in.z * scale); + float scale, scale_exposure; + film_get_scale_and_scale_exposure(kfilm_convert, buffer, &scale, &scale_exposure); - /* Clamp since alpha might be > 1.0 due to Russian roulette. 
*/ - result.w = saturate(rgba_in.w * scale); + ccl_global const float *in_catcher = buffer + kfilm_convert->pass_shadow_catcher; - return result; + const float3 pixel = make_float3(in_catcher[0], in_catcher[1], in_catcher[2]) * scale_exposure; + + return pixel; } -ccl_device uchar4 film_float_to_byte(float4 color) +ccl_device_inline float3 safe_divide_shadow_catcher(float3 a, float3 b) { - uchar4 result; + float x, y, z; - /* simple float to byte conversion */ - result.x = (uchar)(saturate(color.x) * 255.0f); - result.y = (uchar)(saturate(color.y) * 255.0f); - result.z = (uchar)(saturate(color.z) * 255.0f); - result.w = (uchar)(saturate(color.w) * 255.0f); + x = (b.x != 0.0f) ? a.x / b.x : 1.0f; + y = (b.y != 0.0f) ? a.y / b.y : 1.0f; + z = (b.z != 0.0f) ? a.z / b.z : 1.0f; - return result; + return make_float3(x, y, z); } -ccl_device void kernel_film_convert_to_byte(KernelGlobals *kg, - ccl_global uchar4 *rgba, - ccl_global float *buffer, - float sample_scale, - int x, - int y, - int offset, - int stride) +ccl_device_inline float3 +film_calculate_shadow_catcher(const KernelFilmConvert *ccl_restrict kfilm_convert, + ccl_global const float *ccl_restrict buffer) { - /* buffer offset */ - int index = offset + x + y * stride; + /* For the shadow catcher pass we divide combined pass by the shadow catcher. + * Note that denoised shadow catcher pass contains value which only needs ot be scaled (but not + * to be calculated as division). */ - bool use_display_sample_scale = (kernel_data.film.display_divide_pass_stride == -1); - float4 rgba_in = film_get_pass_result(kg, buffer, sample_scale, index, use_display_sample_scale); + if (kfilm_convert->is_denoised) { + return film_calculate_shadow_catcher_denoised(kfilm_convert, buffer); + } - /* map colors */ - float4 float_result = film_map(kg, rgba_in, use_display_sample_scale ? 
sample_scale : 1.0f); - uchar4 uchar_result = film_float_to_byte(float_result); + kernel_assert(kfilm_convert->pass_shadow_catcher_sample_count != PASS_UNUSED); - rgba += index; - *rgba = uchar_result; + /* If there is no shadow catcher object in this pixel, there is no modification of the light + * needed, so return one. */ + ccl_global const float *in_catcher_sample_count = + buffer + kfilm_convert->pass_shadow_catcher_sample_count; + const float num_samples = in_catcher_sample_count[0]; + if (num_samples == 0.0f) { + return one_float3(); + } + + kernel_assert(kfilm_convert->pass_shadow_catcher != PASS_UNUSED); + ccl_global const float *in_catcher = buffer + kfilm_convert->pass_shadow_catcher; + + /* NOTE: It is possible that the Shadow Catcher pass is requested as an output without actual + * shadow catcher objects in the scene. In this case there will be no auxillary passes required + * for the devision (to save up memory). So delay the asserts to this point so that the number of + * samples check handles such configuration. */ + kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED); + kernel_assert(kfilm_convert->pass_combined != PASS_UNUSED); + kernel_assert(kfilm_convert->pass_shadow_catcher_matte != PASS_UNUSED); + + ccl_global const float *in_combined = buffer + kfilm_convert->pass_combined; + ccl_global const float *in_matte = buffer + kfilm_convert->pass_shadow_catcher_matte; + + /* No scaling needed. The integration works in way that number of samples in the combined and + * shadow catcher passes are the same, and exposure is cancelled during the division. 
*/ + const float3 color_catcher = make_float3(in_catcher[0], in_catcher[1], in_catcher[2]); + const float3 color_combined = make_float3(in_combined[0], in_combined[1], in_combined[2]); + const float3 color_matte = make_float3(in_matte[0], in_matte[1], in_matte[2]); + + /* Need to ignore contribution of the matte object when doing division (otherwise there will be + * artifacts caused by anti-aliasing). Since combined pass is used for adaptive sampling and need + * to contain matte objects, we subtrack matte objects contribution here. This is the same as if + * the matte objects were not accumulated to the combined pass. */ + const float3 combined_no_matte = color_combined - color_matte; + + const float3 shadow_catcher = safe_divide_shadow_catcher(combined_no_matte, color_catcher); + + const float scale = film_get_scale(kfilm_convert, buffer); + const float transparency = in_combined[3] * scale; + const float alpha = film_transparency_to_alpha(transparency); + + /* Alpha-over on white using transparency of the combined pass. This allows to eliminate + * artifacts which are happenning on an edge of a shadow catcher when using transparent film. + * Note that we treat shadow catcher as straight alpha here because alpha got cancelled out + * during the division. */ + const float3 pixel = (1.0f - alpha) * one_float3() + alpha * shadow_catcher; + + return pixel; } -ccl_device void kernel_film_convert_to_half_float(KernelGlobals *kg, - ccl_global uchar4 *rgba, - ccl_global float *buffer, - float sample_scale, - int x, - int y, - int offset, - int stride) +ccl_device_inline float4 film_calculate_shadow_catcher_matte_with_shadow( + const KernelFilmConvert *ccl_restrict kfilm_convert, + ccl_global const float *ccl_restrict buffer) { - /* buffer offset */ - int index = offset + x + y * stride; + /* The approximation of the shadow is 1 - average(shadow_catcher_pass). A better approximation + * is possible. 
+ * + * The matte is alpha-overed onto the shadow (which is kind of alpha-overing shadow onto footage, + * and then alpha-overing synthetic objects on top). */ - bool use_display_sample_scale = (kernel_data.film.display_divide_pass_stride == -1); - float4 rgba_in = film_get_pass_result(kg, buffer, sample_scale, index, use_display_sample_scale); + kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED); + kernel_assert(kfilm_convert->pass_shadow_catcher != PASS_UNUSED); + kernel_assert(kfilm_convert->pass_shadow_catcher_matte != PASS_UNUSED); + + float scale, scale_exposure; + if (!film_get_scale_and_scale_exposure(kfilm_convert, buffer, &scale, &scale_exposure)) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + + ccl_global const float *in_matte = buffer + kfilm_convert->pass_shadow_catcher_matte; + + const float3 shadow_catcher = film_calculate_shadow_catcher(kfilm_convert, buffer); + const float3 color_matte = make_float3(in_matte[0], in_matte[1], in_matte[2]) * scale_exposure; + + const float transparency = in_matte[3] * scale; + const float alpha = saturate(1.0f - transparency); + + const float alpha_matte = (1.0f - alpha) * (1.0f - average(shadow_catcher)) + alpha; + + if (kfilm_convert->use_approximate_shadow_catcher_background) { + kernel_assert(kfilm_convert->pass_background != PASS_UNUSED); + + ccl_global const float *in_background = buffer + kfilm_convert->pass_background; + const float3 color_background = make_float3( + in_background[0], in_background[1], in_background[2]) * + scale_exposure; + const float3 alpha_over = color_matte + color_background * (1.0f - alpha_matte); + return make_float4(alpha_over.x, alpha_over.y, alpha_over.z, 1.0f); + } - ccl_global half *out = (ccl_global half *)rgba + index * 4; - float4_store_half(out, rgba_in, use_display_sample_scale ? 
sample_scale : 1.0f); + return make_float4(color_matte.x, color_matte.y, color_matte.z, alpha_matte); +} + +ccl_device_inline void film_get_pass_pixel_shadow_catcher( + const KernelFilmConvert *ccl_restrict kfilm_convert, + ccl_global const float *ccl_restrict buffer, + float *ccl_restrict pixel) +{ + kernel_assert(kfilm_convert->num_components >= 3); + + const float3 pixel_value = film_calculate_shadow_catcher(kfilm_convert, buffer); + + pixel[0] = pixel_value.x; + pixel[1] = pixel_value.y; + pixel[2] = pixel_value.z; +} + +ccl_device_inline void film_get_pass_pixel_shadow_catcher_matte_with_shadow( + const KernelFilmConvert *ccl_restrict kfilm_convert, + ccl_global const float *ccl_restrict buffer, + float *ccl_restrict pixel) +{ + kernel_assert(kfilm_convert->num_components == 3 || kfilm_convert->num_components == 4); + + const float4 pixel_value = film_calculate_shadow_catcher_matte_with_shadow(kfilm_convert, + buffer); + + pixel[0] = pixel_value.x; + pixel[1] = pixel_value.y; + pixel[2] = pixel_value.z; + if (kfilm_convert->num_components == 4) { + pixel[3] = pixel_value.w; + } +} + +/* -------------------------------------------------------------------- + * Compositing and overlays. 
+ */ + +ccl_device_inline void film_apply_pass_pixel_overlays_rgba( + const KernelFilmConvert *ccl_restrict kfilm_convert, + ccl_global const float *ccl_restrict buffer, + float *ccl_restrict pixel) +{ + if (kfilm_convert->show_active_pixels && + kfilm_convert->pass_adaptive_aux_buffer != PASS_UNUSED) { + if (buffer[kfilm_convert->pass_adaptive_aux_buffer + 3] == 0.0f) { + const float3 active_rgb = make_float3(1.0f, 0.0f, 0.0f); + const float3 mix_rgb = interp(make_float3(pixel[0], pixel[1], pixel[2]), active_rgb, 0.5f); + pixel[0] = mix_rgb.x; + pixel[1] = mix_rgb.y; + pixel[2] = mix_rgb.z; + } + } } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h deleted file mode 100644 index 70aed6d54ed..00000000000 --- a/intern/cycles/kernel/kernel_globals.h +++ /dev/null @@ -1,248 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* Constant Globals */ - -#ifndef __KERNEL_GLOBALS_H__ -#define __KERNEL_GLOBALS_H__ - -#include "kernel/kernel_profiling.h" - -#ifdef __KERNEL_CPU__ -# include "util/util_map.h" -# include "util/util_vector.h" -#endif - -#ifdef __KERNEL_OPENCL__ -# include "util/util_atomic.h" -#endif - -CCL_NAMESPACE_BEGIN - -/* On the CPU, we pass along the struct KernelGlobals to nearly everywhere in - * the kernel, to access constant data. These are all stored as "textures", but - * these are really just standard arrays. 
We can't use actually globals because - * multiple renders may be running inside the same process. */ - -#ifdef __KERNEL_CPU__ - -# ifdef __OSL__ -struct OSLGlobals; -struct OSLThreadData; -struct OSLShadingSystem; -# endif - -typedef unordered_map<float, float> CoverageMap; - -struct Intersection; -struct VolumeStep; - -typedef struct KernelGlobals { -# define KERNEL_TEX(type, name) texture<type> name; -# include "kernel/kernel_textures.h" - - KernelData __data; - -# ifdef __OSL__ - /* On the CPU, we also have the OSL globals here. Most data structures are shared - * with SVM, the difference is in the shaders and object/mesh attributes. */ - OSLGlobals *osl; - OSLShadingSystem *osl_ss; - OSLThreadData *osl_tdata; -# endif - - /* **** Run-time data **** */ - - /* Heap-allocated storage for transparent shadows intersections. */ - Intersection *transparent_shadow_intersections; - - /* Storage for decoupled volume steps. */ - VolumeStep *decoupled_volume_steps[2]; - int decoupled_volume_steps_index; - - /* A buffer for storing per-pixel coverage for Cryptomatte. 
*/ - CoverageMap *coverage_object; - CoverageMap *coverage_material; - CoverageMap *coverage_asset; - - /* split kernel */ - SplitData split_data; - SplitParams split_param_data; - - int2 global_size; - int2 global_id; - - ProfilingState profiler; -} KernelGlobals; - -#endif /* __KERNEL_CPU__ */ - -#ifdef __KERNEL_OPTIX__ - -typedef struct ShaderParams { - uint4 *input; - float4 *output; - int type; - int filter; - int sx; - int offset; - int sample; -} ShaderParams; - -typedef struct KernelParams { - WorkTile tile; - KernelData data; - ShaderParams shader; -# define KERNEL_TEX(type, name) const type *name; -# include "kernel/kernel_textures.h" -} KernelParams; - -typedef struct KernelGlobals { -# ifdef __VOLUME__ - VolumeState volume_state; -# endif - Intersection hits_stack[64]; -} KernelGlobals; - -extern "C" __constant__ KernelParams __params; - -#else /* __KERNEL_OPTIX__ */ - -/* For CUDA, constant memory textures must be globals, so we can't put them - * into a struct. As a result we don't actually use this struct and use actual - * globals and simply pass along a NULL pointer everywhere, which we hope gets - * optimized out. */ - -# ifdef __KERNEL_CUDA__ - -__constant__ KernelData __data; -typedef struct KernelGlobals { - /* NOTE: Keep the size in sync with SHADOW_STACK_MAX_HITS. 
*/ - Intersection hits_stack[64]; -} KernelGlobals; - -# define KERNEL_TEX(type, name) const __constant__ __device__ type *name; -# include "kernel/kernel_textures.h" - -# endif /* __KERNEL_CUDA__ */ - -#endif /* __KERNEL_OPTIX__ */ - -/* OpenCL */ - -#ifdef __KERNEL_OPENCL__ - -# define KERNEL_TEX(type, name) typedef type name##_t; -# include "kernel/kernel_textures.h" - -typedef ccl_addr_space struct KernelGlobals { - ccl_constant KernelData *data; - ccl_global char *buffers[8]; - -# define KERNEL_TEX(type, name) TextureInfo name; -# include "kernel/kernel_textures.h" - -# ifdef __SPLIT_KERNEL__ - SplitData split_data; - SplitParams split_param_data; -# endif -} KernelGlobals; - -# define KERNEL_BUFFER_PARAMS \ - ccl_global char *buffer0, ccl_global char *buffer1, ccl_global char *buffer2, \ - ccl_global char *buffer3, ccl_global char *buffer4, ccl_global char *buffer5, \ - ccl_global char *buffer6, ccl_global char *buffer7 - -# define KERNEL_BUFFER_ARGS buffer0, buffer1, buffer2, buffer3, buffer4, buffer5, buffer6, buffer7 - -ccl_device_inline void kernel_set_buffer_pointers(KernelGlobals *kg, KERNEL_BUFFER_PARAMS) -{ -# ifdef __SPLIT_KERNEL__ - if (ccl_local_id(0) + ccl_local_id(1) == 0) -# endif - { - kg->buffers[0] = buffer0; - kg->buffers[1] = buffer1; - kg->buffers[2] = buffer2; - kg->buffers[3] = buffer3; - kg->buffers[4] = buffer4; - kg->buffers[5] = buffer5; - kg->buffers[6] = buffer6; - kg->buffers[7] = buffer7; - } - -# ifdef __SPLIT_KERNEL__ - ccl_barrier(CCL_LOCAL_MEM_FENCE); -# endif -} - -ccl_device_inline void kernel_set_buffer_info(KernelGlobals *kg) -{ -# ifdef __SPLIT_KERNEL__ - if (ccl_local_id(0) + ccl_local_id(1) == 0) -# endif - { - ccl_global TextureInfo *info = (ccl_global TextureInfo *)kg->buffers[0]; - -# define KERNEL_TEX(type, name) kg->name = *(info++); -# include "kernel/kernel_textures.h" - } - -# ifdef __SPLIT_KERNEL__ - ccl_barrier(CCL_LOCAL_MEM_FENCE); -# endif -} - -#endif /* __KERNEL_OPENCL__ */ - -/* Interpolated lookup table 
access */ - -ccl_device float lookup_table_read(KernelGlobals *kg, float x, int offset, int size) -{ - x = saturate(x) * (size - 1); - - int index = min(float_to_int(x), size - 1); - int nindex = min(index + 1, size - 1); - float t = x - index; - - float data0 = kernel_tex_fetch(__lookup_table, index + offset); - if (t == 0.0f) - return data0; - - float data1 = kernel_tex_fetch(__lookup_table, nindex + offset); - return (1.0f - t) * data0 + t * data1; -} - -ccl_device float lookup_table_read_2D( - KernelGlobals *kg, float x, float y, int offset, int xsize, int ysize) -{ - y = saturate(y) * (ysize - 1); - - int index = min(float_to_int(y), ysize - 1); - int nindex = min(index + 1, ysize - 1); - float t = y - index; - - float data0 = lookup_table_read(kg, x, offset + xsize * index, xsize); - if (t == 0.0f) - return data0; - - float data1 = lookup_table_read(kg, x, offset + xsize * nindex, xsize); - return (1.0f - t) * data0 + t * data1; -} - -CCL_NAMESPACE_END - -#endif /* __KERNEL_GLOBALS_H__ */ diff --git a/intern/cycles/kernel/kernel_id_passes.h b/intern/cycles/kernel/kernel_id_passes.h index 1ca42e933d1..ed01f494f98 100644 --- a/intern/cycles/kernel/kernel_id_passes.h +++ b/intern/cycles/kernel/kernel_id_passes.h @@ -14,8 +14,18 @@ * limitations under the License. */ +#pragma once + CCL_NAMESPACE_BEGIN +/* Element of ID pass stored in the render buffers. + * It is `float2` semantically, but it must be unaligned since the offset of ID passes in the + * render buffers might not meet expected by compiler alignment. 
*/ +typedef struct IDPassBufferElement { + float x; + float y; +} IDPassBufferElement; + ccl_device_inline void kernel_write_id_slots(ccl_global float *buffer, int num_slots, float id, @@ -27,7 +37,7 @@ ccl_device_inline void kernel_write_id_slots(ccl_global float *buffer, } for (int slot = 0; slot < num_slots; slot++) { - ccl_global float2 *id_buffer = (ccl_global float2 *)buffer; + ccl_global IDPassBufferElement *id_buffer = (ccl_global IDPassBufferElement *)buffer; #ifdef __ATOMIC_PASS_WRITE__ /* If the loop reaches an empty slot, the ID isn't in any slot yet - so add it! */ if (id_buffer[slot].x == ID_NONE) { @@ -65,7 +75,7 @@ ccl_device_inline void kernel_write_id_slots(ccl_global float *buffer, ccl_device_inline void kernel_sort_id_slots(ccl_global float *buffer, int num_slots) { - ccl_global float2 *id_buffer = (ccl_global float2 *)buffer; + ccl_global IDPassBufferElement *id_buffer = (ccl_global IDPassBufferElement *)buffer; for (int slot = 1; slot < num_slots; ++slot) { if (id_buffer[slot].x == ID_NONE) { return; @@ -73,7 +83,7 @@ ccl_device_inline void kernel_sort_id_slots(ccl_global float *buffer, int num_sl /* Since we're dealing with a tiny number of elements, insertion sort should be fine. 
*/ int i = slot; while (i > 0 && id_buffer[i].y > id_buffer[i - 1].y) { - float2 swap = id_buffer[i]; + const IDPassBufferElement swap = id_buffer[i]; id_buffer[i] = id_buffer[i - 1]; id_buffer[i - 1] = swap; --i; @@ -81,19 +91,16 @@ ccl_device_inline void kernel_sort_id_slots(ccl_global float *buffer, int num_sl } } -#ifdef __KERNEL_GPU__ /* post-sorting for Cryptomatte */ -ccl_device void kernel_cryptomatte_post( - KernelGlobals *kg, ccl_global float *buffer, uint sample, int x, int y, int offset, int stride) +ccl_device_inline void kernel_cryptomatte_post(const KernelGlobals *kg, + ccl_global float *render_buffer, + int pixel_index) { - if (sample - 1 == kernel_data.integrator.aa_samples) { - int index = offset + x + y * stride; - int pass_stride = kernel_data.film.pass_stride; - ccl_global float *cryptomatte_buffer = buffer + index * pass_stride + - kernel_data.film.pass_cryptomatte; - kernel_sort_id_slots(cryptomatte_buffer, 2 * kernel_data.film.cryptomatte_depth); - } + const int pass_stride = kernel_data.film.pass_stride; + const uint64_t render_buffer_offset = (uint64_t)pixel_index * pass_stride; + ccl_global float *cryptomatte_buffer = render_buffer + render_buffer_offset + + kernel_data.film.pass_cryptomatte; + kernel_sort_id_slots(cryptomatte_buffer, 2 * kernel_data.film.cryptomatte_depth); } -#endif CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_jitter.h b/intern/cycles/kernel/kernel_jitter.h index f4e60a807f7..354e8115538 100644 --- a/intern/cycles/kernel/kernel_jitter.h +++ b/intern/cycles/kernel/kernel_jitter.h @@ -14,93 +14,27 @@ * limitations under the License. */ -/* TODO(sergey): Consider moving portable ctz/clz stuff to util. */ - +#pragma once CCL_NAMESPACE_BEGIN -/* "Correlated Multi-Jittered Sampling" - * Andrew Kensler, Pixar Technical Memo 13-01, 2013 */ - -/* TODO: find good value, suggested 64 gives pattern on cornell box ceiling. 
*/ -#define CMJ_RANDOM_OFFSET_LIMIT 4096 - -ccl_device_inline bool cmj_is_pow2(int i) +ccl_device_inline uint32_t laine_karras_permutation(uint32_t x, uint32_t seed) { - return (i > 1) && ((i & (i - 1)) == 0); -} + x += seed; + x ^= (x * 0x6c50b47cu); + x ^= x * 0xb82f1e52u; + x ^= x * 0xc7afe638u; + x ^= x * 0x8d22f6e6u; -ccl_device_inline int cmj_fast_mod_pow2(int a, int b) -{ - return (a & (b - 1)); + return x; } -/* b must be > 1 */ -ccl_device_inline int cmj_fast_div_pow2(int a, int b) +ccl_device_inline uint32_t nested_uniform_scramble(uint32_t x, uint32_t seed) { - kernel_assert(b > 1); - return a >> count_trailing_zeros(b); -} + x = reverse_integer_bits(x); + x = laine_karras_permutation(x, seed); + x = reverse_integer_bits(x); -ccl_device_inline uint cmj_w_mask(uint w) -{ - kernel_assert(w > 1); - return ((1 << (32 - count_leading_zeros(w))) - 1); -} - -ccl_device_inline uint cmj_permute(uint i, uint l, uint p) -{ - uint w = l - 1; - - if ((l & w) == 0) { - /* l is a power of two (fast) */ - i ^= p; - i *= 0xe170893d; - i ^= p >> 16; - i ^= (i & w) >> 4; - i ^= p >> 8; - i *= 0x0929eb3f; - i ^= p >> 23; - i ^= (i & w) >> 1; - i *= 1 | p >> 27; - i *= 0x6935fa69; - i ^= (i & w) >> 11; - i *= 0x74dcb303; - i ^= (i & w) >> 2; - i *= 0x9e501cc3; - i ^= (i & w) >> 2; - i *= 0xc860a3df; - i &= w; - i ^= i >> 5; - - return (i + p) & w; - } - else { - /* l is not a power of two (slow) */ - w = cmj_w_mask(w); - - do { - i ^= p; - i *= 0xe170893d; - i ^= p >> 16; - i ^= (i & w) >> 4; - i ^= p >> 8; - i *= 0x0929eb3f; - i ^= p >> 23; - i ^= (i & w) >> 1; - i *= 1 | p >> 27; - i *= 0x6935fa69; - i ^= (i & w) >> 11; - i *= 0x74dcb303; - i ^= (i & w) >> 2; - i *= 0x9e501cc3; - i ^= (i & w) >> 2; - i *= 0xc860a3df; - i &= w; - i ^= i >> 5; - } while (i >= l); - - return (i + p) % l; - } + return x; } ccl_device_inline uint cmj_hash(uint i, uint p) @@ -133,99 +67,101 @@ ccl_device_inline float cmj_randfloat(uint i, uint p) return cmj_hash(i, p) * (1.0f / 4294967808.0f); } 
-#ifdef __CMJ__ -ccl_device float cmj_sample_1D(int s, int N, int p) +ccl_device_inline float cmj_randfloat_simple(uint i, uint p) { - kernel_assert(s < N); - - uint x = cmj_permute(s, N, p * 0x68bc21eb); - float jx = cmj_randfloat(s, p * 0x967a889b); - - float invN = 1.0f / N; - return (x + jx) * invN; + return cmj_hash_simple(i, p) * (1.0f / (float)0xFFFFFFFF); } -/* TODO(sergey): Do some extra tests and consider moving to util_math.h. */ -ccl_device_inline int cmj_isqrt(int value) +ccl_device float pmj_sample_1D(const KernelGlobals *kg, uint sample, uint rng_hash, uint dimension) { -# if defined(__KERNEL_CUDA__) - return float_to_int(__fsqrt_ru(value)); -# elif defined(__KERNEL_GPU__) - return float_to_int(sqrtf(value)); -# else - /* This is a work around for fast-math on CPU which might replace sqrtf() - * with am approximated version. - */ - return float_to_int(sqrtf(value) + 1e-6f); -# endif -} + /* The PMJ sample sets contain a sample with (x,y) with NUM_PMJ_SAMPLES so for 1D + * the x part is used as the sample (TODO(@leesonw): Add using both x and y parts + * independently). */ + + /* Perform Owen shuffle of the sample number to reorder the samples. */ +#ifdef _SIMPLE_HASH_ + const uint rv = cmj_hash_simple(dimension, rng_hash); +#else /* Use a _REGULAR_HASH_. */ + const uint rv = cmj_hash(dimension, rng_hash); +#endif +#ifdef _XOR_SHUFFLE_ +# warning "Using XOR shuffle." + const uint s = sample ^ rv; +#else /* Use _OWEN_SHUFFLE_ for reordering. */ + const uint s = nested_uniform_scramble(sample, rv); +#endif -ccl_device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy) -{ - kernel_assert(s < N); + /* Based on the sample number a sample pattern is selected and offset by the dimension. 
*/ + const uint sample_set = s / NUM_PMJ_SAMPLES; + const uint d = (dimension + sample_set); + const uint dim = d % NUM_PMJ_PATTERNS; + int index = 2 * (dim * NUM_PMJ_SAMPLES + (s % NUM_PMJ_SAMPLES)); + + float fx = kernel_tex_fetch(__sample_pattern_lut, index); - int m = cmj_isqrt(N); - int n = (N - 1) / m + 1; - float invN = 1.0f / N; - float invm = 1.0f / m; - float invn = 1.0f / n; +#ifndef _NO_CRANLEY_PATTERSON_ROTATION_ + /* Use Cranley-Patterson rotation to displace the sample pattern. */ +# ifdef _SIMPLE_HASH_ + float dx = cmj_randfloat_simple(d, rng_hash); +# else + /* Only jitter within the grid interval. */ + float dx = cmj_randfloat(d, rng_hash); +# endif + fx = fx + dx * (1.0f / NUM_PMJ_SAMPLES); + fx = fx - floorf(fx); - s = cmj_permute(s, N, p * 0x51633e2d); +#else +# warning "Not using Cranley-Patterson Rotation." +#endif - int sdivm, smodm; + return fx; +} - if (cmj_is_pow2(m)) { - sdivm = cmj_fast_div_pow2(s, m); - smodm = cmj_fast_mod_pow2(s, m); - } - else { - /* Doing `s * inmv` gives precision issues here. */ - sdivm = s / m; - smodm = s - sdivm * m; - } +ccl_device void pmj_sample_2D( + const KernelGlobals *kg, uint sample, uint rng_hash, uint dimension, float *x, float *y) +{ + /* Perform a shuffle on the sample number to reorder the samples. */ +#ifdef _SIMPLE_HASH_ + const uint rv = cmj_hash_simple(dimension, rng_hash); +#else /* Use a _REGULAR_HASH_. */ + const uint rv = cmj_hash(dimension, rng_hash); +#endif +#ifdef _XOR_SHUFFLE_ +# warning "Using XOR shuffle." + const uint s = sample ^ rv; +#else /* Use _OWEN_SHUFFLE_ for reordering. */ + const uint s = nested_uniform_scramble(sample, rv); +#endif - uint sx = cmj_permute(smodm, m, p * 0x68bc21eb); - uint sy = cmj_permute(sdivm, n, p * 0x02e5be93); + /* Based on the sample number a sample pattern is selected and offset by the dimension. 
*/ + const uint sample_set = s / NUM_PMJ_SAMPLES; + const uint d = (dimension + sample_set); + const uint dim = d % NUM_PMJ_PATTERNS; + int index = 2 * (dim * NUM_PMJ_SAMPLES + (s % NUM_PMJ_SAMPLES)); - float jx = cmj_randfloat(s, p * 0x967a889b); - float jy = cmj_randfloat(s, p * 0x368cc8b7); + float fx = kernel_tex_fetch(__sample_pattern_lut, index); + float fy = kernel_tex_fetch(__sample_pattern_lut, index + 1); - *fx = (sx + (sy + jx) * invn) * invm; - *fy = (s + jy) * invN; -} +#ifndef _NO_CRANLEY_PATTERSON_ROTATION_ + /* Use Cranley-Patterson rotation to displace the sample pattern. */ +# ifdef _SIMPLE_HASH_ + float dx = cmj_randfloat_simple(d, rng_hash); + float dy = cmj_randfloat_simple(d + 1, rng_hash); +# else + float dx = cmj_randfloat(d, rng_hash); + float dy = cmj_randfloat(d + 1, rng_hash); +# endif + /* Only jitter within the grid cells. */ + fx = fx + dx * (1.0f / NUM_PMJ_DIVISIONS); + fy = fy + dy * (1.0f / NUM_PMJ_DIVISIONS); + fx = fx - floorf(fx); + fy = fy - floorf(fy); +#else +# warning "Not using Cranley Patterson Rotation." 
#endif -ccl_device float pmj_sample_1D(KernelGlobals *kg, int sample, int rng_hash, int dimension) -{ - /* Fallback to random */ - if (sample >= NUM_PMJ_SAMPLES) { - const int p = rng_hash + dimension; - return cmj_randfloat(sample, p); - } - else { - const uint mask = cmj_hash_simple(dimension, rng_hash) & 0x007fffff; - const int index = ((dimension % NUM_PMJ_PATTERNS) * NUM_PMJ_SAMPLES + sample) * 2; - return __uint_as_float(kernel_tex_fetch(__sample_pattern_lut, index) ^ mask) - 1.0f; - } -} - -ccl_device float2 pmj_sample_2D(KernelGlobals *kg, int sample, int rng_hash, int dimension) -{ - if (sample >= NUM_PMJ_SAMPLES) { - const int p = rng_hash + dimension; - const float fx = cmj_randfloat(sample, p); - const float fy = cmj_randfloat(sample, p + 1); - return make_float2(fx, fy); - } - else { - const int index = ((dimension % NUM_PMJ_PATTERNS) * NUM_PMJ_SAMPLES + sample) * 2; - const uint maskx = cmj_hash_simple(dimension, rng_hash) & 0x007fffff; - const uint masky = cmj_hash_simple(dimension + 1, rng_hash) & 0x007fffff; - const float fx = __uint_as_float(kernel_tex_fetch(__sample_pattern_lut, index) ^ maskx) - 1.0f; - const float fy = __uint_as_float(kernel_tex_fetch(__sample_pattern_lut, index + 1) ^ masky) - - 1.0f; - return make_float2(fx, fy); - } + (*x) = fx; + (*y) = fy; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h index 42a834d2ce3..52f641634b9 100644 --- a/intern/cycles/kernel/kernel_light.h +++ b/intern/cycles/kernel/kernel_light.h @@ -14,7 +14,14 @@ * limitations under the License. 
*/ +#pragma once + +#include "geom/geom.h" + #include "kernel_light_background.h" +#include "kernel_montecarlo.h" +#include "kernel_projection.h" +#include "kernel_types.h" CCL_NAMESPACE_BEGIN @@ -37,10 +44,22 @@ typedef struct LightSample { /* Regular Light */ -ccl_device_inline bool lamp_light_sample( - KernelGlobals *kg, int lamp, float randu, float randv, float3 P, LightSample *ls) +template<bool in_volume_segment> +ccl_device_inline bool light_sample(const KernelGlobals *kg, + const int lamp, + const float randu, + const float randv, + const float3 P, + const int path_flag, + LightSample *ls) { const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, lamp); + if (path_flag & PATH_RAY_SHADOW_CATCHER_PASS) { + if (klight->shader_id & SHADER_EXCLUDE_SHADOW_CATCHER) { + return false; + } + } + LightType type = (LightType)klight->type; ls->type = type; ls->shader = klight->shader_id; @@ -50,6 +69,18 @@ ccl_device_inline bool lamp_light_sample( ls->u = randu; ls->v = randv; + if (in_volume_segment && (type == LIGHT_DISTANT || type == LIGHT_BACKGROUND)) { + /* Distant lights in a volume get a dummy sample, position will not actually + * be used in that case. Only when sampling from a specific scatter position + * do we actually need to evaluate these. 
*/ + ls->P = zero_float3(); + ls->Ng = zero_float3(); + ls->D = zero_float3(); + ls->pdf = true; + ls->t = FLT_MAX; + return true; + } + if (type == LIGHT_DISTANT) { /* distant light */ float3 lightD = make_float3(klight->co[0], klight->co[1], klight->co[2]); @@ -123,13 +154,15 @@ ccl_device_inline bool lamp_light_sample( float invarea = fabsf(klight->area.invarea); bool is_round = (klight->area.invarea < 0.0f); - if (dot(ls->P - P, Ng) > 0.0f) { - return false; + if (!in_volume_segment) { + if (dot(ls->P - P, Ng) > 0.0f) { + return false; + } } float3 inplane; - if (is_round) { + if (is_round || in_volume_segment) { inplane = ellipse_sample(axisu * 0.5f, axisv * 0.5f, randu, randv); ls->P += inplane; ls->pdf = invarea; @@ -176,79 +209,180 @@ ccl_device_inline bool lamp_light_sample( return (ls->pdf > 0.0f); } -ccl_device bool lamp_light_eval( - KernelGlobals *kg, int lamp, float3 P, float3 D, float t, LightSample *ls) +ccl_device bool lights_intersect(const KernelGlobals *ccl_restrict kg, + const Ray *ccl_restrict ray, + Intersection *ccl_restrict isect, + const int last_prim, + const int last_object, + const int last_type, + const int path_flag) { - const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, lamp); - LightType type = (LightType)klight->type; - ls->type = type; - ls->shader = klight->shader_id; - ls->object = PRIM_NONE; - ls->prim = PRIM_NONE; - ls->lamp = lamp; - /* todo: missing texture coordinates */ - ls->u = 0.0f; - ls->v = 0.0f; + for (int lamp = 0; lamp < kernel_data.integrator.num_all_lights; lamp++) { + const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, lamp); - if (!(ls->shader & SHADER_USE_MIS)) - return false; + if (path_flag & PATH_RAY_CAMERA) { + if (klight->shader_id & SHADER_EXCLUDE_CAMERA) { + continue; + } + } + else { + if (!(klight->shader_id & SHADER_USE_MIS)) { + continue; + } + } - if (type == LIGHT_DISTANT) { - /* distant light */ - float radius = klight->distant.radius; + if (path_flag & 
PATH_RAY_SHADOW_CATCHER_PASS) { + if (klight->shader_id & SHADER_EXCLUDE_SHADOW_CATCHER) { + continue; + } + } - if (radius == 0.0f) - return false; - if (t != FLT_MAX) - return false; + LightType type = (LightType)klight->type; + float t = 0.0f, u = 0.0f, v = 0.0f; - /* a distant light is infinitely far away, but equivalent to a disk - * shaped light exactly 1 unit away from the current shading point. - * - * radius t^2/cos(theta) - * <----------> t = sqrt(1^2 + tan(theta)^2) - * tan(th) area = radius*radius*pi - * <-----> - * \ | (1 + tan(theta)^2)/cos(theta) - * \ | (1 + tan(acos(cos(theta)))^2)/cos(theta) - * t \th| 1 simplifies to - * \-| 1/(cos(theta)^3) - * \| magic! - * P - */ + if (type == LIGHT_POINT || type == LIGHT_SPOT) { + /* Sphere light. */ + const float3 lightP = make_float3(klight->co[0], klight->co[1], klight->co[2]); + const float radius = klight->spot.radius; + if (radius == 0.0f) { + continue; + } - float3 lightD = make_float3(klight->co[0], klight->co[1], klight->co[2]); - float costheta = dot(-lightD, D); - float cosangle = klight->distant.cosangle; + float3 P; + if (!ray_aligned_disk_intersect(ray->P, ray->D, ray->t, lightP, radius, &P, &t)) { + continue; + } + } + else if (type == LIGHT_AREA) { + /* Area light. */ + const float invarea = fabsf(klight->area.invarea); + const bool is_round = (klight->area.invarea < 0.0f); + if (invarea == 0.0f) { + continue; + } - if (costheta < cosangle) - return false; + const float3 axisu = make_float3( + klight->area.axisu[0], klight->area.axisu[1], klight->area.axisu[2]); + const float3 axisv = make_float3( + klight->area.axisv[0], klight->area.axisv[1], klight->area.axisv[2]); + const float3 Ng = make_float3(klight->area.dir[0], klight->area.dir[1], klight->area.dir[2]); - ls->P = -D; - ls->Ng = -D; - ls->D = D; - ls->t = FLT_MAX; + /* One sided. 
*/ + if (dot(ray->D, Ng) >= 0.0f) { + continue; + } - /* compute pdf */ - float invarea = klight->distant.invarea; - ls->pdf = invarea / (costheta * costheta * costheta); - ls->eval_fac = ls->pdf; + const float3 light_P = make_float3(klight->co[0], klight->co[1], klight->co[2]); + + float3 P; + if (!ray_quad_intersect( + ray->P, ray->D, 0.0f, ray->t, light_P, axisu, axisv, Ng, &P, &t, &u, &v, is_round)) { + continue; + } + } + else { + continue; + } + + if (t < isect->t && + !(last_prim == lamp && last_object == OBJECT_NONE && last_type == PRIMITIVE_LAMP)) { + isect->t = t; + isect->u = u; + isect->v = v; + isect->type = PRIMITIVE_LAMP; + isect->prim = lamp; + isect->object = OBJECT_NONE; + } + } + + return isect->prim != PRIM_NONE; +} + +ccl_device bool light_sample_from_distant_ray(const KernelGlobals *ccl_restrict kg, + const float3 ray_D, + const int lamp, + LightSample *ccl_restrict ls) +{ + const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, lamp); + const int shader = klight->shader_id; + const float radius = klight->distant.radius; + const LightType type = (LightType)klight->type; + + if (type != LIGHT_DISTANT) { + return false; + } + if (!(shader & SHADER_USE_MIS)) { + return false; + } + if (radius == 0.0f) { + return false; } - else if (type == LIGHT_POINT || type == LIGHT_SPOT) { - float3 lightP = make_float3(klight->co[0], klight->co[1], klight->co[2]); - float radius = klight->spot.radius; + /* a distant light is infinitely far away, but equivalent to a disk + * shaped light exactly 1 unit away from the current shading point. + * + * radius t^2/cos(theta) + * <----------> t = sqrt(1^2 + tan(theta)^2) + * tan(th) area = radius*radius*pi + * <-----> + * \ | (1 + tan(theta)^2)/cos(theta) + * \ | (1 + tan(acos(cos(theta)))^2)/cos(theta) + * t \th| 1 simplifies to + * \-| 1/(cos(theta)^3) + * \| magic! 
+ * P + */ + + float3 lightD = make_float3(klight->co[0], klight->co[1], klight->co[2]); + float costheta = dot(-lightD, ray_D); + float cosangle = klight->distant.cosangle; + + if (costheta < cosangle) + return false; - /* sphere light */ - if (radius == 0.0f) - return false; + ls->type = type; + ls->shader = klight->shader_id; + ls->object = PRIM_NONE; + ls->prim = PRIM_NONE; + ls->lamp = lamp; + /* todo: missing texture coordinates */ + ls->u = 0.0f; + ls->v = 0.0f; + ls->t = FLT_MAX; + ls->P = -ray_D; + ls->Ng = -ray_D; + ls->D = ray_D; + + /* compute pdf */ + float invarea = klight->distant.invarea; + ls->pdf = invarea / (costheta * costheta * costheta); + ls->pdf *= kernel_data.integrator.pdf_lights; + ls->eval_fac = ls->pdf; - if (!ray_aligned_disk_intersect(P, D, t, lightP, radius, &ls->P, &ls->t)) { - return false; - } + return true; +} - ls->Ng = -D; - ls->D = D; +ccl_device bool light_sample_from_intersection(const KernelGlobals *ccl_restrict kg, + const Intersection *ccl_restrict isect, + const float3 ray_P, + const float3 ray_D, + LightSample *ccl_restrict ls) +{ + const int lamp = isect->prim; + const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, lamp); + LightType type = (LightType)klight->type; + ls->type = type; + ls->shader = klight->shader_id; + ls->object = PRIM_NONE; + ls->prim = PRIM_NONE; + ls->lamp = lamp; + /* todo: missing texture coordinates */ + ls->t = isect->t; + ls->P = ray_P + ray_D * ls->t; + ls->D = ray_D; + + if (type == LIGHT_POINT || type == LIGHT_SPOT) { + ls->Ng = -ray_D; float invarea = klight->spot.invarea; ls->eval_fac = (0.25f * M_1_PI_F) * invarea; @@ -260,8 +394,9 @@ ccl_device bool lamp_light_eval( ls->eval_fac *= spot_light_attenuation( dir, klight->spot.spot_angle, klight->spot.spot_smooth, ls->Ng); - if (ls->eval_fac == 0.0f) + if (ls->eval_fac == 0.0f) { return false; + } } float2 uv = map_to_sphere(ls->Ng); ls->u = uv.x; @@ -274,31 +409,22 @@ ccl_device bool lamp_light_eval( else if (type == 
LIGHT_AREA) { /* area light */ float invarea = fabsf(klight->area.invarea); - bool is_round = (klight->area.invarea < 0.0f); - if (invarea == 0.0f) - return false; float3 axisu = make_float3( klight->area.axisu[0], klight->area.axisu[1], klight->area.axisu[2]); float3 axisv = make_float3( klight->area.axisv[0], klight->area.axisv[1], klight->area.axisv[2]); float3 Ng = make_float3(klight->area.dir[0], klight->area.dir[1], klight->area.dir[2]); - - /* one sided */ - if (dot(D, Ng) >= 0.0f) - return false; - float3 light_P = make_float3(klight->co[0], klight->co[1], klight->co[2]); - if (!ray_quad_intersect( - P, D, 0.0f, t, light_P, axisu, axisv, Ng, &ls->P, &ls->t, &ls->u, &ls->v, is_round)) { - return false; - } - - ls->D = D; + ls->u = isect->u; + ls->v = isect->v; + ls->D = ray_D; ls->Ng = Ng; + + const bool is_round = (klight->area.invarea < 0.0f); if (is_round) { - ls->pdf = invarea * lamp_light_pdf(kg, Ng, -D, ls->t); + ls->pdf = invarea * lamp_light_pdf(kg, Ng, -ray_D, ls->t); } else { float3 sample_axisu = axisu; @@ -306,12 +432,12 @@ ccl_device bool lamp_light_eval( if (klight->area.tan_spread > 0.0f) { if (!light_spread_clamp_area_light( - P, Ng, &light_P, &sample_axisu, &sample_axisv, klight->area.tan_spread)) { + ray_P, Ng, &light_P, &sample_axisu, &sample_axisv, klight->area.tan_spread)) { return false; } } - ls->pdf = rect_light_sample(P, &light_P, sample_axisu, sample_axisv, 0, 0, false); + ls->pdf = rect_light_sample(ray_P, &light_P, sample_axisu, sample_axisv, 0, 0, false); } ls->eval_fac = 0.25f * invarea; @@ -325,6 +451,7 @@ ccl_device bool lamp_light_eval( } } else { + kernel_assert(!"Invalid lamp type in light_sample_from_intersection"); return false; } @@ -337,7 +464,7 @@ ccl_device bool lamp_light_eval( /* returns true if the triangle is has motion blur or an instancing transform applied */ ccl_device_inline bool triangle_world_space_vertices( - KernelGlobals *kg, int object, int prim, float time, float3 V[3]) + const KernelGlobals *kg, int 
object, int prim, float time, float3 V[3]) { bool has_motion = false; const int object_flag = kernel_tex_fetch(__object_flag, object); @@ -365,7 +492,7 @@ ccl_device_inline bool triangle_world_space_vertices( return has_motion; } -ccl_device_inline float triangle_light_pdf_area(KernelGlobals *kg, +ccl_device_inline float triangle_light_pdf_area(const KernelGlobals *kg, const float3 Ng, const float3 I, float t) @@ -379,7 +506,9 @@ ccl_device_inline float triangle_light_pdf_area(KernelGlobals *kg, return t * t * pdf / cos_pi; } -ccl_device_forceinline float triangle_light_pdf(KernelGlobals *kg, ShaderData *sd, float t) +ccl_device_forceinline float triangle_light_pdf(const KernelGlobals *kg, + const ShaderData *sd, + float t) { /* A naive heuristic to decide between costly solid angle sampling * and simple area sampling, comparing the distance to the triangle plane @@ -448,7 +577,8 @@ ccl_device_forceinline float triangle_light_pdf(KernelGlobals *kg, ShaderData *s } } -ccl_device_forceinline void triangle_light_sample(KernelGlobals *kg, +template<bool in_volume_segment> +ccl_device_forceinline void triangle_light_sample(const KernelGlobals *kg, int prim, int object, float randu, @@ -488,7 +618,7 @@ ccl_device_forceinline void triangle_light_sample(KernelGlobals *kg, float distance_to_plane = fabsf(dot(N0, V[0] - P) / dot(N0, N0)); - if (longest_edge_squared > distance_to_plane * distance_to_plane) { + if (!in_volume_segment && (longest_edge_squared > distance_to_plane * distance_to_plane)) { /* see James Arvo, "Stratified Sampling of Spherical Triangles" * http://www.graphics.cornell.edu/pubs/1995/Arv95c.pdf */ @@ -617,7 +747,7 @@ ccl_device_forceinline void triangle_light_sample(KernelGlobals *kg, /* Light Distribution */ -ccl_device int light_distribution_sample(KernelGlobals *kg, float *randu) +ccl_device int light_distribution_sample(const KernelGlobals *kg, float *randu) { /* This is basically std::upper_bound as used by PBRT, to find a point light or * triangle 
to emit from, proportional to area. a good improvement would be to @@ -655,51 +785,93 @@ ccl_device int light_distribution_sample(KernelGlobals *kg, float *randu) /* Generic Light */ -ccl_device_inline bool light_select_reached_max_bounces(KernelGlobals *kg, int index, int bounce) +ccl_device_inline bool light_select_reached_max_bounces(const KernelGlobals *kg, + int index, + int bounce) { return (bounce > kernel_tex_fetch(__lights, index).max_bounces); } -ccl_device_noinline bool light_sample(KernelGlobals *kg, - int lamp, - float randu, - float randv, - float time, - float3 P, - int bounce, - LightSample *ls) +template<bool in_volume_segment> +ccl_device_noinline bool light_distribution_sample(const KernelGlobals *kg, + float randu, + const float randv, + const float time, + const float3 P, + const int bounce, + const int path_flag, + LightSample *ls) { - if (lamp < 0) { - /* sample index */ - int index = light_distribution_sample(kg, &randu); - - /* fetch light data */ - const ccl_global KernelLightDistribution *kdistribution = &kernel_tex_fetch( - __light_distribution, index); - int prim = kdistribution->prim; - - if (prim >= 0) { - int object = kdistribution->mesh_light.object_id; - int shader_flag = kdistribution->mesh_light.shader_flag; - - triangle_light_sample(kg, prim, object, randu, randv, time, ls, P); - ls->shader |= shader_flag; - return (ls->pdf > 0.0f); + /* Sample light index from distribution. */ + const int index = light_distribution_sample(kg, &randu); + const ccl_global KernelLightDistribution *kdistribution = &kernel_tex_fetch(__light_distribution, + index); + const int prim = kdistribution->prim; + + if (prim >= 0) { + /* Mesh light. */ + const int object = kdistribution->mesh_light.object_id; + + /* Exclude synthetic meshes from shadow catcher pass. 
*/ + if ((path_flag & PATH_RAY_SHADOW_CATCHER_PASS) && + !(kernel_tex_fetch(__object_flag, object) & SD_OBJECT_SHADOW_CATCHER)) { + return false; } - lamp = -prim - 1; + const int shader_flag = kdistribution->mesh_light.shader_flag; + triangle_light_sample<in_volume_segment>(kg, prim, object, randu, randv, time, ls, P); + ls->shader |= shader_flag; + return (ls->pdf > 0.0f); } + const int lamp = -prim - 1; + if (UNLIKELY(light_select_reached_max_bounces(kg, lamp, bounce))) { return false; } - return lamp_light_sample(kg, lamp, randu, randv, P, ls); + return light_sample<in_volume_segment>(kg, lamp, randu, randv, P, path_flag, ls); +} + +ccl_device_inline bool light_distribution_sample_from_volume_segment(const KernelGlobals *kg, + float randu, + const float randv, + const float time, + const float3 P, + const int bounce, + const int path_flag, + LightSample *ls) +{ + return light_distribution_sample<true>(kg, randu, randv, time, P, bounce, path_flag, ls); +} + +ccl_device_inline bool light_distribution_sample_from_position(const KernelGlobals *kg, + float randu, + const float randv, + const float time, + const float3 P, + const int bounce, + const int path_flag, + LightSample *ls) +{ + return light_distribution_sample<false>(kg, randu, randv, time, P, bounce, path_flag, ls); } -ccl_device_inline int light_select_num_samples(KernelGlobals *kg, int index) +ccl_device_inline bool light_distribution_sample_new_position(const KernelGlobals *kg, + const float randu, + const float randv, + const float time, + const float3 P, + LightSample *ls) { - return kernel_tex_fetch(__lights, index).samples; + /* Sample a new position on the same light, for volume sampling. 
*/ + if (ls->type == LIGHT_TRIANGLE) { + triangle_light_sample<false>(kg, ls->prim, ls->object, randu, randv, time, ls, P); + return (ls->pdf > 0.0f); + } + else { + return light_sample<false>(kg, ls->lamp, randu, randv, P, 0, ls); + } } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_light_background.h b/intern/cycles/kernel/kernel_light_background.h index f0f64ce8704..493ed560bc6 100644 --- a/intern/cycles/kernel/kernel_light_background.h +++ b/intern/cycles/kernel/kernel_light_background.h @@ -14,6 +14,8 @@ * limitations under the License. */ +#pragma once + #include "kernel_light_common.h" CCL_NAMESPACE_BEGIN @@ -22,7 +24,10 @@ CCL_NAMESPACE_BEGIN #ifdef __BACKGROUND_MIS__ -ccl_device float3 background_map_sample(KernelGlobals *kg, float randu, float randv, float *pdf) +ccl_device float3 background_map_sample(const KernelGlobals *kg, + float randu, + float randv, + float *pdf) { /* for the following, the CDF values are actually a pair of floats, with the * function value as X and the actual CDF as Y. The last entry's function @@ -104,7 +109,7 @@ ccl_device float3 background_map_sample(KernelGlobals *kg, float randu, float ra /* TODO(sergey): Same as above, after the release we should consider using * 'noinline' for all devices. 
*/ -ccl_device float background_map_pdf(KernelGlobals *kg, float3 direction) +ccl_device float background_map_pdf(const KernelGlobals *kg, float3 direction) { float2 uv = direction_to_equirectangular(direction); int res_x = kernel_data.background.map_res_x; @@ -138,7 +143,7 @@ ccl_device float background_map_pdf(KernelGlobals *kg, float3 direction) } ccl_device_inline bool background_portal_data_fetch_and_check_side( - KernelGlobals *kg, float3 P, int index, float3 *lightpos, float3 *dir) + const KernelGlobals *kg, float3 P, int index, float3 *lightpos, float3 *dir) { int portal = kernel_data.background.portal_offset + index; const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, portal); @@ -154,7 +159,7 @@ ccl_device_inline bool background_portal_data_fetch_and_check_side( } ccl_device_inline float background_portal_pdf( - KernelGlobals *kg, float3 P, float3 direction, int ignore_portal, bool *is_possible) + const KernelGlobals *kg, float3 P, float3 direction, int ignore_portal, bool *is_possible) { float portal_pdf = 0.0f; @@ -214,7 +219,7 @@ ccl_device_inline float background_portal_pdf( return (num_possible > 0) ? 
portal_pdf / num_possible : 0.0f; } -ccl_device int background_num_possible_portals(KernelGlobals *kg, float3 P) +ccl_device int background_num_possible_portals(const KernelGlobals *kg, float3 P) { int num_possible_portals = 0; for (int p = 0; p < kernel_data.background.num_portals; p++) { @@ -225,7 +230,7 @@ ccl_device int background_num_possible_portals(KernelGlobals *kg, float3 P) return num_possible_portals; } -ccl_device float3 background_portal_sample(KernelGlobals *kg, +ccl_device float3 background_portal_sample(const KernelGlobals *kg, float3 P, float randu, float randv, @@ -280,7 +285,7 @@ ccl_device float3 background_portal_sample(KernelGlobals *kg, return zero_float3(); } -ccl_device_inline float3 background_sun_sample(KernelGlobals *kg, +ccl_device_inline float3 background_sun_sample(const KernelGlobals *kg, float randu, float randv, float *pdf) @@ -292,7 +297,7 @@ ccl_device_inline float3 background_sun_sample(KernelGlobals *kg, return D; } -ccl_device_inline float background_sun_pdf(KernelGlobals *kg, float3 D) +ccl_device_inline float background_sun_pdf(const KernelGlobals *kg, float3 D) { const float3 N = float4_to_float3(kernel_data.background.sun); const float angle = kernel_data.background.sun.w; @@ -300,7 +305,7 @@ ccl_device_inline float background_sun_pdf(KernelGlobals *kg, float3 D) } ccl_device_inline float3 -background_light_sample(KernelGlobals *kg, float3 P, float randu, float randv, float *pdf) +background_light_sample(const KernelGlobals *kg, float3 P, float randu, float randv, float *pdf) { float portal_method_pdf = kernel_data.background.portal_weight; float sun_method_pdf = kernel_data.background.sun_weight; @@ -400,7 +405,7 @@ background_light_sample(KernelGlobals *kg, float3 P, float randu, float randv, f return D; } -ccl_device float background_light_pdf(KernelGlobals *kg, float3 P, float3 direction) +ccl_device float background_light_pdf(const KernelGlobals *kg, float3 P, float3 direction) { float portal_method_pdf = 
kernel_data.background.portal_weight; float sun_method_pdf = kernel_data.background.sun_weight; diff --git a/intern/cycles/kernel/kernel_light_common.h b/intern/cycles/kernel/kernel_light_common.h index 4a683d36226..765d8f5338e 100644 --- a/intern/cycles/kernel/kernel_light_common.h +++ b/intern/cycles/kernel/kernel_light_common.h @@ -14,6 +14,10 @@ * limitations under the License. */ +#pragma once + +#include "kernel_montecarlo.h" + CCL_NAMESPACE_BEGIN /* Area light sampling */ @@ -210,7 +214,7 @@ ccl_device bool light_spread_clamp_area_light(const float3 P, return true; } -ccl_device float lamp_light_pdf(KernelGlobals *kg, const float3 Ng, const float3 I, float t) +ccl_device float lamp_light_pdf(const KernelGlobals *kg, const float3 Ng, const float3 I, float t) { float cos_pi = dot(Ng, I); diff --git a/intern/cycles/kernel/kernel_lookup_table.h b/intern/cycles/kernel/kernel_lookup_table.h new file mode 100644 index 00000000000..33d9d5ae1f0 --- /dev/null +++ b/intern/cycles/kernel/kernel_lookup_table.h @@ -0,0 +1,56 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +CCL_NAMESPACE_BEGIN + +/* Interpolated lookup table access */ + +ccl_device float lookup_table_read(const KernelGlobals *kg, float x, int offset, int size) +{ + x = saturate(x) * (size - 1); + + int index = min(float_to_int(x), size - 1); + int nindex = min(index + 1, size - 1); + float t = x - index; + + float data0 = kernel_tex_fetch(__lookup_table, index + offset); + if (t == 0.0f) + return data0; + + float data1 = kernel_tex_fetch(__lookup_table, nindex + offset); + return (1.0f - t) * data0 + t * data1; +} + +ccl_device float lookup_table_read_2D( + const KernelGlobals *kg, float x, float y, int offset, int xsize, int ysize) +{ + y = saturate(y) * (ysize - 1); + + int index = min(float_to_int(y), ysize - 1); + int nindex = min(index + 1, ysize - 1); + float t = y - index; + + float data0 = lookup_table_read(kg, x, offset + xsize * index, xsize); + if (t == 0.0f) + return data0; + + float data1 = lookup_table_read(kg, x, offset + xsize * nindex, xsize); + return (1.0f - t) * data0 + t * data1; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_math.h b/intern/cycles/kernel/kernel_math.h index 96391db7649..3c5ab95bbc8 100644 --- a/intern/cycles/kernel/kernel_math.h +++ b/intern/cycles/kernel/kernel_math.h @@ -14,8 +14,7 @@ * limitations under the License. */ -#ifndef __KERNEL_MATH_H__ -#define __KERNEL_MATH_H__ +#pragma once #include "util/util_color.h" #include "util/util_math.h" @@ -24,5 +23,3 @@ #include "util/util_projection.h" #include "util/util_texture.h" #include "util/util_transform.h" - -#endif /* __KERNEL_MATH_H__ */ diff --git a/intern/cycles/kernel/kernel_montecarlo.h b/intern/cycles/kernel/kernel_montecarlo.h index ce37bd0b15e..b158f4c4fd3 100644 --- a/intern/cycles/kernel/kernel_montecarlo.h +++ b/intern/cycles/kernel/kernel_montecarlo.h @@ -30,8 +30,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __KERNEL_MONTECARLO_CL__ -#define __KERNEL_MONTECARLO_CL__ +#pragma once CCL_NAMESPACE_BEGIN @@ -300,5 +299,3 @@ ccl_device float3 ensure_valid_reflection(float3 Ng, float3 I, float3 N) } CCL_NAMESPACE_END - -#endif /* __KERNEL_MONTECARLO_CL__ */ diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h index 8f58b8c3079..67466b28170 100644 --- a/intern/cycles/kernel/kernel_passes.h +++ b/intern/cycles/kernel/kernel_passes.h @@ -14,61 +14,52 @@ * limitations under the License. */ +#pragma once + +#include "kernel/geom/geom.h" + #include "kernel/kernel_id_passes.h" +#include "kernel/kernel_write_passes.h" CCL_NAMESPACE_BEGIN -#ifdef __DENOISING_FEATURES__ - -ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg, - ccl_global float *buffer, - int sample, - float path_total, - float path_total_shaded) +/* Get pointer to pixel in render buffer. */ +ccl_device_forceinline ccl_global float *kernel_pass_pixel_render_buffer( + INTEGRATOR_STATE_CONST_ARGS, ccl_global float *ccl_restrict render_buffer) { - if (kernel_data.film.pass_denoising_data == 0) - return; - - buffer += sample_is_even(kernel_data.integrator.sampling_pattern, sample) ? 
- DENOISING_PASS_SHADOW_B : - DENOISING_PASS_SHADOW_A; - - path_total = ensure_finite(path_total); - path_total_shaded = ensure_finite(path_total_shaded); - - kernel_write_pass_float(buffer, path_total); - kernel_write_pass_float(buffer + 1, path_total_shaded); - - float value = path_total_shaded / max(path_total, 1e-7f); - kernel_write_pass_float(buffer + 2, value * value); + const uint32_t render_pixel_index = INTEGRATOR_STATE(path, render_pixel_index); + const uint64_t render_buffer_offset = (uint64_t)render_pixel_index * + kernel_data.film.pass_stride; + return render_buffer + render_buffer_offset; } -ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg, - ShaderData *sd, - ccl_addr_space PathState *state, - PathRadiance *L) +#ifdef __DENOISING_FEATURES__ + +ccl_device_forceinline void kernel_write_denoising_features_surface( + INTEGRATOR_STATE_ARGS, const ShaderData *sd, ccl_global float *ccl_restrict render_buffer) { - if (state->denoising_feature_weight == 0.0f) { + if (!(INTEGRATOR_STATE(path, flag) & PATH_RAY_DENOISING_FEATURES)) { return; } - L->denoising_depth += ensure_finite(state->denoising_feature_weight * sd->ray_length); - /* Skip implicitly transparent surfaces. */ if (sd->flag & SD_HAS_ONLY_VOLUME) { return; } + ccl_global float *buffer = kernel_pass_pixel_render_buffer(INTEGRATOR_STATE_PASS, render_buffer); + float3 normal = zero_float3(); float3 diffuse_albedo = zero_float3(); float3 specular_albedo = zero_float3(); float sum_weight = 0.0f, sum_nonspecular_weight = 0.0f; for (int i = 0; i < sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; + const ShaderClosure *sc = &sd->closure[i]; - if (!CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) + if (!CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) { continue; + } /* All closures contribute to the normal feature, but only diffuse-like ones to the albedo. 
*/ normal += sc->N * sc->sample_weight; @@ -106,140 +97,208 @@ ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg, normal /= sum_weight; } - /* Transform normal into camera space. */ - const Transform worldtocamera = kernel_data.cam.worldtocamera; - normal = transform_direction(&worldtocamera, normal); + if (kernel_data.film.pass_denoising_normal != PASS_UNUSED) { + /* Transform normal into camera space. */ + const Transform worldtocamera = kernel_data.cam.worldtocamera; + normal = transform_direction(&worldtocamera, normal); + + const float3 denoising_normal = ensure_finite3(normal); + kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_normal, denoising_normal); + } - L->denoising_normal += ensure_finite3(state->denoising_feature_weight * normal); - L->denoising_albedo += ensure_finite3(state->denoising_feature_weight * - state->denoising_feature_throughput * diffuse_albedo); + if (kernel_data.film.pass_denoising_albedo != PASS_UNUSED) { + const float3 denoising_feature_throughput = INTEGRATOR_STATE(path, + denoising_feature_throughput); + const float3 denoising_albedo = ensure_finite3(denoising_feature_throughput * + diffuse_albedo); + kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_albedo, denoising_albedo); + } - state->denoising_feature_weight = 0.0f; + INTEGRATOR_STATE_WRITE(path, flag) &= ~PATH_RAY_DENOISING_FEATURES; } else { - state->denoising_feature_throughput *= specular_albedo; + INTEGRATOR_STATE_WRITE(path, denoising_feature_throughput) *= specular_albedo; + } +} + +ccl_device_forceinline void kernel_write_denoising_features_volume(INTEGRATOR_STATE_ARGS, + const float3 albedo, + const bool scatter, + ccl_global float *ccl_restrict + render_buffer) +{ + ccl_global float *buffer = kernel_pass_pixel_render_buffer(INTEGRATOR_STATE_PASS, render_buffer); + const float3 denoising_feature_throughput = INTEGRATOR_STATE(path, denoising_feature_throughput); + + if (scatter && 
kernel_data.film.pass_denoising_normal != PASS_UNUSED) { + /* Assume scatter is sufficiently diffuse to stop writing denoising features. */ + INTEGRATOR_STATE_WRITE(path, flag) &= ~PATH_RAY_DENOISING_FEATURES; + + /* Write view direction as normal. */ + const float3 denoising_normal = make_float3(0.0f, 0.0f, -1.0f); + kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_normal, denoising_normal); + } + + if (kernel_data.film.pass_denoising_albedo != PASS_UNUSED) { + /* Write albedo. */ + const float3 denoising_albedo = ensure_finite3(denoising_feature_throughput * albedo); + kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_albedo, denoising_albedo); } } #endif /* __DENOISING_FEATURES__ */ -#ifdef __KERNEL_CPU__ -# define WRITE_ID_SLOT(buffer, depth, id, matte_weight, name) \ - kernel_write_id_pass_cpu(buffer, depth * 2, id, matte_weight, kg->coverage_##name) -ccl_device_inline size_t kernel_write_id_pass_cpu( - float *buffer, size_t depth, float id, float matte_weight, CoverageMap *map) +#ifdef __SHADOW_CATCHER__ + +/* Write shadow catcher passes on a bounce from the shadow catcher object. 
*/ +ccl_device_forceinline void kernel_write_shadow_catcher_bounce_data( + INTEGRATOR_STATE_ARGS, const ShaderData *sd, ccl_global float *ccl_restrict render_buffer) { - if (map) { - (*map)[id] += matte_weight; - return 0; + if (!kernel_data.integrator.has_shadow_catcher) { + return; + } + + kernel_assert(kernel_data.film.pass_shadow_catcher_sample_count != PASS_UNUSED); + kernel_assert(kernel_data.film.pass_shadow_catcher_matte != PASS_UNUSED); + + if (!kernel_shadow_catcher_is_path_split_bounce(INTEGRATOR_STATE_PASS, sd->object_flag)) { + return; } -#else /* __KERNEL_CPU__ */ -# define WRITE_ID_SLOT(buffer, depth, id, matte_weight, name) \ - kernel_write_id_slots_gpu(buffer, depth * 2, id, matte_weight) -ccl_device_inline size_t kernel_write_id_slots_gpu(ccl_global float *buffer, - size_t depth, - float id, - float matte_weight) + + ccl_global float *buffer = kernel_pass_pixel_render_buffer(INTEGRATOR_STATE_PASS, render_buffer); + + /* Count sample for the shadow catcher object. */ + kernel_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_sample_count, 1.0f); + + /* Since the split is done, the sample does not contribute to the matte, so accumulate it as + * transparency to the matte. 
*/ + const float3 throughput = INTEGRATOR_STATE(path, throughput); + kernel_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_matte + 3, + average(throughput)); +} + +#endif /* __SHADOW_CATCHER__ */ + +ccl_device_inline size_t kernel_write_id_pass(float *ccl_restrict buffer, + size_t depth, + float id, + float matte_weight) { -#endif /* __KERNEL_CPU__ */ - kernel_write_id_slots(buffer, depth, id, matte_weight); - return depth * 2; + kernel_write_id_slots(buffer, depth * 2, id, matte_weight); + return depth * 4; } -ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, - ccl_global float *buffer, - PathRadiance *L, - ShaderData *sd, - ccl_addr_space PathState *state, - float3 throughput) +ccl_device_inline void kernel_write_data_passes(INTEGRATOR_STATE_ARGS, + const ShaderData *sd, + ccl_global float *ccl_restrict render_buffer) { #ifdef __PASSES__ - int path_flag = state->flag; + const int path_flag = INTEGRATOR_STATE(path, flag); - if (!(path_flag & PATH_RAY_CAMERA)) + if (!(path_flag & PATH_RAY_CAMERA)) { return; + } - int flag = kernel_data.film.pass_flag; - int light_flag = kernel_data.film.light_pass_flag; + const int flag = kernel_data.film.pass_flag; - if (!((flag | light_flag) & PASS_ANY)) + if (!(flag & PASS_ANY)) { return; + } + + ccl_global float *buffer = kernel_pass_pixel_render_buffer(INTEGRATOR_STATE_PASS, render_buffer); if (!(path_flag & PATH_RAY_SINGLE_PASS_DONE)) { if (!(sd->flag & SD_TRANSPARENT) || kernel_data.film.pass_alpha_threshold == 0.0f || average(shader_bsdf_alpha(kg, sd)) >= kernel_data.film.pass_alpha_threshold) { - if (state->sample == 0) { + if (INTEGRATOR_STATE(path, sample) == 0) { if (flag & PASSMASK(DEPTH)) { - float depth = camera_z_depth(kg, sd->P); + const float depth = camera_z_depth(kg, sd->P); kernel_write_pass_float(buffer + kernel_data.film.pass_depth, depth); } if (flag & PASSMASK(OBJECT_ID)) { - float id = object_pass_id(kg, sd->object); + const float id = object_pass_id(kg, sd->object); 
kernel_write_pass_float(buffer + kernel_data.film.pass_object_id, id); } if (flag & PASSMASK(MATERIAL_ID)) { - float id = shader_pass_id(kg, sd); + const float id = shader_pass_id(kg, sd); kernel_write_pass_float(buffer + kernel_data.film.pass_material_id, id); } } + if (flag & PASSMASK(POSITION)) { + const float3 position = sd->P; + kernel_write_pass_float3(buffer + kernel_data.film.pass_position, position); + } if (flag & PASSMASK(NORMAL)) { - float3 normal = shader_bsdf_average_normal(kg, sd); + const float3 normal = shader_bsdf_average_normal(kg, sd); kernel_write_pass_float3(buffer + kernel_data.film.pass_normal, normal); } + if (flag & PASSMASK(ROUGHNESS)) { + const float roughness = shader_bsdf_average_roughness(sd); + kernel_write_pass_float(buffer + kernel_data.film.pass_roughness, roughness); + } if (flag & PASSMASK(UV)) { - float3 uv = primitive_uv(kg, sd); + const float3 uv = primitive_uv(kg, sd); kernel_write_pass_float3(buffer + kernel_data.film.pass_uv, uv); } if (flag & PASSMASK(MOTION)) { - float4 speed = primitive_motion_vector(kg, sd); + const float4 speed = primitive_motion_vector(kg, sd); kernel_write_pass_float4(buffer + kernel_data.film.pass_motion, speed); kernel_write_pass_float(buffer + kernel_data.film.pass_motion_weight, 1.0f); } - state->flag |= PATH_RAY_SINGLE_PASS_DONE; + INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_SINGLE_PASS_DONE; } } if (kernel_data.film.cryptomatte_passes) { + const float3 throughput = INTEGRATOR_STATE(path, throughput); const float matte_weight = average(throughput) * (1.0f - average(shader_bsdf_transparency(kg, sd))); if (matte_weight > 0.0f) { ccl_global float *cryptomatte_buffer = buffer + kernel_data.film.pass_cryptomatte; if (kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) { - float id = object_cryptomatte_id(kg, sd->object); - cryptomatte_buffer += WRITE_ID_SLOT( - cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight, object); + const float id = object_cryptomatte_id(kg, 
sd->object); + cryptomatte_buffer += kernel_write_id_pass( + cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight); } if (kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) { - float id = shader_cryptomatte_id(kg, sd->shader); - cryptomatte_buffer += WRITE_ID_SLOT( - cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight, material); + const float id = shader_cryptomatte_id(kg, sd->shader); + cryptomatte_buffer += kernel_write_id_pass( + cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight); } if (kernel_data.film.cryptomatte_passes & CRYPT_ASSET) { - float id = object_cryptomatte_asset_id(kg, sd->object); - cryptomatte_buffer += WRITE_ID_SLOT( - cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight, asset); + const float id = object_cryptomatte_asset_id(kg, sd->object); + cryptomatte_buffer += kernel_write_id_pass( + cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight); } } } - if (light_flag & PASSMASK_COMPONENT(DIFFUSE)) - L->color_diffuse += shader_bsdf_diffuse(kg, sd) * throughput; - if (light_flag & PASSMASK_COMPONENT(GLOSSY)) - L->color_glossy += shader_bsdf_glossy(kg, sd) * throughput; - if (light_flag & PASSMASK_COMPONENT(TRANSMISSION)) - L->color_transmission += shader_bsdf_transmission(kg, sd) * throughput; - - if (light_flag & PASSMASK(MIST)) { - /* bring depth into 0..1 range */ - float mist_start = kernel_data.film.mist_start; - float mist_inv_depth = kernel_data.film.mist_inv_depth; + if (flag & PASSMASK(DIFFUSE_COLOR)) { + const float3 throughput = INTEGRATOR_STATE(path, throughput); + kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_color, + shader_bsdf_diffuse(kg, sd) * throughput); + } + if (flag & PASSMASK(GLOSSY_COLOR)) { + const float3 throughput = INTEGRATOR_STATE(path, throughput); + kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_color, + shader_bsdf_glossy(kg, sd) * throughput); + } + if (flag & 
PASSMASK(TRANSMISSION_COLOR)) { + const float3 throughput = INTEGRATOR_STATE(path, throughput); + kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_color, + shader_bsdf_transmission(kg, sd) * throughput); + } + if (flag & PASSMASK(MIST)) { + /* Bring depth into 0..1 range. */ + const float mist_start = kernel_data.film.mist_start; + const float mist_inv_depth = kernel_data.film.mist_inv_depth; - float depth = camera_distance(kg, sd->P); + const float depth = camera_distance(kg, sd->P); float mist = saturate((depth - mist_start) * mist_inv_depth); - /* falloff */ - float mist_falloff = kernel_data.film.mist_falloff; + /* Falloff */ + const float mist_falloff = kernel_data.film.mist_falloff; if (mist_falloff == 1.0f) ; @@ -250,158 +309,17 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, else mist = powf(mist, mist_falloff); - /* modulate by transparency */ - float3 alpha = shader_bsdf_alpha(kg, sd); - L->mist += (1.0f - mist) * average(throughput * alpha); - } -#endif -} + /* Modulate by transparency */ + const float3 throughput = INTEGRATOR_STATE(path, throughput); + const float3 alpha = shader_bsdf_alpha(kg, sd); + const float mist_output = (1.0f - mist) * average(throughput * alpha); -ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg, - ccl_global float *buffer, - PathRadiance *L) -{ -#ifdef __PASSES__ - int light_flag = kernel_data.film.light_pass_flag; - - if (!kernel_data.film.use_light_pass) - return; - - if (light_flag & PASSMASK(DIFFUSE_INDIRECT)) - kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_indirect, L->indirect_diffuse); - if (light_flag & PASSMASK(GLOSSY_INDIRECT)) - kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_indirect, L->indirect_glossy); - if (light_flag & PASSMASK(TRANSMISSION_INDIRECT)) - kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_indirect, - L->indirect_transmission); - if (light_flag & PASSMASK(VOLUME_INDIRECT)) - 
kernel_write_pass_float3(buffer + kernel_data.film.pass_volume_indirect, L->indirect_volume); - if (light_flag & PASSMASK(DIFFUSE_DIRECT)) - kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_direct, L->direct_diffuse); - if (light_flag & PASSMASK(GLOSSY_DIRECT)) - kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_direct, L->direct_glossy); - if (light_flag & PASSMASK(TRANSMISSION_DIRECT)) - kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_direct, - L->direct_transmission); - if (light_flag & PASSMASK(VOLUME_DIRECT)) - kernel_write_pass_float3(buffer + kernel_data.film.pass_volume_direct, L->direct_volume); - - if (light_flag & PASSMASK(EMISSION)) - kernel_write_pass_float3(buffer + kernel_data.film.pass_emission, L->emission); - if (light_flag & PASSMASK(BACKGROUND)) - kernel_write_pass_float3(buffer + kernel_data.film.pass_background, L->background); - if (light_flag & PASSMASK(AO)) - kernel_write_pass_float3(buffer + kernel_data.film.pass_ao, L->ao); - - if (light_flag & PASSMASK(DIFFUSE_COLOR)) - kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_color, L->color_diffuse); - if (light_flag & PASSMASK(GLOSSY_COLOR)) - kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_color, L->color_glossy); - if (light_flag & PASSMASK(TRANSMISSION_COLOR)) - kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_color, - L->color_transmission); - if (light_flag & PASSMASK(SHADOW)) { - float3 shadow = L->shadow; - kernel_write_pass_float4( - buffer + kernel_data.film.pass_shadow, - make_float4(shadow.x, shadow.y, shadow.z, kernel_data.film.pass_shadow_scale)); + /* Note that the final value in the render buffer we want is 1 - mist_output, + * to avoid having to tracking this in the Integrator state we do the negation + * after rendering. 
*/ + kernel_write_pass_float(buffer + kernel_data.film.pass_mist, mist_output); } - if (light_flag & PASSMASK(MIST)) - kernel_write_pass_float(buffer + kernel_data.film.pass_mist, 1.0f - L->mist); #endif } -ccl_device_inline void kernel_write_result(KernelGlobals *kg, - ccl_global float *buffer, - int sample, - PathRadiance *L) -{ - PROFILING_INIT(kg, PROFILING_WRITE_RESULT); - PROFILING_OBJECT(PRIM_NONE); - - float alpha; - float3 L_sum = path_radiance_clamp_and_sum(kg, L, &alpha); - - if (kernel_data.film.pass_flag & PASSMASK(COMBINED)) { - kernel_write_pass_float4(buffer, make_float4(L_sum.x, L_sum.y, L_sum.z, alpha)); - } - - kernel_write_light_passes(kg, buffer, L); - -#ifdef __DENOISING_FEATURES__ - if (kernel_data.film.pass_denoising_data) { -# ifdef __SHADOW_TRICKS__ - kernel_write_denoising_shadow(kg, - buffer + kernel_data.film.pass_denoising_data, - sample, - average(L->path_total), - average(L->path_total_shaded)); -# else - kernel_write_denoising_shadow( - kg, buffer + kernel_data.film.pass_denoising_data, sample, 0.0f, 0.0f); -# endif - if (kernel_data.film.pass_denoising_clean) { - float3 noisy, clean; - path_radiance_split_denoising(kg, L, &noisy, &clean); - kernel_write_pass_float3_variance( - buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, noisy); - kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean, clean); - } - else { - kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + - DENOISING_PASS_COLOR, - ensure_finite3(L_sum)); - } - - kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + - DENOISING_PASS_NORMAL, - L->denoising_normal); - kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + - DENOISING_PASS_ALBEDO, - L->denoising_albedo); - kernel_write_pass_float_variance( - buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH, L->denoising_depth); - } -#endif /* __DENOISING_FEATURES__ */ - - /* 
Adaptive Sampling. Fill the additional buffer with the odd samples and calculate our stopping - criteria. This is the heuristic from "A hierarchical automatic stopping condition for Monte - Carlo global illumination" except that here it is applied per pixel and not in hierarchical - tiles. */ - if (kernel_data.film.pass_adaptive_aux_buffer && - kernel_data.integrator.adaptive_threshold > 0.0f) { - if (sample_is_even(kernel_data.integrator.sampling_pattern, sample)) { - kernel_write_pass_float4(buffer + kernel_data.film.pass_adaptive_aux_buffer, - make_float4(L_sum.x * 2.0f, L_sum.y * 2.0f, L_sum.z * 2.0f, 0.0f)); - } -#ifdef __KERNEL_CPU__ - if ((sample > kernel_data.integrator.adaptive_min_samples) && - kernel_data.integrator.adaptive_stop_per_sample) { - const int step = kernel_data.integrator.adaptive_step; - - if ((sample & (step - 1)) == (step - 1)) { - kernel_do_adaptive_stopping(kg, buffer, sample); - } - } -#endif - } - - /* Write the sample count as negative numbers initially to mark the samples as in progress. - * Once the tile has finished rendering, the sign gets flipped and all the pixel values - * are scaled as if they were taken at a uniform sample count. */ - if (kernel_data.film.pass_sample_count) { - /* Make sure it's a negative number. In progressive refine mode, this bit gets flipped between - * passes. 
*/ -#ifdef __ATOMIC_PASS_WRITE__ - atomic_fetch_and_or_uint32((ccl_global uint *)(buffer + kernel_data.film.pass_sample_count), - 0x80000000); -#else - if (buffer[kernel_data.film.pass_sample_count] > 0) { - buffer[kernel_data.film.pass_sample_count] *= -1.0f; - } -#endif - kernel_write_pass_float(buffer + kernel_data.film.pass_sample_count, -1.0f); - } -} - CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h deleted file mode 100644 index 92a097de9e1..00000000000 --- a/intern/cycles/kernel/kernel_path.h +++ /dev/null @@ -1,709 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifdef __OSL__ -# include "kernel/osl/osl_shader.h" -#endif - -// clang-format off -#include "kernel/kernel_random.h" -#include "kernel/kernel_projection.h" -#include "kernel/kernel_montecarlo.h" -#include "kernel/kernel_differential.h" -#include "kernel/kernel_camera.h" - -#include "kernel/geom/geom.h" -#include "kernel/bvh/bvh.h" - -#include "kernel/kernel_write_passes.h" -#include "kernel/kernel_accumulate.h" -#include "kernel/kernel_shader.h" -#include "kernel/kernel_light.h" -#include "kernel/kernel_adaptive_sampling.h" -#include "kernel/kernel_passes.h" - -#if defined(__VOLUME__) || defined(__SUBSURFACE__) -# include "kernel/kernel_volume.h" -#endif - -#ifdef __SUBSURFACE__ -# include "kernel/kernel_subsurface.h" -#endif - -#include "kernel/kernel_path_state.h" -#include "kernel/kernel_shadow.h" -#include "kernel/kernel_emission.h" -#include "kernel/kernel_path_common.h" -#include "kernel/kernel_path_surface.h" -#include "kernel/kernel_path_volume.h" -#include "kernel/kernel_path_subsurface.h" -// clang-format on - -CCL_NAMESPACE_BEGIN - -ccl_device_forceinline bool kernel_path_scene_intersect(KernelGlobals *kg, - ccl_addr_space PathState *state, - Ray *ray, - Intersection *isect, - PathRadiance *L, - const int last_object) -{ - PROFILING_INIT(kg, PROFILING_SCENE_INTERSECT); - - uint visibility = path_state_ray_visibility(kg, state); - - if (path_state_ao_bounce(kg, state)) { - ray->t = kernel_data.background.ao_distance; - if (last_object != OBJECT_NONE) { - const float object_ao_distance = kernel_tex_fetch(__objects, last_object).ao_distance; - if (object_ao_distance != 0.0f) { - ray->t = object_ao_distance; - } - } - } - - bool hit = scene_intersect(kg, ray, visibility, isect); - - return hit; -} - -ccl_device_forceinline void kernel_path_lamp_emission(KernelGlobals *kg, - ccl_addr_space PathState *state, - Ray *ray, - float3 throughput, - ccl_addr_space Intersection *isect, - ShaderData *emission_sd, - PathRadiance *L) -{ - PROFILING_INIT(kg, 
PROFILING_INDIRECT_EMISSION); - -#ifdef __LAMP_MIS__ - if (kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) { - /* ray starting from previous non-transparent bounce */ - Ray light_ray ccl_optional_struct_init; - - light_ray.P = ray->P - state->ray_t * ray->D; - state->ray_t += isect->t; - light_ray.D = ray->D; - light_ray.t = state->ray_t; - light_ray.time = ray->time; - light_ray.dD = ray->dD; - light_ray.dP = ray->dP; - - /* intersect with lamp */ - indirect_lamp_emission(kg, emission_sd, state, L, &light_ray, throughput); - } -#endif /* __LAMP_MIS__ */ -} - -ccl_device_forceinline void kernel_path_background(KernelGlobals *kg, - ccl_addr_space PathState *state, - ccl_addr_space Ray *ray, - float3 throughput, - ShaderData *sd, - ccl_global float *buffer, - PathRadiance *L) -{ - /* eval background shader if nothing hit */ - if (kernel_data.background.transparent && (state->flag & PATH_RAY_TRANSPARENT_BACKGROUND)) { - L->transparent += average(throughput); - -#ifdef __PASSES__ - if (!(kernel_data.film.light_pass_flag & PASSMASK(BACKGROUND))) -#endif /* __PASSES__ */ - return; - } - - /* When using the ao bounces approximation, adjust background - * shader intensity with ao factor. */ - if (path_state_ao_bounce(kg, state)) { - throughput *= kernel_data.background.ao_bounces_factor; - } - -#ifdef __BACKGROUND__ - /* sample background shader */ - float3 L_background = indirect_background(kg, sd, state, buffer, ray); - path_radiance_accum_background(kg, L, state, throughput, L_background); -#endif /* __BACKGROUND__ */ -} - -#ifndef __SPLIT_KERNEL__ - -# ifdef __VOLUME__ -ccl_device_forceinline VolumeIntegrateResult kernel_path_volume(KernelGlobals *kg, - ShaderData *sd, - PathState *state, - Ray *ray, - float3 *throughput, - ccl_addr_space Intersection *isect, - bool hit, - ShaderData *emission_sd, - PathRadiance *L) -{ - PROFILING_INIT(kg, PROFILING_VOLUME); - - /* Sanitize volume stack. 
*/ - if (!hit) { - kernel_volume_clean_stack(kg, state->volume_stack); - } - - if (state->volume_stack[0].shader == SHADER_NONE) { - return VOLUME_PATH_ATTENUATED; - } - - /* volume attenuation, emission, scatter */ - Ray volume_ray = *ray; - volume_ray.t = (hit) ? isect->t : FLT_MAX; - - float step_size = volume_stack_step_size(kg, state->volume_stack); - -# ifdef __VOLUME_DECOUPLED__ - int sampling_method = volume_stack_sampling_method(kg, state->volume_stack); - bool direct = (state->flag & PATH_RAY_CAMERA) != 0; - bool decoupled = kernel_volume_use_decoupled(kg, step_size, direct, sampling_method); - - if (decoupled) { - /* cache steps along volume for repeated sampling */ - VolumeSegment volume_segment; - - shader_setup_from_volume(kg, sd, &volume_ray); - kernel_volume_decoupled_record(kg, state, &volume_ray, sd, &volume_segment, step_size); - - volume_segment.sampling_method = sampling_method; - - /* emission */ - if (volume_segment.closure_flag & SD_EMISSION) - path_radiance_accum_emission(kg, L, state, *throughput, volume_segment.accum_emission); - - /* scattering */ - VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED; - - if (volume_segment.closure_flag & SD_SCATTER) { - int all = kernel_data.integrator.sample_all_lights_indirect; - - /* direct light sampling */ - kernel_branched_path_volume_connect_light( - kg, sd, emission_sd, *throughput, state, L, all, &volume_ray, &volume_segment); - - /* indirect sample. 
if we use distance sampling and take just - * one sample for direct and indirect light, we could share - * this computation, but makes code a bit complex */ - float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL); - float rscatter = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE); - - result = kernel_volume_decoupled_scatter( - kg, state, &volume_ray, sd, throughput, rphase, rscatter, &volume_segment, NULL, true); - } - - /* free cached steps */ - kernel_volume_decoupled_free(kg, &volume_segment); - - if (result == VOLUME_PATH_SCATTERED) { - if (kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray)) - return VOLUME_PATH_SCATTERED; - else - return VOLUME_PATH_MISSED; - } - else { - *throughput *= volume_segment.accum_transmittance; - } - } - else -# endif /* __VOLUME_DECOUPLED__ */ - { - /* integrate along volume segment with distance sampling */ - VolumeIntegrateResult result = kernel_volume_integrate( - kg, state, sd, &volume_ray, L, throughput, step_size); - -# ifdef __VOLUME_SCATTER__ - if (result == VOLUME_PATH_SCATTERED) { - /* direct lighting */ - kernel_path_volume_connect_light(kg, sd, emission_sd, *throughput, state, L); - - /* indirect light bounce */ - if (kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray)) - return VOLUME_PATH_SCATTERED; - else - return VOLUME_PATH_MISSED; - } -# endif /* __VOLUME_SCATTER__ */ - } - - return VOLUME_PATH_ATTENUATED; -} -# endif /* __VOLUME__ */ - -#endif /* __SPLIT_KERNEL__ */ - -ccl_device_forceinline bool kernel_path_shader_apply(KernelGlobals *kg, - ShaderData *sd, - ccl_addr_space PathState *state, - ccl_addr_space Ray *ray, - float3 throughput, - ShaderData *emission_sd, - PathRadiance *L, - ccl_global float *buffer) -{ - PROFILING_INIT(kg, PROFILING_SHADER_APPLY); - -#ifdef __SHADOW_TRICKS__ - if (sd->object_flag & SD_OBJECT_SHADOW_CATCHER) { - if (state->flag & PATH_RAY_TRANSPARENT_BACKGROUND) { - state->flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_STORE_SHADOW_INFO); 
- - float3 bg = zero_float3(); - if (!kernel_data.background.transparent) { - bg = indirect_background(kg, emission_sd, state, NULL, ray); - } - path_radiance_accum_shadowcatcher(L, throughput, bg); - } - } - else if (state->flag & PATH_RAY_SHADOW_CATCHER) { - /* Only update transparency after shadow catcher bounce. */ - L->shadow_transparency *= average(shader_bsdf_transparency(kg, sd)); - } -#endif /* __SHADOW_TRICKS__ */ - - /* holdout */ -#ifdef __HOLDOUT__ - if (((sd->flag & SD_HOLDOUT) || (sd->object_flag & SD_OBJECT_HOLDOUT_MASK)) && - (state->flag & PATH_RAY_TRANSPARENT_BACKGROUND)) { - const float3 holdout_weight = shader_holdout_apply(kg, sd); - if (kernel_data.background.transparent) { - L->transparent += average(holdout_weight * throughput); - } - if (isequal_float3(holdout_weight, one_float3())) { - return false; - } - } -#endif /* __HOLDOUT__ */ - - /* holdout mask objects do not write data passes */ - kernel_write_data_passes(kg, buffer, L, sd, state, throughput); - - /* blurring of bsdf after bounces, for rays that have a small likelihood - * of following this particular path (diffuse, rough glossy) */ - if (kernel_data.integrator.filter_glossy != FLT_MAX) { - float blur_pdf = kernel_data.integrator.filter_glossy * state->min_ray_pdf; - - if (blur_pdf < 1.0f) { - float blur_roughness = sqrtf(1.0f - blur_pdf) * 0.5f; - shader_bsdf_blur(kg, sd, blur_roughness); - } - } - -#ifdef __EMISSION__ - /* emission */ - if (sd->flag & SD_EMISSION) { - float3 emission = indirect_primitive_emission( - kg, sd, sd->ray_length, state->flag, state->ray_pdf); - path_radiance_accum_emission(kg, L, state, throughput, emission); - } -#endif /* __EMISSION__ */ - - return true; -} - -#ifdef __KERNEL_OPTIX__ -ccl_device_inline /* inline trace calls */ -#else -ccl_device_noinline -#endif - void - kernel_path_ao(KernelGlobals *kg, - ShaderData *sd, - ShaderData *emission_sd, - PathRadiance *L, - ccl_addr_space PathState *state, - float3 throughput, - float3 ao_alpha) -{ - 
PROFILING_INIT(kg, PROFILING_AO); - - /* todo: solve correlation */ - float bsdf_u, bsdf_v; - - path_state_rng_2D(kg, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); - - float ao_factor = kernel_data.background.ao_factor; - float3 ao_N; - float3 ao_bsdf = shader_bsdf_ao(kg, sd, ao_factor, &ao_N); - float3 ao_D; - float ao_pdf; - - sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); - - if (dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) { - Ray light_ray; - float3 ao_shadow; - - light_ray.P = ray_offset(sd->P, sd->Ng); - light_ray.D = ao_D; - light_ray.t = kernel_data.background.ao_distance; - light_ray.time = sd->time; - light_ray.dP = sd->dP; - light_ray.dD = differential3_zero(); - - if (!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &ao_shadow)) { - path_radiance_accum_ao(kg, L, state, throughput, ao_alpha, ao_bsdf, ao_shadow); - } - else { - path_radiance_accum_total_ao(L, state, throughput, ao_bsdf); - } - } -} - -#ifndef __SPLIT_KERNEL__ - -# if defined(__BRANCHED_PATH__) || defined(__BAKING__) - -ccl_device void kernel_path_indirect(KernelGlobals *kg, - ShaderData *sd, - ShaderData *emission_sd, - Ray *ray, - float3 throughput, - PathState *state, - PathRadiance *L, - const int last_object) -{ -# ifdef __SUBSURFACE__ - SubsurfaceIndirectRays ss_indirect; - kernel_path_subsurface_init_indirect(&ss_indirect); - - for (;;) { -# endif /* __SUBSURFACE__ */ - - /* path iteration */ - for (;;) { - /* Find intersection with objects in scene. */ - Intersection isect; - bool hit = kernel_path_scene_intersect(kg, state, ray, &isect, L, last_object); - - /* Find intersection with lamps and compute emission for MIS. */ - kernel_path_lamp_emission(kg, state, ray, throughput, &isect, sd, L); - -# ifdef __VOLUME__ - /* Volume integration. 
*/ - VolumeIntegrateResult result = kernel_path_volume( - kg, sd, state, ray, &throughput, &isect, hit, emission_sd, L); - - if (result == VOLUME_PATH_SCATTERED) { - continue; - } - else if (result == VOLUME_PATH_MISSED) { - break; - } -# endif /* __VOLUME__*/ - - /* Shade background. */ - if (!hit) { - kernel_path_background(kg, state, ray, throughput, sd, NULL, L); - break; - } - else if (path_state_ao_bounce(kg, state)) { - if (intersection_get_shader_flags(kg, &isect) & - (SD_HAS_TRANSPARENT_SHADOW | SD_HAS_EMISSION)) { - state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT; - } - else { - break; - } - } - - /* Setup shader data. */ - shader_setup_from_ray(kg, sd, &isect, ray); - - /* Skip most work for volume bounding surface. */ -# ifdef __VOLUME__ - if (!(sd->flag & SD_HAS_ONLY_VOLUME)) { -# endif - - /* Evaluate shader. */ - shader_eval_surface(kg, sd, state, NULL, state->flag); - shader_prepare_closures(sd, state); - - /* Apply shadow catcher, holdout, emission. */ - if (!kernel_path_shader_apply(kg, sd, state, ray, throughput, emission_sd, L, NULL)) { - break; - } - - /* path termination. this is a strange place to put the termination, it's - * mainly due to the mixed in MIS that we use. 
gives too many unneeded - * shader evaluations, only need emission if we are going to terminate */ - float probability = path_state_continuation_probability(kg, state, throughput); - - if (probability == 0.0f) { - break; - } - else if (probability != 1.0f) { - float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE); - - if (terminate >= probability) - break; - - throughput /= probability; - } - -# ifdef __DENOISING_FEATURES__ - kernel_update_denoising_features(kg, sd, state, L); -# endif - -# ifdef __AO__ - /* ambient occlusion */ - if (kernel_data.integrator.use_ambient_occlusion) { - kernel_path_ao(kg, sd, emission_sd, L, state, throughput, zero_float3()); - } -# endif /* __AO__ */ - -# ifdef __SUBSURFACE__ - /* bssrdf scatter to a different location on the same object, replacing - * the closures with a diffuse BSDF */ - if (sd->flag & SD_BSSRDF) { - if (kernel_path_subsurface_scatter( - kg, sd, emission_sd, L, state, ray, &throughput, &ss_indirect)) { - break; - } - } -# endif /* __SUBSURFACE__ */ - -# if defined(__EMISSION__) - int all = (kernel_data.integrator.sample_all_lights_indirect) || - (state->flag & PATH_RAY_SHADOW_CATCHER); - kernel_branched_path_surface_connect_light( - kg, sd, emission_sd, state, throughput, 1.0f, L, all); -# endif /* defined(__EMISSION__) */ - -# ifdef __VOLUME__ - } -# endif - - if (!kernel_path_surface_bounce(kg, sd, &throughput, state, &L->state, ray)) - break; - } - -# ifdef __SUBSURFACE__ - /* Trace indirect subsurface rays by restarting the loop. this uses less - * stack memory than invoking kernel_path_indirect. 
- */ - if (ss_indirect.num_rays) { - kernel_path_subsurface_setup_indirect(kg, &ss_indirect, state, ray, L, &throughput); - } - else { - break; - } - } -# endif /* __SUBSURFACE__ */ -} - -# endif /* defined(__BRANCHED_PATH__) || defined(__BAKING__) */ - -ccl_device_forceinline void kernel_path_integrate(KernelGlobals *kg, - PathState *state, - float3 throughput, - Ray *ray, - PathRadiance *L, - ccl_global float *buffer, - ShaderData *emission_sd) -{ - PROFILING_INIT(kg, PROFILING_PATH_INTEGRATE); - - /* Shader data memory used for both volumes and surfaces, saves stack space. */ - ShaderData sd; - -# ifdef __SUBSURFACE__ - SubsurfaceIndirectRays ss_indirect; - kernel_path_subsurface_init_indirect(&ss_indirect); - - for (;;) { -# endif /* __SUBSURFACE__ */ - - /* path iteration */ - for (;;) { - /* Find intersection with objects in scene. */ - Intersection isect; - bool hit = kernel_path_scene_intersect(kg, state, ray, &isect, L, sd.object); - - /* Find intersection with lamps and compute emission for MIS. */ - kernel_path_lamp_emission(kg, state, ray, throughput, &isect, &sd, L); - -# ifdef __VOLUME__ - /* Volume integration. */ - VolumeIntegrateResult result = kernel_path_volume( - kg, &sd, state, ray, &throughput, &isect, hit, emission_sd, L); - - if (result == VOLUME_PATH_SCATTERED) { - continue; - } - else if (result == VOLUME_PATH_MISSED) { - break; - } -# endif /* __VOLUME__*/ - - /* Shade background. */ - if (!hit) { - kernel_path_background(kg, state, ray, throughput, &sd, buffer, L); - break; - } - else if (path_state_ao_bounce(kg, state)) { - if (intersection_get_shader_flags(kg, &isect) & - (SD_HAS_TRANSPARENT_SHADOW | SD_HAS_EMISSION)) { - state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT; - } - else { - break; - } - } - - /* Setup shader data. */ - shader_setup_from_ray(kg, &sd, &isect, ray); - - /* Skip most work for volume bounding surface. */ -# ifdef __VOLUME__ - if (!(sd.flag & SD_HAS_ONLY_VOLUME)) { -# endif - - /* Evaluate shader. 
*/ - shader_eval_surface(kg, &sd, state, buffer, state->flag); - shader_prepare_closures(&sd, state); - - /* Apply shadow catcher, holdout, emission. */ - if (!kernel_path_shader_apply(kg, &sd, state, ray, throughput, emission_sd, L, buffer)) { - break; - } - - /* path termination. this is a strange place to put the termination, it's - * mainly due to the mixed in MIS that we use. gives too many unneeded - * shader evaluations, only need emission if we are going to terminate */ - float probability = path_state_continuation_probability(kg, state, throughput); - - if (probability == 0.0f) { - break; - } - else if (probability != 1.0f) { - float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE); - if (terminate >= probability) - break; - - throughput /= probability; - } - -# ifdef __DENOISING_FEATURES__ - kernel_update_denoising_features(kg, &sd, state, L); -# endif - -# ifdef __AO__ - /* ambient occlusion */ - if (kernel_data.integrator.use_ambient_occlusion) { - kernel_path_ao(kg, &sd, emission_sd, L, state, throughput, shader_bsdf_alpha(kg, &sd)); - } -# endif /* __AO__ */ - -# ifdef __SUBSURFACE__ - /* bssrdf scatter to a different location on the same object, replacing - * the closures with a diffuse BSDF */ - if (sd.flag & SD_BSSRDF) { - if (kernel_path_subsurface_scatter( - kg, &sd, emission_sd, L, state, ray, &throughput, &ss_indirect)) { - break; - } - } -# endif /* __SUBSURFACE__ */ - -# ifdef __EMISSION__ - /* direct lighting */ - kernel_path_surface_connect_light(kg, &sd, emission_sd, throughput, state, L); -# endif /* __EMISSION__ */ - -# ifdef __VOLUME__ - } -# endif - - /* compute direct lighting and next bounce */ - if (!kernel_path_surface_bounce(kg, &sd, &throughput, state, &L->state, ray)) - break; - } - -# ifdef __SUBSURFACE__ - /* Trace indirect subsurface rays by restarting the loop. this uses less - * stack memory than invoking kernel_path_indirect. 
- */ - if (ss_indirect.num_rays) { - kernel_path_subsurface_setup_indirect(kg, &ss_indirect, state, ray, L, &throughput); - } - else { - break; - } - } -# endif /* __SUBSURFACE__ */ -} - -ccl_device void kernel_path_trace( - KernelGlobals *kg, ccl_global float *buffer, int sample, int x, int y, int offset, int stride) -{ - PROFILING_INIT(kg, PROFILING_RAY_SETUP); - - /* buffer offset */ - int index = offset + x + y * stride; - int pass_stride = kernel_data.film.pass_stride; - - buffer += index * pass_stride; - - if (kernel_data.film.pass_adaptive_aux_buffer) { - ccl_global float4 *aux = (ccl_global float4 *)(buffer + - kernel_data.film.pass_adaptive_aux_buffer); - if ((*aux).w > 0.0f) { - return; - } - } - - /* Initialize random numbers and sample ray. */ - uint rng_hash; - Ray ray; - - kernel_path_trace_setup(kg, sample, x, y, &rng_hash, &ray); - - if (ray.t == 0.0f) { - return; - } - - /* Initialize state. */ - float3 throughput = one_float3(); - - PathRadiance L; - path_radiance_init(kg, &L); - - ShaderDataTinyStorage emission_sd_storage; - ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage); - - PathState state; - path_state_init(kg, emission_sd, &state, rng_hash, sample, &ray); - -# ifdef __KERNEL_OPTIX__ - /* Force struct into local memory to avoid costly spilling on trace calls. */ - if (pass_stride < 0) /* This is never executed and just prevents the compiler from doing SROA. */ - for (int i = 0; i < sizeof(L); ++i) - reinterpret_cast<unsigned char *>(&L)[-pass_stride + i] = 0; -# endif - - /* Integrate. 
*/ - kernel_path_integrate(kg, &state, throughput, &ray, &L, buffer, emission_sd); - - kernel_write_result(kg, buffer, sample, &L); -} - -#endif /* __SPLIT_KERNEL__ */ - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h deleted file mode 100644 index a1ee1bc107e..00000000000 --- a/intern/cycles/kernel/kernel_path_branched.h +++ /dev/null @@ -1,556 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -CCL_NAMESPACE_BEGIN - -#ifdef __BRANCHED_PATH__ - -ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg, - ShaderData *sd, - ShaderData *emission_sd, - PathRadiance *L, - ccl_addr_space PathState *state, - float3 throughput) -{ - int num_samples = kernel_data.integrator.ao_samples; - float num_samples_inv = 1.0f / num_samples; - float ao_factor = kernel_data.background.ao_factor; - float3 ao_N; - float3 ao_bsdf = shader_bsdf_ao(kg, sd, ao_factor, &ao_N); - float3 ao_alpha = shader_bsdf_alpha(kg, sd); - - for (int j = 0; j < num_samples; j++) { - float bsdf_u, bsdf_v; - path_branched_rng_2D( - kg, state->rng_hash, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v); - - float3 ao_D; - float ao_pdf; - - sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); - - if (dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) { - Ray light_ray; - float3 ao_shadow; - - light_ray.P = ray_offset(sd->P, sd->Ng); - light_ray.D = ao_D; - light_ray.t = kernel_data.background.ao_distance; - light_ray.time = sd->time; - light_ray.dP = sd->dP; - light_ray.dD = differential3_zero(); - - if (!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &ao_shadow)) { - path_radiance_accum_ao( - kg, L, state, throughput * num_samples_inv, ao_alpha, ao_bsdf, ao_shadow); - } - else { - path_radiance_accum_total_ao(L, state, throughput * num_samples_inv, ao_bsdf); - } - } - } -} - -# ifndef __SPLIT_KERNEL__ - -# ifdef __VOLUME__ -ccl_device_forceinline void kernel_branched_path_volume(KernelGlobals *kg, - ShaderData *sd, - PathState *state, - Ray *ray, - float3 *throughput, - ccl_addr_space Intersection *isect, - bool hit, - ShaderData *indirect_sd, - ShaderData *emission_sd, - PathRadiance *L) -{ - /* Sanitize volume stack. */ - if (!hit) { - kernel_volume_clean_stack(kg, state->volume_stack); - } - - if (state->volume_stack[0].shader == SHADER_NONE) { - return; - } - - /* volume attenuation, emission, scatter */ - Ray volume_ray = *ray; - volume_ray.t = (hit) ? 
isect->t : FLT_MAX; - - float step_size = volume_stack_step_size(kg, state->volume_stack); - const int object = sd->object; - -# ifdef __VOLUME_DECOUPLED__ - /* decoupled ray marching only supported on CPU */ - if (kernel_data.integrator.volume_decoupled) { - /* cache steps along volume for repeated sampling */ - VolumeSegment volume_segment; - - shader_setup_from_volume(kg, sd, &volume_ray); - kernel_volume_decoupled_record(kg, state, &volume_ray, sd, &volume_segment, step_size); - - /* direct light sampling */ - if (volume_segment.closure_flag & SD_SCATTER) { - volume_segment.sampling_method = volume_stack_sampling_method(kg, state->volume_stack); - - int all = kernel_data.integrator.sample_all_lights_direct; - - kernel_branched_path_volume_connect_light( - kg, sd, emission_sd, *throughput, state, L, all, &volume_ray, &volume_segment); - - /* indirect light sampling */ - int num_samples = kernel_data.integrator.volume_samples; - float num_samples_inv = 1.0f / num_samples; - - for (int j = 0; j < num_samples; j++) { - PathState ps = *state; - Ray pray = *ray; - float3 tp = *throughput; - - /* branch RNG state */ - path_state_branch(&ps, j, num_samples); - - /* scatter sample. 
if we use distance sampling and take just one - * sample for direct and indirect light, we could share this - * computation, but makes code a bit complex */ - float rphase = path_state_rng_1D(kg, &ps, PRNG_PHASE_CHANNEL); - float rscatter = path_state_rng_1D(kg, &ps, PRNG_SCATTER_DISTANCE); - - VolumeIntegrateResult result = kernel_volume_decoupled_scatter( - kg, &ps, &pray, sd, &tp, rphase, rscatter, &volume_segment, NULL, false); - - if (result == VOLUME_PATH_SCATTERED && - kernel_path_volume_bounce(kg, sd, &tp, &ps, &L->state, &pray)) { - kernel_path_indirect( - kg, indirect_sd, emission_sd, &pray, tp * num_samples_inv, &ps, L, object); - - /* for render passes, sum and reset indirect light pass variables - * for the next samples */ - path_radiance_sum_indirect(L); - path_radiance_reset_indirect(L); - } - } - } - - /* emission and transmittance */ - if (volume_segment.closure_flag & SD_EMISSION) - path_radiance_accum_emission(kg, L, state, *throughput, volume_segment.accum_emission); - *throughput *= volume_segment.accum_transmittance; - - /* free cached steps */ - kernel_volume_decoupled_free(kg, &volume_segment); - } - else -# endif /* __VOLUME_DECOUPLED__ */ - { - /* GPU: no decoupled ray marching, scatter probabilistically. */ - int num_samples = kernel_data.integrator.volume_samples; - float num_samples_inv = 1.0f / num_samples; - - /* todo: we should cache the shader evaluations from stepping - * through the volume, for now we redo them multiple times */ - - for (int j = 0; j < num_samples; j++) { - PathState ps = *state; - Ray pray = *ray; - float3 tp = (*throughput) * num_samples_inv; - - /* branch RNG state */ - path_state_branch(&ps, j, num_samples); - - VolumeIntegrateResult result = kernel_volume_integrate( - kg, &ps, sd, &volume_ray, L, &tp, step_size); - -# ifdef __VOLUME_SCATTER__ - if (result == VOLUME_PATH_SCATTERED) { - /* todo: support equiangular, MIS and all light sampling. 
- * alternatively get decoupled ray marching working on the GPU */ - kernel_path_volume_connect_light(kg, sd, emission_sd, tp, state, L); - - if (kernel_path_volume_bounce(kg, sd, &tp, &ps, &L->state, &pray)) { - kernel_path_indirect(kg, indirect_sd, emission_sd, &pray, tp, &ps, L, object); - - /* for render passes, sum and reset indirect light pass variables - * for the next samples */ - path_radiance_sum_indirect(L); - path_radiance_reset_indirect(L); - } - } -# endif /* __VOLUME_SCATTER__ */ - } - - /* todo: avoid this calculation using decoupled ray marching */ - kernel_volume_shadow(kg, emission_sd, state, &volume_ray, throughput); - } -} -# endif /* __VOLUME__ */ - -/* bounce off surface and integrate indirect light */ -ccl_device_noinline_cpu void kernel_branched_path_surface_indirect_light(KernelGlobals *kg, - ShaderData *sd, - ShaderData *indirect_sd, - ShaderData *emission_sd, - float3 throughput, - float num_samples_adjust, - PathState *state, - PathRadiance *L) -{ - float sum_sample_weight = 0.0f; -# ifdef __DENOISING_FEATURES__ - if (state->denoising_feature_weight > 0.0f) { - for (int i = 0; i < sd->num_closure; i++) { - const ShaderClosure *sc = &sd->closure[i]; - - /* transparency is not handled here, but in outer loop */ - if (!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) { - continue; - } - - sum_sample_weight += sc->sample_weight; - } - } - else { - sum_sample_weight = 1.0f; - } -# endif /* __DENOISING_FEATURES__ */ - - for (int i = 0; i < sd->num_closure; i++) { - const ShaderClosure *sc = &sd->closure[i]; - - /* transparency is not handled here, but in outer loop */ - if (!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) { - continue; - } - - int num_samples; - - if (CLOSURE_IS_BSDF_DIFFUSE(sc->type)) - num_samples = kernel_data.integrator.diffuse_samples; - else if (CLOSURE_IS_BSDF_BSSRDF(sc->type)) - num_samples = 1; - else if (CLOSURE_IS_BSDF_GLOSSY(sc->type)) - num_samples = 
kernel_data.integrator.glossy_samples; - else - num_samples = kernel_data.integrator.transmission_samples; - - num_samples = ceil_to_int(num_samples_adjust * num_samples); - - float num_samples_inv = num_samples_adjust / num_samples; - - for (int j = 0; j < num_samples; j++) { - PathState ps = *state; - float3 tp = throughput; - Ray bsdf_ray; -# ifdef __SHADOW_TRICKS__ - float shadow_transparency = L->shadow_transparency; -# endif - - ps.rng_hash = cmj_hash(state->rng_hash, i); - - if (!kernel_branched_path_surface_bounce( - kg, sd, sc, j, num_samples, &tp, &ps, &L->state, &bsdf_ray, sum_sample_weight)) { - continue; - } - - ps.rng_hash = state->rng_hash; - - kernel_path_indirect( - kg, indirect_sd, emission_sd, &bsdf_ray, tp * num_samples_inv, &ps, L, sd->object); - - /* for render passes, sum and reset indirect light pass variables - * for the next samples */ - path_radiance_sum_indirect(L); - path_radiance_reset_indirect(L); - -# ifdef __SHADOW_TRICKS__ - L->shadow_transparency = shadow_transparency; -# endif - } - } -} - -# ifdef __SUBSURFACE__ -ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, - ShaderData *sd, - ShaderData *indirect_sd, - ShaderData *emission_sd, - PathRadiance *L, - PathState *state, - Ray *ray, - float3 throughput) -{ - for (int i = 0; i < sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; - - if (!CLOSURE_IS_BSSRDF(sc->type)) - continue; - - /* set up random number generator */ - uint lcg_state = lcg_state_init(state, 0x68bc21eb); - int num_samples = kernel_data.integrator.subsurface_samples * 3; - float num_samples_inv = 1.0f / num_samples; - uint bssrdf_rng_hash = cmj_hash(state->rng_hash, i); - - /* do subsurface scatter step with copy of shader data, this will - * replace the BSSRDF with a diffuse BSDF closure */ - for (int j = 0; j < num_samples; j++) { - PathState hit_state = *state; - path_state_branch(&hit_state, j, num_samples); - hit_state.rng_hash = bssrdf_rng_hash; - - LocalIntersection 
ss_isect; - float bssrdf_u, bssrdf_v; - path_state_rng_2D(kg, &hit_state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); - int num_hits = subsurface_scatter_multi_intersect( - kg, &ss_isect, sd, &hit_state, sc, &lcg_state, bssrdf_u, bssrdf_v, true); - - hit_state.rng_offset += PRNG_BOUNCE_NUM; - -# ifdef __VOLUME__ - Ray volume_ray = *ray; - bool need_update_volume_stack = kernel_data.integrator.use_volumes && - sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME; -# endif /* __VOLUME__ */ - - /* compute lighting with the BSDF closure */ - for (int hit = 0; hit < num_hits; hit++) { - ShaderData bssrdf_sd = *sd; - Bssrdf *bssrdf = (Bssrdf *)sc; - ClosureType bssrdf_type = sc->type; - float bssrdf_roughness = bssrdf->roughness; - subsurface_scatter_multi_setup( - kg, &ss_isect, hit, &bssrdf_sd, &hit_state, bssrdf_type, bssrdf_roughness); - -# ifdef __VOLUME__ - if (need_update_volume_stack) { - /* Setup ray from previous surface point to the new one. */ - float3 P = ray_offset(bssrdf_sd.P, -bssrdf_sd.Ng); - volume_ray.D = normalize_len(P - volume_ray.P, &volume_ray.t); - - for (int k = 0; k < VOLUME_STACK_SIZE; k++) { - hit_state.volume_stack[k] = state->volume_stack[k]; - } - - kernel_volume_stack_update_for_subsurface( - kg, emission_sd, &volume_ray, hit_state.volume_stack); - } -# endif /* __VOLUME__ */ - -# ifdef __EMISSION__ - /* direct light */ - if (kernel_data.integrator.use_direct_light) { - int all = (kernel_data.integrator.sample_all_lights_direct) || - (hit_state.flag & PATH_RAY_SHADOW_CATCHER); - kernel_branched_path_surface_connect_light( - kg, &bssrdf_sd, emission_sd, &hit_state, throughput, num_samples_inv, L, all); - } -# endif /* __EMISSION__ */ - - /* indirect light */ - kernel_branched_path_surface_indirect_light( - kg, &bssrdf_sd, indirect_sd, emission_sd, throughput, num_samples_inv, &hit_state, L); - } - } - } -} -# endif /* __SUBSURFACE__ */ - -ccl_device void kernel_branched_path_integrate(KernelGlobals *kg, - uint rng_hash, - int sample, - Ray ray, - 
ccl_global float *buffer, - PathRadiance *L) -{ - /* initialize */ - float3 throughput = one_float3(); - - path_radiance_init(kg, L); - - /* shader data memory used for both volumes and surfaces, saves stack space */ - ShaderData sd; - /* shader data used by emission, shadows, volume stacks, indirect path */ - ShaderDataTinyStorage emission_sd_storage; - ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage); - ShaderData indirect_sd; - - PathState state; - path_state_init(kg, emission_sd, &state, rng_hash, sample, &ray); - - /* Main Loop - * Here we only handle transparency intersections from the camera ray. - * Indirect bounces are handled in kernel_branched_path_surface_indirect_light(). - */ - for (;;) { - /* Find intersection with objects in scene. */ - Intersection isect; - bool hit = kernel_path_scene_intersect(kg, &state, &ray, &isect, L, sd.object); - -# ifdef __VOLUME__ - /* Volume integration. */ - kernel_branched_path_volume( - kg, &sd, &state, &ray, &throughput, &isect, hit, &indirect_sd, emission_sd, L); -# endif /* __VOLUME__ */ - - /* Shade background. */ - if (!hit) { - kernel_path_background(kg, &state, &ray, throughput, &sd, buffer, L); - break; - } - - /* Setup and evaluate shader. */ - shader_setup_from_ray(kg, &sd, &isect, &ray); - - /* Skip most work for volume bounding surface. */ -# ifdef __VOLUME__ - if (!(sd.flag & SD_HAS_ONLY_VOLUME)) { -# endif - - shader_eval_surface(kg, &sd, &state, buffer, state.flag); - shader_merge_closures(&sd); - - /* Apply shadow catcher, holdout, emission. */ - if (!kernel_path_shader_apply(kg, &sd, &state, &ray, throughput, emission_sd, L, buffer)) { - break; - } - - /* transparency termination */ - if (state.flag & PATH_RAY_TRANSPARENT) { - /* path termination. this is a strange place to put the termination, it's - * mainly due to the mixed in MIS that we use. 
gives too many unneeded - * shader evaluations, only need emission if we are going to terminate */ - float probability = path_state_continuation_probability(kg, &state, throughput); - - if (probability == 0.0f) { - break; - } - else if (probability != 1.0f) { - float terminate = path_state_rng_1D(kg, &state, PRNG_TERMINATE); - - if (terminate >= probability) - break; - - throughput /= probability; - } - } - -# ifdef __DENOISING_FEATURES__ - kernel_update_denoising_features(kg, &sd, &state, L); -# endif - -# ifdef __AO__ - /* ambient occlusion */ - if (kernel_data.integrator.use_ambient_occlusion) { - kernel_branched_path_ao(kg, &sd, emission_sd, L, &state, throughput); - } -# endif /* __AO__ */ - -# ifdef __SUBSURFACE__ - /* bssrdf scatter to a different location on the same object */ - if (sd.flag & SD_BSSRDF) { - kernel_branched_path_subsurface_scatter( - kg, &sd, &indirect_sd, emission_sd, L, &state, &ray, throughput); - } -# endif /* __SUBSURFACE__ */ - - PathState hit_state = state; - -# ifdef __EMISSION__ - /* direct light */ - if (kernel_data.integrator.use_direct_light) { - int all = (kernel_data.integrator.sample_all_lights_direct) || - (state.flag & PATH_RAY_SHADOW_CATCHER); - kernel_branched_path_surface_connect_light( - kg, &sd, emission_sd, &hit_state, throughput, 1.0f, L, all); - } -# endif /* __EMISSION__ */ - - /* indirect light */ - kernel_branched_path_surface_indirect_light( - kg, &sd, &indirect_sd, emission_sd, throughput, 1.0f, &hit_state, L); - - /* continue in case of transparency */ - throughput *= shader_bsdf_transparency(kg, &sd); - - if (is_zero(throughput)) - break; - - /* Update Path State */ - path_state_next(kg, &state, LABEL_TRANSPARENT); - -# ifdef __VOLUME__ - } - else { - if (!path_state_volume_next(kg, &state)) { - break; - } - } -# endif - - ray.P = ray_offset(sd.P, -sd.Ng); - ray.t -= sd.ray_length; /* clipping works through transparent */ - -# ifdef __RAY_DIFFERENTIALS__ - ray.dP = sd.dP; - ray.dD.dx = -sd.dI.dx; - ray.dD.dy = 
-sd.dI.dy; -# endif /* __RAY_DIFFERENTIALS__ */ - -# ifdef __VOLUME__ - /* enter/exit volume */ - kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack); -# endif /* __VOLUME__ */ - } -} - -ccl_device void kernel_branched_path_trace( - KernelGlobals *kg, ccl_global float *buffer, int sample, int x, int y, int offset, int stride) -{ - /* buffer offset */ - int index = offset + x + y * stride; - int pass_stride = kernel_data.film.pass_stride; - - buffer += index * pass_stride; - - if (kernel_data.film.pass_adaptive_aux_buffer) { - ccl_global float4 *aux = (ccl_global float4 *)(buffer + - kernel_data.film.pass_adaptive_aux_buffer); - if ((*aux).w > 0.0f) { - return; - } - } - - /* initialize random numbers and ray */ - uint rng_hash; - Ray ray; - - kernel_path_trace_setup(kg, sample, x, y, &rng_hash, &ray); - - /* integrate */ - PathRadiance L; - - if (ray.t != 0.0f) { - kernel_branched_path_integrate(kg, rng_hash, sample, ray, buffer, &L); - kernel_write_result(kg, buffer, sample, &L); - } -} - -# endif /* __SPLIT_KERNEL__ */ - -#endif /* __BRANCHED_PATH__ */ - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_path_common.h b/intern/cycles/kernel/kernel_path_common.h deleted file mode 100644 index 815767595a9..00000000000 --- a/intern/cycles/kernel/kernel_path_common.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "util/util_hash.h" - -CCL_NAMESPACE_BEGIN - -ccl_device_inline void kernel_path_trace_setup( - KernelGlobals *kg, int sample, int x, int y, uint *rng_hash, ccl_addr_space Ray *ray) -{ - float filter_u; - float filter_v; - - int num_samples = kernel_data.integrator.aa_samples; - - path_rng_init(kg, sample, num_samples, rng_hash, x, y, &filter_u, &filter_v); - - /* sample camera ray */ - - float lens_u = 0.0f, lens_v = 0.0f; - - if (kernel_data.cam.aperturesize > 0.0f) - path_rng_2D(kg, *rng_hash, sample, num_samples, PRNG_LENS_U, &lens_u, &lens_v); - - float time = 0.0f; - -#ifdef __CAMERA_MOTION__ - if (kernel_data.cam.shuttertime != -1.0f) - time = path_rng_1D(kg, *rng_hash, sample, num_samples, PRNG_TIME); -#endif - - camera_sample(kg, x, y, filter_u, filter_v, lens_u, lens_v, time, ray); -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h index bf601580cd0..ebb2c0df4f1 100644 --- a/intern/cycles/kernel/kernel_path_state.h +++ b/intern/cycles/kernel/kernel_path_state.h @@ -14,99 +14,116 @@ * limitations under the License. */ -CCL_NAMESPACE_BEGIN +#pragma once -ccl_device_inline void path_state_init(KernelGlobals *kg, - ShaderData *stack_sd, - ccl_addr_space PathState *state, - uint rng_hash, - int sample, - ccl_addr_space Ray *ray) -{ - state->flag = PATH_RAY_CAMERA | PATH_RAY_MIS_SKIP | PATH_RAY_TRANSPARENT_BACKGROUND; +#include "kernel_random.h" - state->rng_hash = rng_hash; - state->rng_offset = PRNG_BASE_NUM; - state->sample = sample; - state->num_samples = kernel_data.integrator.aa_samples; - state->branch_factor = 1.0f; +CCL_NAMESPACE_BEGIN - state->bounce = 0; - state->diffuse_bounce = 0; - state->glossy_bounce = 0; - state->transmission_bounce = 0; - state->transparent_bounce = 0; +/* Initialize queues, so that the this path is considered terminated. 
+ * Used for early outputs in the camera ray initialization, as well as initialization of split + * states for shadow catcher. */ +ccl_device_inline void path_state_init_queues(INTEGRATOR_STATE_ARGS) +{ + INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0; + INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0; +} -#ifdef __DENOISING_FEATURES__ - if (kernel_data.film.pass_denoising_data) { - state->flag |= PATH_RAY_STORE_SHADOW_INFO; - state->denoising_feature_weight = 1.0f; - state->denoising_feature_throughput = one_float3(); - } - else { - state->denoising_feature_weight = 0.0f; - state->denoising_feature_throughput = zero_float3(); - } -#endif /* __DENOISING_FEATURES__ */ +/* Minimalistic initialization of the path state, which is needed for early outputs in the + * integrator initialization to work. */ +ccl_device_inline void path_state_init(INTEGRATOR_STATE_ARGS, + const ccl_global KernelWorkTile *ccl_restrict tile, + const int x, + const int y) +{ + const uint render_pixel_index = (uint)tile->offset + x + y * tile->stride; - state->min_ray_pdf = FLT_MAX; - state->ray_pdf = 0.0f; -#ifdef __LAMP_MIS__ - state->ray_t = 0.0f; -#endif + INTEGRATOR_STATE_WRITE(path, render_pixel_index) = render_pixel_index; -#ifdef __VOLUME__ - state->volume_bounce = 0; - state->volume_bounds_bounce = 0; + path_state_init_queues(INTEGRATOR_STATE_PASS); +} - if (kernel_data.integrator.use_volumes) { - /* Initialize volume stack with volume we are inside of. */ - kernel_volume_stack_init(kg, stack_sd, state, ray, state->volume_stack); +/* Initialize the rest of the path state needed to continue the path integration. 
*/ +ccl_device_inline void path_state_init_integrator(INTEGRATOR_STATE_ARGS, + const int sample, + const uint rng_hash) +{ + INTEGRATOR_STATE_WRITE(path, sample) = sample; + INTEGRATOR_STATE_WRITE(path, bounce) = 0; + INTEGRATOR_STATE_WRITE(path, diffuse_bounce) = 0; + INTEGRATOR_STATE_WRITE(path, glossy_bounce) = 0; + INTEGRATOR_STATE_WRITE(path, transmission_bounce) = 0; + INTEGRATOR_STATE_WRITE(path, transparent_bounce) = 0; + INTEGRATOR_STATE_WRITE(path, volume_bounce) = 0; + INTEGRATOR_STATE_WRITE(path, volume_bounds_bounce) = 0; + INTEGRATOR_STATE_WRITE(path, rng_hash) = rng_hash; + INTEGRATOR_STATE_WRITE(path, rng_offset) = PRNG_BASE_NUM; + INTEGRATOR_STATE_WRITE(path, flag) = PATH_RAY_CAMERA | PATH_RAY_MIS_SKIP | + PATH_RAY_TRANSPARENT_BACKGROUND; + INTEGRATOR_STATE_WRITE(path, mis_ray_pdf) = 0.0f; + INTEGRATOR_STATE_WRITE(path, mis_ray_t) = 0.0f; + INTEGRATOR_STATE_WRITE(path, min_ray_pdf) = FLT_MAX; + INTEGRATOR_STATE_WRITE(path, throughput) = make_float3(1.0f, 1.0f, 1.0f); + + if (kernel_data.kernel_features & KERNEL_FEATURE_VOLUME) { + INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, 0, object) = OBJECT_NONE; + INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, 0, shader) = kernel_data.background.volume_shader; + INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, 1, object) = OBJECT_NONE; + INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, 1, shader) = SHADER_NONE; } - else { - state->volume_stack[0].shader = SHADER_NONE; + +#ifdef __DENOISING_FEATURES__ + if (kernel_data.kernel_features & KERNEL_FEATURE_DENOISING) { + INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_DENOISING_FEATURES; + INTEGRATOR_STATE_WRITE(path, denoising_feature_throughput) = one_float3(); } #endif } -ccl_device_inline void path_state_next(KernelGlobals *kg, - ccl_addr_space PathState *state, - int label) +ccl_device_inline void path_state_next(INTEGRATOR_STATE_ARGS, int label) { + uint32_t flag = INTEGRATOR_STATE(path, flag); + /* ray through transparent keeps same flags from previous ray and is * not counted 
as a regular bounce, transparent has separate max */ if (label & LABEL_TRANSPARENT) { - state->flag |= PATH_RAY_TRANSPARENT; - state->transparent_bounce++; - if (state->transparent_bounce >= kernel_data.integrator.transparent_max_bounce) { - state->flag |= PATH_RAY_TERMINATE_IMMEDIATE; + uint32_t transparent_bounce = INTEGRATOR_STATE(path, transparent_bounce) + 1; + + flag |= PATH_RAY_TRANSPARENT; + if (transparent_bounce >= kernel_data.integrator.transparent_max_bounce) { + flag |= PATH_RAY_TERMINATE_ON_NEXT_SURFACE; } if (!kernel_data.integrator.transparent_shadows) - state->flag |= PATH_RAY_MIS_SKIP; - - /* random number generator next bounce */ - state->rng_offset += PRNG_BOUNCE_NUM; + flag |= PATH_RAY_MIS_SKIP; + INTEGRATOR_STATE_WRITE(path, flag) = flag; + INTEGRATOR_STATE_WRITE(path, transparent_bounce) = transparent_bounce; + /* Random number generator next bounce. */ + INTEGRATOR_STATE_WRITE(path, rng_offset) += PRNG_BOUNCE_NUM; return; } - state->bounce++; - if (state->bounce >= kernel_data.integrator.max_bounce) { - state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT; + uint32_t bounce = INTEGRATOR_STATE(path, bounce) + 1; + if (bounce >= kernel_data.integrator.max_bounce) { + flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT; } - state->flag &= ~(PATH_RAY_ALL_VISIBILITY | PATH_RAY_MIS_SKIP); + flag &= ~(PATH_RAY_ALL_VISIBILITY | PATH_RAY_MIS_SKIP); #ifdef __VOLUME__ if (label & LABEL_VOLUME_SCATTER) { /* volume scatter */ - state->flag |= PATH_RAY_VOLUME_SCATTER; - state->flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND; + flag |= PATH_RAY_VOLUME_SCATTER; + flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND; + if (bounce == 1) { + flag |= PATH_RAY_VOLUME_PASS; + } - state->volume_bounce++; - if (state->volume_bounce >= kernel_data.integrator.max_volume_bounce) { - state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT; + const int volume_bounce = INTEGRATOR_STATE(path, volume_bounce) + 1; + INTEGRATOR_STATE_WRITE(path, volume_bounce) = volume_bounce; + if (volume_bounce >= 
kernel_data.integrator.max_volume_bounce) { + flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT; } } else @@ -114,163 +131,237 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, { /* surface reflection/transmission */ if (label & LABEL_REFLECT) { - state->flag |= PATH_RAY_REFLECT; - state->flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND; + flag |= PATH_RAY_REFLECT; + flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND; if (label & LABEL_DIFFUSE) { - state->diffuse_bounce++; - if (state->diffuse_bounce >= kernel_data.integrator.max_diffuse_bounce) { - state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT; + const int diffuse_bounce = INTEGRATOR_STATE(path, diffuse_bounce) + 1; + INTEGRATOR_STATE_WRITE(path, diffuse_bounce) = diffuse_bounce; + if (diffuse_bounce >= kernel_data.integrator.max_diffuse_bounce) { + flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT; } } else { - state->glossy_bounce++; - if (state->glossy_bounce >= kernel_data.integrator.max_glossy_bounce) { - state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT; + const int glossy_bounce = INTEGRATOR_STATE(path, glossy_bounce) + 1; + INTEGRATOR_STATE_WRITE(path, glossy_bounce) = glossy_bounce; + if (glossy_bounce >= kernel_data.integrator.max_glossy_bounce) { + flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT; } } } else { kernel_assert(label & LABEL_TRANSMIT); - state->flag |= PATH_RAY_TRANSMIT; + flag |= PATH_RAY_TRANSMIT; if (!(label & LABEL_TRANSMIT_TRANSPARENT)) { - state->flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND; + flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND; } - state->transmission_bounce++; - if (state->transmission_bounce >= kernel_data.integrator.max_transmission_bounce) { - state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT; + const int transmission_bounce = INTEGRATOR_STATE(path, transmission_bounce) + 1; + INTEGRATOR_STATE_WRITE(path, transmission_bounce) = transmission_bounce; + if (transmission_bounce >= kernel_data.integrator.max_transmission_bounce) { + flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT; } } /* 
diffuse/glossy/singular */ if (label & LABEL_DIFFUSE) { - state->flag |= PATH_RAY_DIFFUSE | PATH_RAY_DIFFUSE_ANCESTOR; + flag |= PATH_RAY_DIFFUSE | PATH_RAY_DIFFUSE_ANCESTOR; } else if (label & LABEL_GLOSSY) { - state->flag |= PATH_RAY_GLOSSY; + flag |= PATH_RAY_GLOSSY; } else { kernel_assert(label & LABEL_SINGULAR); - state->flag |= PATH_RAY_GLOSSY | PATH_RAY_SINGULAR | PATH_RAY_MIS_SKIP; + flag |= PATH_RAY_GLOSSY | PATH_RAY_SINGULAR | PATH_RAY_MIS_SKIP; + } + + /* Render pass categories. */ + if (bounce == 1) { + flag |= (label & LABEL_TRANSMIT) ? PATH_RAY_TRANSMISSION_PASS : PATH_RAY_REFLECT_PASS; } } - /* random number generator next bounce */ - state->rng_offset += PRNG_BOUNCE_NUM; + INTEGRATOR_STATE_WRITE(path, flag) = flag; + INTEGRATOR_STATE_WRITE(path, bounce) = bounce; -#ifdef __DENOISING_FEATURES__ - if ((state->denoising_feature_weight == 0.0f) && !(state->flag & PATH_RAY_SHADOW_CATCHER)) { - state->flag &= ~PATH_RAY_STORE_SHADOW_INFO; - } -#endif + /* Random number generator next bounce. */ + INTEGRATOR_STATE_WRITE(path, rng_offset) += PRNG_BOUNCE_NUM; } #ifdef __VOLUME__ -ccl_device_inline bool path_state_volume_next(KernelGlobals *kg, ccl_addr_space PathState *state) +ccl_device_inline bool path_state_volume_next(INTEGRATOR_STATE_ARGS) { /* For volume bounding meshes we pass through without counting transparent * bounces, only sanity check in case self intersection gets us stuck. */ - state->volume_bounds_bounce++; - if (state->volume_bounds_bounce > VOLUME_BOUNDS_MAX) { + uint32_t volume_bounds_bounce = INTEGRATOR_STATE(path, volume_bounds_bounce) + 1; + INTEGRATOR_STATE_WRITE(path, volume_bounds_bounce) = volume_bounds_bounce; + if (volume_bounds_bounce > VOLUME_BOUNDS_MAX) { return false; } /* Random number generator next bounce. 
*/ - if (state->volume_bounds_bounce > 1) { - state->rng_offset += PRNG_BOUNCE_NUM; + if (volume_bounds_bounce > 1) { + INTEGRATOR_STATE_WRITE(path, rng_offset) += PRNG_BOUNCE_NUM; } return true; } #endif -ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, - ccl_addr_space PathState *state) +ccl_device_inline uint path_state_ray_visibility(INTEGRATOR_STATE_CONST_ARGS) { - uint flag = state->flag & PATH_RAY_ALL_VISIBILITY; + const uint32_t path_flag = INTEGRATOR_STATE(path, flag); - /* for visibility, diffuse/glossy are for reflection only */ - if (flag & PATH_RAY_TRANSMIT) - flag &= ~(PATH_RAY_DIFFUSE | PATH_RAY_GLOSSY); - /* todo: this is not supported as its own ray visibility yet */ - if (state->flag & PATH_RAY_VOLUME_SCATTER) - flag |= PATH_RAY_DIFFUSE; + uint32_t visibility = path_flag & PATH_RAY_ALL_VISIBILITY; - return flag; + /* For visibility, diffuse/glossy are for reflection only. */ + if (visibility & PATH_RAY_TRANSMIT) { + visibility &= ~(PATH_RAY_DIFFUSE | PATH_RAY_GLOSSY); + } + + /* todo: this is not supported as its own ray visibility yet. */ + if (path_flag & PATH_RAY_VOLUME_SCATTER) { + visibility |= PATH_RAY_DIFFUSE; + } + + visibility = SHADOW_CATCHER_PATH_VISIBILITY(path_flag, visibility); + + return visibility; } -ccl_device_inline float path_state_continuation_probability(KernelGlobals *kg, - ccl_addr_space PathState *state, - const float3 throughput) +ccl_device_inline float path_state_continuation_probability(INTEGRATOR_STATE_CONST_ARGS, + const uint32_t path_flag) { - if (state->flag & PATH_RAY_TERMINATE_IMMEDIATE) { - /* Ray is to be terminated immediately. */ - return 0.0f; - } - else if (state->flag & PATH_RAY_TRANSPARENT) { + if (path_flag & PATH_RAY_TRANSPARENT) { + const uint32_t transparent_bounce = INTEGRATOR_STATE(path, transparent_bounce); /* Do at least specified number of bounces without RR. 
*/ - if (state->transparent_bounce <= kernel_data.integrator.transparent_min_bounce) { - return 1.0f; - } -#ifdef __SHADOW_TRICKS__ - /* Exception for shadow catcher not working correctly with RR. */ - else if ((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->transparent_bounce <= 8)) { + if (transparent_bounce <= kernel_data.integrator.transparent_min_bounce) { return 1.0f; } -#endif } else { + const uint32_t bounce = INTEGRATOR_STATE(path, bounce); /* Do at least specified number of bounces without RR. */ - if (state->bounce <= kernel_data.integrator.min_bounce) { + if (bounce <= kernel_data.integrator.min_bounce) { return 1.0f; } -#ifdef __SHADOW_TRICKS__ - /* Exception for shadow catcher not working correctly with RR. */ - else if ((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->bounce <= 3)) { - return 1.0f; - } -#endif } /* Probabilistic termination: use sqrt() to roughly match typical view * transform and do path termination a bit later on average. */ - return min(sqrtf(max3(fabs(throughput)) * state->branch_factor), 1.0f); + return min(sqrtf(max3(fabs(INTEGRATOR_STATE(path, throughput)))), 1.0f); } -/* TODO(DingTo): Find more meaningful name for this */ -ccl_device_inline void path_state_modify_bounce(ccl_addr_space PathState *state, bool increase) +ccl_device_inline bool path_state_ao_bounce(INTEGRATOR_STATE_CONST_ARGS) { - /* Modify bounce temporarily for shader eval */ - if (increase) - state->bounce += 1; - else - state->bounce -= 1; -} - -ccl_device_inline bool path_state_ao_bounce(KernelGlobals *kg, ccl_addr_space PathState *state) -{ - if (state->bounce <= kernel_data.integrator.ao_bounces) { + if (!kernel_data.integrator.ao_bounces) { return false; } - int bounce = state->bounce - state->transmission_bounce - (state->glossy_bounce > 0); + const int bounce = INTEGRATOR_STATE(path, bounce) - INTEGRATOR_STATE(path, transmission_bounce) - + (INTEGRATOR_STATE(path, glossy_bounce) > 0) + 1; return (bounce > kernel_data.integrator.ao_bounces); } 
-ccl_device_inline void path_state_branch(ccl_addr_space PathState *state, - int branch, - int num_branches) +/* Random Number Sampling Utility Functions + * + * For each random number in each step of the path we must have a unique + * dimension to avoid using the same sequence twice. + * + * For branches in the path we must be careful not to reuse the same number + * in a sequence and offset accordingly. + */ + +/* RNG State loaded onto stack. */ +typedef struct RNGState { + uint rng_hash; + uint rng_offset; + int sample; +} RNGState; + +ccl_device_inline void path_state_rng_load(INTEGRATOR_STATE_CONST_ARGS, RNGState *rng_state) +{ + rng_state->rng_hash = INTEGRATOR_STATE(path, rng_hash); + rng_state->rng_offset = INTEGRATOR_STATE(path, rng_offset); + rng_state->sample = INTEGRATOR_STATE(path, sample); +} + +ccl_device_inline void shadow_path_state_rng_load(INTEGRATOR_STATE_CONST_ARGS, RNGState *rng_state) +{ + const uint shadow_bounces = INTEGRATOR_STATE(shadow_path, transparent_bounce) - + INTEGRATOR_STATE(path, transparent_bounce); + + rng_state->rng_hash = INTEGRATOR_STATE(path, rng_hash); + rng_state->rng_offset = INTEGRATOR_STATE(path, rng_offset) + PRNG_BOUNCE_NUM * shadow_bounces; + rng_state->sample = INTEGRATOR_STATE(path, sample); +} + +ccl_device_inline float path_state_rng_1D(const KernelGlobals *kg, + const RNGState *rng_state, + int dimension) +{ + return path_rng_1D( + kg, rng_state->rng_hash, rng_state->sample, rng_state->rng_offset + dimension); +} + +ccl_device_inline void path_state_rng_2D( + const KernelGlobals *kg, const RNGState *rng_state, int dimension, float *fx, float *fy) +{ + path_rng_2D( + kg, rng_state->rng_hash, rng_state->sample, rng_state->rng_offset + dimension, fx, fy); +} + +ccl_device_inline float path_state_rng_1D_hash(const KernelGlobals *kg, + const RNGState *rng_state, + uint hash) +{ + /* Use a hash instead of dimension, this is not great but avoids adding + * more dimensions to each bounce which reduces quality of 
dimensions we + * are already using. */ + return path_rng_1D( + kg, cmj_hash_simple(rng_state->rng_hash, hash), rng_state->sample, rng_state->rng_offset); +} + +ccl_device_inline float path_branched_rng_1D(const KernelGlobals *kg, + const RNGState *rng_state, + int branch, + int num_branches, + int dimension) +{ + return path_rng_1D(kg, + rng_state->rng_hash, + rng_state->sample * num_branches + branch, + rng_state->rng_offset + dimension); +} + +ccl_device_inline void path_branched_rng_2D(const KernelGlobals *kg, + const RNGState *rng_state, + int branch, + int num_branches, + int dimension, + float *fx, + float *fy) +{ + path_rng_2D(kg, + rng_state->rng_hash, + rng_state->sample * num_branches + branch, + rng_state->rng_offset + dimension, + fx, + fy); +} + +/* Utility functions to get light termination value, + * since it might not be needed in many cases. + */ +ccl_device_inline float path_state_rng_light_termination(const KernelGlobals *kg, + const RNGState *state) { - if (num_branches > 1) { - /* Path is splitting into a branch, adjust so that each branch - * still gets a unique sample from the same sequence. */ - state->sample = state->sample * num_branches + branch; - state->num_samples = state->num_samples * num_branches; - state->branch_factor *= num_branches; + if (kernel_data.integrator.light_inv_rr_threshold > 0.0f) { + return path_state_rng_1D(kg, state, PRNG_LIGHT_TERMINATE); } + return 0.0f; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_path_subsurface.h b/intern/cycles/kernel/kernel_path_subsurface.h deleted file mode 100644 index 97d3f292ca3..00000000000 --- a/intern/cycles/kernel/kernel_path_subsurface.h +++ /dev/null @@ -1,139 +0,0 @@ -/* - * Copyright 2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -#ifdef __SUBSURFACE__ -# ifndef __KERNEL_CUDA__ -ccl_device -# else -ccl_device_inline -# endif - bool - kernel_path_subsurface_scatter(KernelGlobals *kg, - ShaderData *sd, - ShaderData *emission_sd, - PathRadiance *L, - ccl_addr_space PathState *state, - ccl_addr_space Ray *ray, - ccl_addr_space float3 *throughput, - ccl_addr_space SubsurfaceIndirectRays *ss_indirect) -{ - PROFILING_INIT(kg, PROFILING_SUBSURFACE); - - float bssrdf_u, bssrdf_v; - path_state_rng_2D(kg, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); - - const ShaderClosure *sc = shader_bssrdf_pick(sd, throughput, &bssrdf_u); - - /* do bssrdf scatter step if we picked a bssrdf closure */ - if (sc) { - /* We should never have two consecutive BSSRDF bounces, - * the second one should be converted to a diffuse BSDF to - * avoid this. - */ - kernel_assert(!(state->flag & PATH_RAY_DIFFUSE_ANCESTOR)); - - uint lcg_state = lcg_state_init_addrspace(state, 0x68bc21eb); - - LocalIntersection ss_isect; - int num_hits = subsurface_scatter_multi_intersect( - kg, &ss_isect, sd, state, sc, &lcg_state, bssrdf_u, bssrdf_v, false); -# ifdef __VOLUME__ - bool need_update_volume_stack = kernel_data.integrator.use_volumes && - sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME; -# endif /* __VOLUME__ */ - - /* Closure memory will be overwritten, so read required variables now. 
*/ - Bssrdf *bssrdf = (Bssrdf *)sc; - ClosureType bssrdf_type = sc->type; - float bssrdf_roughness = bssrdf->roughness; - - /* compute lighting with the BSDF closure */ - for (int hit = 0; hit < num_hits; hit++) { - /* NOTE: We reuse the existing ShaderData, we assume the path - * integration loop stops when this function returns true. - */ - subsurface_scatter_multi_setup(kg, &ss_isect, hit, sd, state, bssrdf_type, bssrdf_roughness); - - kernel_path_surface_connect_light(kg, sd, emission_sd, *throughput, state, L); - - ccl_addr_space PathState *hit_state = &ss_indirect->state[ss_indirect->num_rays]; - ccl_addr_space Ray *hit_ray = &ss_indirect->rays[ss_indirect->num_rays]; - ccl_addr_space float3 *hit_tp = &ss_indirect->throughputs[ss_indirect->num_rays]; - PathRadianceState *hit_L_state = &ss_indirect->L_state[ss_indirect->num_rays]; - - *hit_state = *state; - *hit_ray = *ray; - *hit_tp = *throughput; - *hit_L_state = L->state; - - hit_state->rng_offset += PRNG_BOUNCE_NUM; - - if (kernel_path_surface_bounce(kg, sd, hit_tp, hit_state, hit_L_state, hit_ray)) { -# ifdef __LAMP_MIS__ - hit_state->ray_t = 0.0f; -# endif /* __LAMP_MIS__ */ - -# ifdef __VOLUME__ - if (need_update_volume_stack) { - Ray volume_ray = *ray; - /* Setup ray from previous surface point to the new one. 
*/ - volume_ray.D = normalize_len(hit_ray->P - volume_ray.P, &volume_ray.t); - - kernel_volume_stack_update_for_subsurface( - kg, emission_sd, &volume_ray, hit_state->volume_stack); - } -# endif /* __VOLUME__ */ - ss_indirect->num_rays++; - } - } - return true; - } - return false; -} - -ccl_device_inline void kernel_path_subsurface_init_indirect( - ccl_addr_space SubsurfaceIndirectRays *ss_indirect) -{ - ss_indirect->num_rays = 0; -} - -ccl_device void kernel_path_subsurface_setup_indirect( - KernelGlobals *kg, - ccl_addr_space SubsurfaceIndirectRays *ss_indirect, - ccl_addr_space PathState *state, - ccl_addr_space Ray *ray, - PathRadiance *L, - ccl_addr_space float3 *throughput) -{ - /* Setup state, ray and throughput for indirect SSS rays. */ - ss_indirect->num_rays--; - - path_radiance_sum_indirect(L); - path_radiance_reset_indirect(L); - - *state = ss_indirect->state[ss_indirect->num_rays]; - *ray = ss_indirect->rays[ss_indirect->num_rays]; - L->state = ss_indirect->L_state[ss_indirect->num_rays]; - *throughput = ss_indirect->throughputs[ss_indirect->num_rays]; - - state->rng_offset += ss_indirect->num_rays * PRNG_BOUNCE_NUM; -} - -#endif /* __SUBSURFACE__ */ - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h deleted file mode 100644 index ba48c0bdfc4..00000000000 --- a/intern/cycles/kernel/kernel_path_surface.h +++ /dev/null @@ -1,360 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__) || defined(__SHADOW_TRICKS__) || \ - defined(__BAKING__) -/* branched path tracing: connect path directly to position on one or more lights and add it to L - */ -ccl_device_noinline_cpu void kernel_branched_path_surface_connect_light( - KernelGlobals *kg, - ShaderData *sd, - ShaderData *emission_sd, - ccl_addr_space PathState *state, - float3 throughput, - float num_samples_adjust, - PathRadiance *L, - int sample_all_lights) -{ -# ifdef __EMISSION__ - /* sample illumination from lights to find path contribution */ - BsdfEval L_light ccl_optional_struct_init; - - int num_lights = 0; - if (kernel_data.integrator.use_direct_light) { - if (sample_all_lights) { - num_lights = kernel_data.integrator.num_all_lights; - if (kernel_data.integrator.pdf_triangles != 0.0f) { - num_lights += 1; - } - } - else { - num_lights = 1; - } - } - - for (int i = 0; i < num_lights; i++) { - /* sample one light at random */ - int num_samples = 1; - int num_all_lights = 1; - uint lamp_rng_hash = state->rng_hash; - bool double_pdf = false; - bool is_mesh_light = false; - bool is_lamp = false; - - if (sample_all_lights) { - /* lamp sampling */ - is_lamp = i < kernel_data.integrator.num_all_lights; - if (is_lamp) { - if (UNLIKELY(light_select_reached_max_bounces(kg, i, state->bounce))) { - continue; - } - num_samples = ceil_to_int(num_samples_adjust * light_select_num_samples(kg, i)); - num_all_lights = kernel_data.integrator.num_all_lights; - lamp_rng_hash = cmj_hash(state->rng_hash, i); - double_pdf = kernel_data.integrator.pdf_triangles != 0.0f; - } - /* mesh light sampling */ - else { - num_samples = ceil_to_int(num_samples_adjust * kernel_data.integrator.mesh_light_samples); - double_pdf = kernel_data.integrator.num_all_lights != 0; - is_mesh_light = true; - } - } - - float num_samples_inv = 
num_samples_adjust / (num_samples * num_all_lights); - - for (int j = 0; j < num_samples; j++) { - Ray light_ray ccl_optional_struct_init; - light_ray.t = 0.0f; /* reset ray */ -# ifdef __OBJECT_MOTION__ - light_ray.time = sd->time; -# endif - bool has_emission = false; - - if (kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL)) { - float light_u, light_v; - path_branched_rng_2D( - kg, lamp_rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); - float terminate = path_branched_rng_light_termination( - kg, lamp_rng_hash, state, j, num_samples); - - /* only sample triangle lights */ - if (is_mesh_light && double_pdf) { - light_u = 0.5f * light_u; - } - - LightSample ls ccl_optional_struct_init; - const int lamp = is_lamp ? i : -1; - if (light_sample(kg, lamp, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { - /* The sampling probability returned by lamp_light_sample assumes that all lights were - * sampled. However, this code only samples lamps, so if the scene also had mesh lights, - * the real probability is twice as high. 
*/ - if (double_pdf) { - ls.pdf *= 2.0f; - } - - has_emission = direct_emission( - kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate); - } - } - - /* trace shadow ray */ - float3 shadow; - - const bool blocked = shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow); - - if (has_emission) { - if (!blocked) { - /* accumulate */ - path_radiance_accum_light(kg, - L, - state, - throughput * num_samples_inv, - &L_light, - shadow, - num_samples_inv, - is_lamp); - } - else { - path_radiance_accum_total_light(L, state, throughput * num_samples_inv, &L_light); - } - } - } - } -# endif -} - -/* branched path tracing: bounce off or through surface to with new direction stored in ray */ -ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, - ShaderData *sd, - const ShaderClosure *sc, - int sample, - int num_samples, - ccl_addr_space float3 *throughput, - ccl_addr_space PathState *state, - PathRadianceState *L_state, - ccl_addr_space Ray *ray, - float sum_sample_weight) -{ - /* sample BSDF */ - float bsdf_pdf; - BsdfEval bsdf_eval ccl_optional_struct_init; - float3 bsdf_omega_in ccl_optional_struct_init; - differential3 bsdf_domega_in ccl_optional_struct_init; - float bsdf_u, bsdf_v; - path_branched_rng_2D( - kg, state->rng_hash, state, sample, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v); - int label; - - label = shader_bsdf_sample_closure( - kg, sd, sc, bsdf_u, bsdf_v, &bsdf_eval, &bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf); - - if (bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval)) - return false; - - /* modify throughput */ - path_radiance_bsdf_bounce(kg, L_state, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label); - -# ifdef __DENOISING_FEATURES__ - state->denoising_feature_weight *= sc->sample_weight / (sum_sample_weight * num_samples); -# endif - - /* modify path state */ - path_state_next(kg, state, label); - - /* setup ray */ - ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT) ? 
-sd->Ng : sd->Ng); - ray->D = normalize(bsdf_omega_in); - ray->t = FLT_MAX; -# ifdef __RAY_DIFFERENTIALS__ - ray->dP = sd->dP; - ray->dD = bsdf_domega_in; -# endif -# ifdef __OBJECT_MOTION__ - ray->time = sd->time; -# endif - -# ifdef __VOLUME__ - /* enter/exit volume */ - if (label & LABEL_TRANSMIT) - kernel_volume_stack_enter_exit(kg, sd, state->volume_stack); -# endif - - /* branch RNG state */ - path_state_branch(state, sample, num_samples); - - /* set MIS state */ - state->min_ray_pdf = fminf(bsdf_pdf, FLT_MAX); - state->ray_pdf = bsdf_pdf; -# ifdef __LAMP_MIS__ - state->ray_t = 0.0f; -# endif - - return true; -} - -#endif - -/* path tracing: connect path directly to position on a light and add it to L */ -ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, - ShaderData *sd, - ShaderData *emission_sd, - float3 throughput, - ccl_addr_space PathState *state, - PathRadiance *L) -{ - PROFILING_INIT(kg, PROFILING_CONNECT_LIGHT); - -#ifdef __EMISSION__ -# ifdef __SHADOW_TRICKS__ - int all = (state->flag & PATH_RAY_SHADOW_CATCHER); - kernel_branched_path_surface_connect_light(kg, sd, emission_sd, state, throughput, 1.0f, L, all); -# else - /* sample illumination from lights to find path contribution */ - Ray light_ray ccl_optional_struct_init; - BsdfEval L_light ccl_optional_struct_init; - bool is_lamp = false; - bool has_emission = false; - - light_ray.t = 0.0f; -# ifdef __OBJECT_MOTION__ - light_ray.time = sd->time; -# endif - - if (kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL)) { - float light_u, light_v; - path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v); - - LightSample ls ccl_optional_struct_init; - if (light_sample(kg, -1, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { - float terminate = path_state_rng_light_termination(kg, state); - has_emission = direct_emission( - kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate); - } - } - - /* trace shadow ray */ - 
float3 shadow; - - const bool blocked = shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow); - - if (has_emission) { - if (!blocked) { - /* accumulate */ - path_radiance_accum_light(kg, L, state, throughput, &L_light, shadow, 1.0f, is_lamp); - } - else { - path_radiance_accum_total_light(L, state, throughput, &L_light); - } - } -# endif -#endif -} - -/* path tracing: bounce off or through surface to with new direction stored in ray */ -ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg, - ShaderData *sd, - ccl_addr_space float3 *throughput, - ccl_addr_space PathState *state, - PathRadianceState *L_state, - ccl_addr_space Ray *ray) -{ - PROFILING_INIT(kg, PROFILING_SURFACE_BOUNCE); - - /* no BSDF? we can stop here */ - if (sd->flag & SD_BSDF) { - /* sample BSDF */ - float bsdf_pdf; - BsdfEval bsdf_eval ccl_optional_struct_init; - float3 bsdf_omega_in ccl_optional_struct_init; - differential3 bsdf_domega_in ccl_optional_struct_init; - float bsdf_u, bsdf_v; - path_state_rng_2D(kg, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); - int label; - - label = shader_bsdf_sample( - kg, sd, bsdf_u, bsdf_v, &bsdf_eval, &bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf); - - if (bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval)) - return false; - - /* modify throughput */ - path_radiance_bsdf_bounce(kg, L_state, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label); - - /* set labels */ - if (!(label & LABEL_TRANSPARENT)) { - state->ray_pdf = bsdf_pdf; -#ifdef __LAMP_MIS__ - state->ray_t = 0.0f; -#endif - state->min_ray_pdf = fminf(bsdf_pdf, state->min_ray_pdf); - } - - /* update path state */ - path_state_next(kg, state, label); - - /* setup ray */ - ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT) ? 
-sd->Ng : sd->Ng); - ray->D = normalize(bsdf_omega_in); - - if (state->bounce == 0) - ray->t -= sd->ray_length; /* clipping works through transparent */ - else - ray->t = FLT_MAX; - -#ifdef __RAY_DIFFERENTIALS__ - ray->dP = sd->dP; - ray->dD = bsdf_domega_in; -#endif - -#ifdef __VOLUME__ - /* enter/exit volume */ - if (label & LABEL_TRANSMIT) - kernel_volume_stack_enter_exit(kg, sd, state->volume_stack); -#endif - return true; - } -#ifdef __VOLUME__ - else if (sd->flag & SD_HAS_ONLY_VOLUME) { - if (!path_state_volume_next(kg, state)) { - return false; - } - - if (state->bounce == 0) - ray->t -= sd->ray_length; /* clipping works through transparent */ - else - ray->t = FLT_MAX; - - /* setup ray position, direction stays unchanged */ - ray->P = ray_offset(sd->P, -sd->Ng); -# ifdef __RAY_DIFFERENTIALS__ - ray->dP = sd->dP; -# endif - - /* enter/exit volume */ - kernel_volume_stack_enter_exit(kg, sd, state->volume_stack); - return true; - } -#endif - else { - /* no bsdf or volume? */ - return false; - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h deleted file mode 100644 index a787910e65c..00000000000 --- a/intern/cycles/kernel/kernel_path_volume.h +++ /dev/null @@ -1,260 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -CCL_NAMESPACE_BEGIN - -#ifdef __VOLUME_SCATTER__ - -ccl_device_inline void kernel_path_volume_connect_light(KernelGlobals *kg, - ShaderData *sd, - ShaderData *emission_sd, - float3 throughput, - ccl_addr_space PathState *state, - PathRadiance *L) -{ -# ifdef __EMISSION__ - /* sample illumination from lights to find path contribution */ - Ray light_ray ccl_optional_struct_init; - BsdfEval L_light ccl_optional_struct_init; - bool is_lamp = false; - bool has_emission = false; - - light_ray.t = 0.0f; -# ifdef __OBJECT_MOTION__ - /* connect to light from given point where shader has been evaluated */ - light_ray.time = sd->time; -# endif - - if (kernel_data.integrator.use_direct_light) { - float light_u, light_v; - path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v); - - LightSample ls ccl_optional_struct_init; - if (light_sample(kg, -1, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { - float terminate = path_state_rng_light_termination(kg, state); - has_emission = direct_emission( - kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate); - } - } - - /* trace shadow ray */ - float3 shadow; - - const bool blocked = shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow); - - if (has_emission && !blocked) { - /* accumulate */ - path_radiance_accum_light(kg, L, state, throughput, &L_light, shadow, 1.0f, is_lamp); - } -# endif /* __EMISSION__ */ -} - -ccl_device_noinline_cpu bool kernel_path_volume_bounce(KernelGlobals *kg, - ShaderData *sd, - ccl_addr_space float3 *throughput, - ccl_addr_space PathState *state, - PathRadianceState *L_state, - ccl_addr_space Ray *ray) -{ - /* sample phase function */ - float phase_pdf; - BsdfEval phase_eval ccl_optional_struct_init; - float3 phase_omega_in ccl_optional_struct_init; - differential3 phase_domega_in ccl_optional_struct_init; - float phase_u, phase_v; - path_state_rng_2D(kg, state, PRNG_BSDF_U, &phase_u, &phase_v); - int label; - - label = shader_volume_phase_sample( - 
kg, sd, phase_u, phase_v, &phase_eval, &phase_omega_in, &phase_domega_in, &phase_pdf); - - if (phase_pdf == 0.0f || bsdf_eval_is_zero(&phase_eval)) - return false; - - /* modify throughput */ - path_radiance_bsdf_bounce(kg, L_state, throughput, &phase_eval, phase_pdf, state->bounce, label); - - /* set labels */ - state->ray_pdf = phase_pdf; -# ifdef __LAMP_MIS__ - state->ray_t = 0.0f; -# endif - state->min_ray_pdf = fminf(phase_pdf, state->min_ray_pdf); - - /* update path state */ - path_state_next(kg, state, label); - - /* Russian roulette termination of volume ray scattering. */ - float probability = path_state_continuation_probability(kg, state, *throughput); - - if (probability == 0.0f) { - return false; - } - else if (probability != 1.0f) { - /* Use dimension from the previous bounce, has not been used yet. */ - float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE - PRNG_BOUNCE_NUM); - - if (terminate >= probability) { - return false; - } - - *throughput /= probability; - } - - /* setup ray */ - ray->P = sd->P; - ray->D = phase_omega_in; - ray->t = FLT_MAX; - -# ifdef __RAY_DIFFERENTIALS__ - ray->dP = sd->dP; - ray->dD = phase_domega_in; -# endif - - return true; -} - -# if !defined(__SPLIT_KERNEL__) && (defined(__BRANCHED_PATH__) || defined(__VOLUME_DECOUPLED__)) -ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, - ShaderData *sd, - ShaderData *emission_sd, - float3 throughput, - ccl_addr_space PathState *state, - PathRadiance *L, - bool sample_all_lights, - Ray *ray, - const VolumeSegment *segment) -{ -# ifdef __EMISSION__ - BsdfEval L_light ccl_optional_struct_init; - - int num_lights = 1; - if (sample_all_lights) { - num_lights = kernel_data.integrator.num_all_lights; - if (kernel_data.integrator.pdf_triangles != 0.0f) { - num_lights += 1; - } - } - - for (int i = 0; i < num_lights; ++i) { - /* sample one light at random */ - int num_samples = 1; - int num_all_lights = 1; - uint lamp_rng_hash = state->rng_hash; - bool 
double_pdf = false; - bool is_mesh_light = false; - bool is_lamp = false; - - if (sample_all_lights) { - /* lamp sampling */ - is_lamp = i < kernel_data.integrator.num_all_lights; - if (is_lamp) { - if (UNLIKELY(light_select_reached_max_bounces(kg, i, state->bounce))) { - continue; - } - num_samples = light_select_num_samples(kg, i); - num_all_lights = kernel_data.integrator.num_all_lights; - lamp_rng_hash = cmj_hash(state->rng_hash, i); - double_pdf = kernel_data.integrator.pdf_triangles != 0.0f; - } - /* mesh light sampling */ - else { - num_samples = kernel_data.integrator.mesh_light_samples; - double_pdf = kernel_data.integrator.num_all_lights != 0; - is_mesh_light = true; - } - } - - float num_samples_inv = 1.0f / (num_samples * num_all_lights); - - for (int j = 0; j < num_samples; j++) { - Ray light_ray ccl_optional_struct_init; - light_ray.t = 0.0f; /* reset ray */ -# ifdef __OBJECT_MOTION__ - light_ray.time = sd->time; -# endif - bool has_emission = false; - - float3 tp = throughput; - - if (kernel_data.integrator.use_direct_light) { - /* sample random position on random light/triangle */ - float light_u, light_v; - path_branched_rng_2D( - kg, lamp_rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); - - /* only sample triangle lights */ - if (is_mesh_light && double_pdf) { - light_u = 0.5f * light_u; - } - - LightSample ls ccl_optional_struct_init; - const int lamp = is_lamp ? i : -1; - light_sample(kg, lamp, light_u, light_v, sd->time, ray->P, state->bounce, &ls); - - /* sample position on volume segment */ - float rphase = path_branched_rng_1D( - kg, state->rng_hash, state, j, num_samples, PRNG_PHASE_CHANNEL); - float rscatter = path_branched_rng_1D( - kg, state->rng_hash, state, j, num_samples, PRNG_SCATTER_DISTANCE); - - VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, - state, - ray, - sd, - &tp, - rphase, - rscatter, - segment, - (ls.t != FLT_MAX) ? 
&ls.P : - NULL, - false); - - if (result == VOLUME_PATH_SCATTERED) { - /* todo: split up light_sample so we don't have to call it again with new position */ - if (light_sample(kg, lamp, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { - if (double_pdf) { - ls.pdf *= 2.0f; - } - - /* sample random light */ - float terminate = path_branched_rng_light_termination( - kg, state->rng_hash, state, j, num_samples); - has_emission = direct_emission( - kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate); - } - } - } - - /* trace shadow ray */ - float3 shadow; - - const bool blocked = shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow); - - if (has_emission && !blocked) { - /* accumulate */ - path_radiance_accum_light( - kg, L, state, tp * num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp); - } - } - } -# endif /* __EMISSION__ */ -} -# endif /* __SPLIT_KERNEL__ */ - -#endif /* __VOLUME_SCATTER__ */ - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_profiling.h b/intern/cycles/kernel/kernel_profiling.h index 780830879d8..db8644005ea 100644 --- a/intern/cycles/kernel/kernel_profiling.h +++ b/intern/cycles/kernel/kernel_profiling.h @@ -14,8 +14,7 @@ * limitations under the License. 
*/ -#ifndef __KERNEL_PROFILING_H__ -#define __KERNEL_PROFILING_H__ +#pragma once #ifdef __KERNEL_CPU__ # include "util/util_profiling.h" @@ -24,23 +23,18 @@ CCL_NAMESPACE_BEGIN #ifdef __KERNEL_CPU__ -# define PROFILING_INIT(kg, event) ProfilingHelper profiling_helper(&kg->profiler, event) +# define PROFILING_INIT(kg, event) \ + ProfilingHelper profiling_helper((ProfilingState *)&kg->profiler, event) # define PROFILING_EVENT(event) profiling_helper.set_event(event) -# define PROFILING_SHADER(shader) \ - if ((shader) != SHADER_NONE) { \ - profiling_helper.set_shader((shader)&SHADER_MASK); \ - } -# define PROFILING_OBJECT(object) \ - if ((object) != PRIM_NONE) { \ - profiling_helper.set_object(object); \ - } +# define PROFILING_INIT_FOR_SHADER(kg, event) \ + ProfilingWithShaderHelper profiling_helper((ProfilingState *)&kg->profiler, event) +# define PROFILING_SHADER(object, shader) \ + profiling_helper.set_shader(object, (shader)&SHADER_MASK); #else # define PROFILING_INIT(kg, event) # define PROFILING_EVENT(event) -# define PROFILING_SHADER(shader) -# define PROFILING_OBJECT(object) +# define PROFILING_INIT_FOR_SHADER(kg, event) +# define PROFILING_SHADER(object, shader) #endif /* __KERNEL_CPU__ */ CCL_NAMESPACE_END - -#endif /* __KERNEL_PROFILING_H__ */ diff --git a/intern/cycles/kernel/kernel_projection.h b/intern/cycles/kernel/kernel_projection.h index c33d7150b5c..192bf7ca5aa 100644 --- a/intern/cycles/kernel/kernel_projection.h +++ b/intern/cycles/kernel/kernel_projection.h @@ -30,8 +30,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __KERNEL_PROJECTION_CL__ -#define __KERNEL_PROJECTION_CL__ +#pragma once CCL_NAMESPACE_BEGIN @@ -257,5 +256,3 @@ ccl_device_inline void spherical_stereo_transform(ccl_constant KernelCamera *cam } CCL_NAMESPACE_END - -#endif /* __KERNEL_PROJECTION_CL__ */ diff --git a/intern/cycles/kernel/kernel_queues.h b/intern/cycles/kernel/kernel_queues.h deleted file mode 100644 index d8cc08b3e85..00000000000 --- a/intern/cycles/kernel/kernel_queues.h +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __KERNEL_QUEUE_H__ -#define __KERNEL_QUEUE_H__ - -CCL_NAMESPACE_BEGIN - -/* - * Queue utility functions for split kernel - */ -#ifdef __KERNEL_OPENCL__ -# pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable -# pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable -#endif - -/* - * Enqueue ray index into the queue - */ -ccl_device void enqueue_ray_index( - int ray_index, /* Ray index to be enqueued. */ - int queue_number, /* Queue in which the ray index should be enqueued. */ - ccl_global int *queues, /* Buffer of all queues. */ - int queue_size, /* Size of each queue. */ - ccl_global int *queue_index) /* Array of size num_queues; Used for atomic increment. */ -{ - /* This thread's queue index. 
*/ - int my_queue_index = atomic_fetch_and_inc_uint32((ccl_global uint *)&queue_index[queue_number]) + - (queue_number * queue_size); - queues[my_queue_index] = ray_index; -} - -/* - * Get the ray index for this thread - * Returns a positive ray_index for threads that have to do some work; - * Returns 'QUEUE_EMPTY_SLOT' for threads that don't have any work - * i.e All ray's in the queue has been successfully allocated and there - * is no more ray to allocate to other threads. - */ -ccl_device int get_ray_index( - KernelGlobals *kg, - int thread_index, /* Global thread index. */ - int queue_number, /* Queue to operate on. */ - ccl_global int *queues, /* Buffer of all queues. */ - int queuesize, /* Size of a queue. */ - int empty_queue) /* Empty the queue slot as soon as we fetch the ray index. */ -{ - int ray_index = queues[queue_number * queuesize + thread_index]; - if (empty_queue && ray_index != QUEUE_EMPTY_SLOT) { - queues[queue_number * queuesize + thread_index] = QUEUE_EMPTY_SLOT; - } - return ray_index; -} - -/* The following functions are to realize Local memory variant of enqueue ray index function. */ - -/* All threads should call this function. */ -ccl_device void enqueue_ray_index_local( - int ray_index, /* Ray index to enqueue. */ - int queue_number, /* Queue in which to enqueue ray index. */ - char enqueue_flag, /* True for threads whose ray index has to be enqueued. */ - int queuesize, /* queue size. */ - ccl_local_param unsigned int *local_queue_atomics, /* To do local queue atomics. */ - ccl_global int *Queue_data, /* Queues. */ - ccl_global int *Queue_index) /* To do global queue atomics. */ -{ - int lidx = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0); - - /* Get local queue id. */ - unsigned int lqidx; - if (enqueue_flag) { - lqidx = atomic_fetch_and_inc_uint32(local_queue_atomics); - } - ccl_barrier(CCL_LOCAL_MEM_FENCE); - - /* Get global queue offset. 
*/ - if (lidx == 0) { - *local_queue_atomics = atomic_fetch_and_add_uint32( - (ccl_global uint *)&Queue_index[queue_number], *local_queue_atomics); - } - ccl_barrier(CCL_LOCAL_MEM_FENCE); - - /* Get global queue index and enqueue ray. */ - if (enqueue_flag) { - unsigned int my_gqidx = queue_number * queuesize + (*local_queue_atomics) + lqidx; - Queue_data[my_gqidx] = ray_index; - } -} - -ccl_device unsigned int get_local_queue_index( - int queue_number, /* Queue in which to enqueue the ray; -1 if no queue */ - ccl_local_param unsigned int *local_queue_atomics) -{ - int my_lqidx = atomic_fetch_and_inc_uint32(&local_queue_atomics[queue_number]); - return my_lqidx; -} - -ccl_device unsigned int get_global_per_queue_offset( - int queue_number, - ccl_local_param unsigned int *local_queue_atomics, - ccl_global int *global_queue_atomics) -{ - unsigned int queue_offset = atomic_fetch_and_add_uint32( - (ccl_global uint *)&global_queue_atomics[queue_number], local_queue_atomics[queue_number]); - return queue_offset; -} - -ccl_device unsigned int get_global_queue_index( - int queue_number, - int queuesize, - unsigned int lqidx, - ccl_local_param unsigned int *global_per_queue_offset) -{ - int my_gqidx = queuesize * queue_number + lqidx + global_per_queue_offset[queue_number]; - return my_gqidx; -} - -ccl_device int dequeue_ray_index(int queue_number, - ccl_global int *queues, - int queue_size, - ccl_global int *queue_index) -{ - int index = atomic_fetch_and_dec_uint32((ccl_global uint *)&queue_index[queue_number]) - 1; - - if (index < 0) { - return QUEUE_EMPTY_SLOT; - } - - return queues[index + queue_number * queue_size]; -} - -CCL_NAMESPACE_END - -#endif // __KERNEL_QUEUE_H__ diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h index 49e5e25c2e0..41b7d76230a 100644 --- a/intern/cycles/kernel/kernel_random.h +++ b/intern/cycles/kernel/kernel_random.h @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * 
limitations under the License. */ +#pragma once #include "kernel/kernel_jitter.h" #include "util/util_hash.h" @@ -37,38 +38,34 @@ CCL_NAMESPACE_BEGIN */ # define SOBOL_SKIP 64 -ccl_device uint sobol_dimension(KernelGlobals *kg, int index, int dimension) +ccl_device uint sobol_dimension(const KernelGlobals *kg, int index, int dimension) { uint result = 0; uint i = index + SOBOL_SKIP; for (int j = 0, x; (x = find_first_set(i)); i >>= x) { j += x; - result ^= kernel_tex_fetch(__sample_pattern_lut, 32 * dimension + j - 1); + result ^= __float_as_uint(kernel_tex_fetch(__sample_pattern_lut, 32 * dimension + j - 1)); } return result; } #endif /* __SOBOL__ */ -ccl_device_forceinline float path_rng_1D( - KernelGlobals *kg, uint rng_hash, int sample, int num_samples, int dimension) +ccl_device_forceinline float path_rng_1D(const KernelGlobals *kg, + uint rng_hash, + int sample, + int dimension) { #ifdef __DEBUG_CORRELATION__ return (float)drand48(); #endif - if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_PMJ) { - return pmj_sample_1D(kg, sample, rng_hash, dimension); - } -#ifdef __CMJ__ -# ifdef __SOBOL__ - if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) -# endif + +#ifdef __SOBOL__ + if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_PMJ) +#endif { - /* Correlated multi-jitter. */ - int p = rng_hash + dimension; - return cmj_sample_1D(sample, num_samples, p); + return pmj_sample_1D(kg, sample, rng_hash, dimension); } -#endif #ifdef __SOBOL__ /* Sobol sequence value using direction vectors. 
 */ @@ -88,68 +85,72 @@ ccl_device_forceinline float path_rng_1D( #endif } -ccl_device_forceinline void path_rng_2D(KernelGlobals *kg, - uint rng_hash, - int sample, - int num_samples, - int dimension, - float *fx, - float *fy) +ccl_device_forceinline void path_rng_2D( + const KernelGlobals *kg, uint rng_hash, int sample, int dimension, float *fx, float *fy) { #ifdef __DEBUG_CORRELATION__ *fx = (float)drand48(); *fy = (float)drand48(); return; #endif - if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_PMJ) { - const float2 f = pmj_sample_2D(kg, sample, rng_hash, dimension); - *fx = f.x; - *fy = f.y; - return; - } -#ifdef __CMJ__ -# ifdef __SOBOL__ - if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) -# endif + +#ifdef __SOBOL__ + if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_PMJ) +#endif { - /* Correlated multi-jitter. */ - int p = rng_hash + dimension; - cmj_sample_2D(sample, num_samples, p, fx, fy); + pmj_sample_2D(kg, sample, rng_hash, dimension, fx, fy); + return; } -#endif #ifdef __SOBOL__ /* Sobol. */ - *fx = path_rng_1D(kg, rng_hash, sample, num_samples, dimension); - *fy = path_rng_1D(kg, rng_hash, sample, num_samples, dimension + 1); + *fx = path_rng_1D(kg, rng_hash, sample, dimension); + *fy = path_rng_1D(kg, rng_hash, sample, dimension + 1); #endif } -ccl_device_inline void path_rng_init(KernelGlobals *kg, - int sample, - int num_samples, - uint *rng_hash, - int x, - int y, - float *fx, - float *fy) +/** + * 1D hash recommended from "Hash Functions for GPU Rendering" JCGT Vol. 9, No. 3, 2020 + * See https://www.shadertoy.com/view/4tXyWN and https://www.shadertoy.com/view/XlGcRh + * http://www.jcgt.org/published/0009/03/02/paper.pdf + */ +ccl_device_inline uint hash_iqint1(uint n) +{ + n = (n << 13U) ^ n; + n = n * (n * n * 15731U + 789221U) + 1376312589U; + + return n; +} + +/** + * 2D hash recommended from "Hash Functions for GPU Rendering" JCGT Vol. 9, No.
3, 2020 + * See https://www.shadertoy.com/view/4tXyWN and https://www.shadertoy.com/view/XlGcRh + * http://www.jcgt.org/published/0009/03/02/paper.pdf + */ +ccl_device_inline uint hash_iqnt2d(const uint x, const uint y) { - /* load state */ - *rng_hash = hash_uint2(x, y); - *rng_hash ^= kernel_data.integrator.seed; + const uint qx = 1103515245U * ((x >> 1U) ^ (y)); + const uint qy = 1103515245U * ((y >> 1U) ^ (x)); + const uint n = 1103515245U * ((qx) ^ (qy >> 3U)); + + return n; +} + +ccl_device_inline uint path_rng_hash_init(const KernelGlobals *ccl_restrict kg, + const int sample, + const int x, + const int y) +{ + const uint rng_hash = hash_iqnt2d(x, y) ^ kernel_data.integrator.seed; #ifdef __DEBUG_CORRELATION__ - srand48(*rng_hash + sample); + srand48(rng_hash + sample); +#else + (void)sample; #endif - if (sample == 0) { - *fx = 0.5f; - *fy = 0.5f; - } - else { - path_rng_2D(kg, *rng_hash, sample, num_samples, PRNG_FILTER_U, fx, fy); - } + return rng_hash; } /* Linear Congruential Generator */ @@ -175,113 +176,12 @@ ccl_device uint lcg_init(uint seed) return rng; } -/* Path Tracing Utility Functions - * - * For each random number in each step of the path we must have a unique - * dimension to avoid using the same sequence twice. - * - * For branches in the path we must be careful not to reuse the same number - * in a sequence and offset accordingly. 
- */ - -ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, - const ccl_addr_space PathState *state, - int dimension) -{ - return path_rng_1D( - kg, state->rng_hash, state->sample, state->num_samples, state->rng_offset + dimension); -} - -ccl_device_inline void path_state_rng_2D( - KernelGlobals *kg, const ccl_addr_space PathState *state, int dimension, float *fx, float *fy) -{ - path_rng_2D(kg, - state->rng_hash, - state->sample, - state->num_samples, - state->rng_offset + dimension, - fx, - fy); -} - -ccl_device_inline float path_state_rng_1D_hash(KernelGlobals *kg, - const ccl_addr_space PathState *state, - uint hash) -{ - /* Use a hash instead of dimension, this is not great but avoids adding - * more dimensions to each bounce which reduces quality of dimensions we - * are already using. */ - return path_rng_1D(kg, - cmj_hash_simple(state->rng_hash, hash), - state->sample, - state->num_samples, - state->rng_offset); -} - -ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, - uint rng_hash, - const ccl_addr_space PathState *state, - int branch, - int num_branches, - int dimension) -{ - return path_rng_1D(kg, - rng_hash, - state->sample * num_branches + branch, - state->num_samples * num_branches, - state->rng_offset + dimension); -} - -ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, - uint rng_hash, - const ccl_addr_space PathState *state, - int branch, - int num_branches, - int dimension, - float *fx, - float *fy) -{ - path_rng_2D(kg, - rng_hash, - state->sample * num_branches + branch, - state->num_samples * num_branches, - state->rng_offset + dimension, - fx, - fy); -} - -/* Utility functions to get light termination value, - * since it might not be needed in many cases. 
- */ -ccl_device_inline float path_state_rng_light_termination(KernelGlobals *kg, - const ccl_addr_space PathState *state) -{ - if (kernel_data.integrator.light_inv_rr_threshold > 0.0f) { - return path_state_rng_1D(kg, state, PRNG_LIGHT_TERMINATE); - } - return 0.0f; -} - -ccl_device_inline float path_branched_rng_light_termination(KernelGlobals *kg, - uint rng_hash, - const ccl_addr_space PathState *state, - int branch, - int num_branches) -{ - if (kernel_data.integrator.light_inv_rr_threshold > 0.0f) { - return path_branched_rng_1D(kg, rng_hash, state, branch, num_branches, PRNG_LIGHT_TERMINATE); - } - return 0.0f; -} - -ccl_device_inline uint lcg_state_init(PathState *state, uint scramble) -{ - return lcg_init(state->rng_hash + state->rng_offset + state->sample * scramble); -} - -ccl_device_inline uint lcg_state_init_addrspace(ccl_addr_space PathState *state, uint scramble) +ccl_device_inline uint lcg_state_init(const uint rng_hash, + const uint rng_offset, + const uint sample, + const uint scramble) { - return lcg_init(state->rng_hash + state->rng_offset + state->sample * scramble); + return lcg_init(rng_hash + rng_offset + sample * scramble); } ccl_device float lcg_step_float_addrspace(ccl_addr_space uint *rng) @@ -301,8 +201,6 @@ ccl_device_inline bool sample_is_even(int pattern, int sample) return __builtin_popcount(sample & 0xaaaaaaaa) & 1; #elif defined(__NVCC__) return __popc(sample & 0xaaaaaaaa) & 1; -#elif defined(__KERNEL_OPENCL__) - return popcount(sample & 0xaaaaaaaa) & 1; #else /* TODO(Stefan): pop-count intrinsic for Windows with fallback for older CPUs. */ int i = sample & 0xaaaaaaaa; diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h index 7f02e6fc7b3..3052bb53040 100644 --- a/intern/cycles/kernel/kernel_shader.h +++ b/intern/cycles/kernel/kernel_shader.h @@ -14,14 +14,9 @@ * limitations under the License. 
*/ -/* - * ShaderData, used in four steps: - * - * Setup from incoming ray, sampled position and background. - * Execute for surface, volume or displacement. - * Evaluate one or more closures. - * Release. - */ +/* Functions to evaluate shaders and use the resulting shader closures. */ + +#pragma once // clang-format off #include "kernel/closure/alloc.h" @@ -30,479 +25,39 @@ #include "kernel/closure/emissive.h" // clang-format on +#include "kernel/kernel_accumulate.h" #include "kernel/svm/svm.h" -CCL_NAMESPACE_BEGIN - -/* ShaderData setup from incoming ray */ - -#ifdef __OBJECT_MOTION__ -ccl_device void shader_setup_object_transforms(KernelGlobals *kg, ShaderData *sd, float time) -{ - if (sd->object_flag & SD_OBJECT_MOTION) { - sd->ob_tfm = object_fetch_transform_motion(kg, sd->object, time); - sd->ob_itfm = transform_quick_inverse(sd->ob_tfm); - } - else { - sd->ob_tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); - sd->ob_itfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); - } -} -#endif - -#ifdef __KERNEL_OPTIX__ -ccl_device_inline -#else -ccl_device_noinline -#endif - void - shader_setup_from_ray(KernelGlobals *kg, - ShaderData *sd, - const Intersection *isect, - const Ray *ray) -{ - PROFILING_INIT(kg, PROFILING_SHADER_SETUP); - - sd->object = (isect->object == OBJECT_NONE) ? 
kernel_tex_fetch(__prim_object, isect->prim) : - isect->object; - sd->lamp = LAMP_NONE; - - sd->type = isect->type; - sd->flag = 0; - sd->object_flag = kernel_tex_fetch(__object_flag, sd->object); - - /* matrices and time */ -#ifdef __OBJECT_MOTION__ - shader_setup_object_transforms(kg, sd, ray->time); -#endif - sd->time = ray->time; - - sd->prim = kernel_tex_fetch(__prim_index, isect->prim); - sd->ray_length = isect->t; - - sd->u = isect->u; - sd->v = isect->v; - -#ifdef __HAIR__ - if (sd->type & PRIMITIVE_ALL_CURVE) { - /* curve */ - curve_shader_setup(kg, sd, isect, ray); - } - else -#endif - if (sd->type & PRIMITIVE_TRIANGLE) { - /* static triangle */ - float3 Ng = triangle_normal(kg, sd); - sd->shader = kernel_tex_fetch(__tri_shader, sd->prim); - - /* vectors */ - sd->P = triangle_refine(kg, sd, isect, ray); - sd->Ng = Ng; - sd->N = Ng; - - /* smooth normal */ - if (sd->shader & SHADER_SMOOTH_NORMAL) - sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v); - -#ifdef __DPDU__ - /* dPdu/dPdv */ - triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv); -#endif - } - else { - /* motion triangle */ - motion_triangle_shader_setup(kg, sd, isect, ray, false); - } - - sd->I = -ray->D; - - sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags; - - if (isect->object != OBJECT_NONE) { - /* instance transform */ - object_normal_transform_auto(kg, sd, &sd->N); - object_normal_transform_auto(kg, sd, &sd->Ng); -#ifdef __DPDU__ - object_dir_transform_auto(kg, sd, &sd->dPdu); - object_dir_transform_auto(kg, sd, &sd->dPdv); -#endif - } - - /* backfacing test */ - bool backfacing = (dot(sd->Ng, sd->I) < 0.0f); - - if (backfacing) { - sd->flag |= SD_BACKFACING; - sd->Ng = -sd->Ng; - sd->N = -sd->N; -#ifdef __DPDU__ - sd->dPdu = -sd->dPdu; - sd->dPdv = -sd->dPdv; -#endif - } - -#ifdef __RAY_DIFFERENTIALS__ - /* differentials */ - differential_transfer(&sd->dP, ray->dP, ray->D, ray->dD, sd->Ng, isect->t); - differential_incoming(&sd->dI, ray->dD); - 
differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng); -#endif - - PROFILING_SHADER(sd->shader); - PROFILING_OBJECT(sd->object); -} - -/* ShaderData setup from BSSRDF scatter */ - -#ifdef __SUBSURFACE__ -# ifndef __KERNEL_CUDA__ -ccl_device -# else -ccl_device_inline -# endif - void - shader_setup_from_subsurface(KernelGlobals *kg, - ShaderData *sd, - const Intersection *isect, - const Ray *ray) -{ - PROFILING_INIT(kg, PROFILING_SHADER_SETUP); - - const bool backfacing = sd->flag & SD_BACKFACING; - - /* object, matrices, time, ray_length stay the same */ - sd->flag = 0; - sd->object_flag = kernel_tex_fetch(__object_flag, sd->object); - sd->prim = kernel_tex_fetch(__prim_index, isect->prim); - sd->type = isect->type; - - sd->u = isect->u; - sd->v = isect->v; - - /* fetch triangle data */ - if (sd->type == PRIMITIVE_TRIANGLE) { - float3 Ng = triangle_normal(kg, sd); - sd->shader = kernel_tex_fetch(__tri_shader, sd->prim); - - /* static triangle */ - sd->P = triangle_refine_local(kg, sd, isect, ray); - sd->Ng = Ng; - sd->N = Ng; - - if (sd->shader & SHADER_SMOOTH_NORMAL) - sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v); - -# ifdef __DPDU__ - /* dPdu/dPdv */ - triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv); -# endif - } - else { - /* motion triangle */ - motion_triangle_shader_setup(kg, sd, isect, ray, true); - } - - sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags; - - if (isect->object != OBJECT_NONE) { - /* instance transform */ - object_normal_transform_auto(kg, sd, &sd->N); - object_normal_transform_auto(kg, sd, &sd->Ng); -# ifdef __DPDU__ - object_dir_transform_auto(kg, sd, &sd->dPdu); - object_dir_transform_auto(kg, sd, &sd->dPdv); -# endif - } - - /* backfacing test */ - if (backfacing) { - sd->flag |= SD_BACKFACING; - sd->Ng = -sd->Ng; - sd->N = -sd->N; -# ifdef __DPDU__ - sd->dPdu = -sd->dPdu; - sd->dPdv = -sd->dPdv; -# endif - } - - /* should not get used in principle as the shading will only 
use a diffuse - * BSDF, but the shader might still access it */ - sd->I = sd->N; - -# ifdef __RAY_DIFFERENTIALS__ - /* differentials */ - differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng); - /* don't modify dP and dI */ -# endif - - PROFILING_SHADER(sd->shader); -} -#endif - -/* ShaderData setup from position sampled on mesh */ - -ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg, - ShaderData *sd, - const float3 P, - const float3 Ng, - const float3 I, - int shader, - int object, - int prim, - float u, - float v, - float t, - float time, - bool object_space, - int lamp) -{ - PROFILING_INIT(kg, PROFILING_SHADER_SETUP); - - /* vectors */ - sd->P = P; - sd->N = Ng; - sd->Ng = Ng; - sd->I = I; - sd->shader = shader; - if (prim != PRIM_NONE) - sd->type = PRIMITIVE_TRIANGLE; - else if (lamp != LAMP_NONE) - sd->type = PRIMITIVE_LAMP; - else - sd->type = PRIMITIVE_NONE; - - /* primitive */ - sd->object = object; - sd->lamp = LAMP_NONE; - /* Currently no access to bvh prim index for strand sd->prim. 
*/ - sd->prim = prim; - sd->u = u; - sd->v = v; - sd->time = time; - sd->ray_length = t; - - sd->flag = kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags; - sd->object_flag = 0; - if (sd->object != OBJECT_NONE) { - sd->object_flag |= kernel_tex_fetch(__object_flag, sd->object); - -#ifdef __OBJECT_MOTION__ - shader_setup_object_transforms(kg, sd, time); - } - else if (lamp != LAMP_NONE) { - sd->ob_tfm = lamp_fetch_transform(kg, lamp, false); - sd->ob_itfm = lamp_fetch_transform(kg, lamp, true); - sd->lamp = lamp; -#else - } - else if (lamp != LAMP_NONE) { - sd->lamp = lamp; -#endif - } - - /* transform into world space */ - if (object_space) { - object_position_transform_auto(kg, sd, &sd->P); - object_normal_transform_auto(kg, sd, &sd->Ng); - sd->N = sd->Ng; - object_dir_transform_auto(kg, sd, &sd->I); - } - - if (sd->type & PRIMITIVE_TRIANGLE) { - /* smooth normal */ - if (sd->shader & SHADER_SMOOTH_NORMAL) { - sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v); - - if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { - object_normal_transform_auto(kg, sd, &sd->N); - } - } - - /* dPdu/dPdv */ -#ifdef __DPDU__ - triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv); - - if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { - object_dir_transform_auto(kg, sd, &sd->dPdu); - object_dir_transform_auto(kg, sd, &sd->dPdv); - } -#endif - } - else { -#ifdef __DPDU__ - sd->dPdu = zero_float3(); - sd->dPdv = zero_float3(); -#endif - } - - /* backfacing test */ - if (sd->prim != PRIM_NONE) { - bool backfacing = (dot(sd->Ng, sd->I) < 0.0f); - - if (backfacing) { - sd->flag |= SD_BACKFACING; - sd->Ng = -sd->Ng; - sd->N = -sd->N; -#ifdef __DPDU__ - sd->dPdu = -sd->dPdu; - sd->dPdv = -sd->dPdv; -#endif - } - } - -#ifdef __RAY_DIFFERENTIALS__ - /* no ray differentials here yet */ - sd->dP = differential3_zero(); - sd->dI = differential3_zero(); - sd->du = differential_zero(); - sd->dv = differential_zero(); -#endif - - PROFILING_SHADER(sd->shader); - 
PROFILING_OBJECT(sd->object); -} - -/* ShaderData setup for displacement */ - -ccl_device void shader_setup_from_displace( - KernelGlobals *kg, ShaderData *sd, int object, int prim, float u, float v) -{ - float3 P, Ng, I = zero_float3(); - int shader; - - triangle_point_normal(kg, object, prim, u, v, &P, &Ng, &shader); - - /* force smooth shading for displacement */ - shader |= SHADER_SMOOTH_NORMAL; - - shader_setup_from_sample( - kg, - sd, - P, - Ng, - I, - shader, - object, - prim, - u, - v, - 0.0f, - 0.5f, - !(kernel_tex_fetch(__object_flag, object) & SD_OBJECT_TRANSFORM_APPLIED), - LAMP_NONE); -} - -/* ShaderData setup from ray into background */ - -ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, - ShaderData *sd, - const Ray *ray) -{ - PROFILING_INIT(kg, PROFILING_SHADER_SETUP); - - /* vectors */ - sd->P = ray->D; - sd->N = -ray->D; - sd->Ng = -ray->D; - sd->I = -ray->D; - sd->shader = kernel_data.background.surface_shader; - sd->flag = kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags; - sd->object_flag = 0; - sd->time = ray->time; - sd->ray_length = 0.0f; - - sd->object = OBJECT_NONE; - sd->lamp = LAMP_NONE; - sd->prim = PRIM_NONE; - sd->u = 0.0f; - sd->v = 0.0f; - -#ifdef __DPDU__ - /* dPdu/dPdv */ - sd->dPdu = zero_float3(); - sd->dPdv = zero_float3(); -#endif - -#ifdef __RAY_DIFFERENTIALS__ - /* differentials */ - sd->dP = ray->dD; - differential_incoming(&sd->dI, sd->dP); - sd->du = differential_zero(); - sd->dv = differential_zero(); +#ifdef __OSL__ +# include "kernel/osl/osl_shader.h" #endif - /* for NDC coordinates */ - sd->ray_P = ray->P; - - PROFILING_SHADER(sd->shader); - PROFILING_OBJECT(sd->object); -} - -/* ShaderData setup from point inside volume */ - -#ifdef __VOLUME__ -ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *sd, const Ray *ray) -{ - PROFILING_INIT(kg, PROFILING_SHADER_SETUP); - - /* vectors */ - sd->P = ray->P; - sd->N = -ray->D; - sd->Ng = -ray->D; - sd->I = -ray->D; 
- sd->shader = SHADER_NONE; - sd->flag = 0; - sd->object_flag = 0; - sd->time = ray->time; - sd->ray_length = 0.0f; /* todo: can we set this to some useful value? */ - - sd->object = OBJECT_NONE; /* todo: fill this for texture coordinates */ - sd->lamp = LAMP_NONE; - sd->prim = PRIM_NONE; - sd->type = PRIMITIVE_NONE; - - sd->u = 0.0f; - sd->v = 0.0f; - -# ifdef __DPDU__ - /* dPdu/dPdv */ - sd->dPdu = zero_float3(); - sd->dPdv = zero_float3(); -# endif - -# ifdef __RAY_DIFFERENTIALS__ - /* differentials */ - sd->dP = ray->dD; - differential_incoming(&sd->dI, sd->dP); - sd->du = differential_zero(); - sd->dv = differential_zero(); -# endif - - /* for NDC coordinates */ - sd->ray_P = ray->P; - sd->ray_dP = ray->dP; - - PROFILING_SHADER(sd->shader); - PROFILING_OBJECT(sd->object); -} -#endif /* __VOLUME__ */ +CCL_NAMESPACE_BEGIN /* Merging */ -#if defined(__BRANCHED_PATH__) || defined(__VOLUME__) -ccl_device_inline void shader_merge_closures(ShaderData *sd) +#if defined(__VOLUME__) +ccl_device_inline void shader_merge_volume_closures(ShaderData *sd) { - /* merge identical closures, better when we sample a single closure at a time */ + /* Merge identical closures to save closure space with stacked volumes. */ for (int i = 0; i < sd->num_closure; i++) { ShaderClosure *sci = &sd->closure[i]; + if (sci->type != CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) { + continue; + } + for (int j = i + 1; j < sd->num_closure; j++) { ShaderClosure *scj = &sd->closure[j]; - - if (sci->type != scj->type) + if (sci->type != scj->type) { continue; - if (!bsdf_merge(sci, scj)) + } + + const HenyeyGreensteinVolume *hgi = (const HenyeyGreensteinVolume *)sci; + const HenyeyGreensteinVolume *hgj = (const HenyeyGreensteinVolume *)scj; + if (!(hgi->g == hgj->g)) { continue; + } sci->weight += scj->weight; sci->sample_weight += scj->sample_weight; @@ -520,16 +75,40 @@ ccl_device_inline void shader_merge_closures(ShaderData *sd) } } } -#endif /* __BRANCHED_PATH__ || __VOLUME__ */ -/* Defensive sampling. 
*/ +ccl_device_inline void shader_copy_volume_phases(ShaderVolumePhases *ccl_restrict phases, + const ShaderData *ccl_restrict sd) +{ + phases->num_closure = 0; + + for (int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *from_sc = &sd->closure[i]; + const HenyeyGreensteinVolume *from_hg = (const HenyeyGreensteinVolume *)from_sc; + + if (from_sc->type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) { + ShaderVolumeClosure *to_sc = &phases->closure[phases->num_closure]; + + to_sc->weight = from_sc->weight; + to_sc->sample_weight = from_sc->sample_weight; + to_sc->g = from_hg->g; + phases->num_closure++; + if (phases->num_closure >= MAX_VOLUME_CLOSURE) { + break; + } + } + } +} +#endif /* __VOLUME__ */ -ccl_device_inline void shader_prepare_closures(ShaderData *sd, ccl_addr_space PathState *state) +ccl_device_inline void shader_prepare_surface_closures(INTEGRATOR_STATE_CONST_ARGS, ShaderData *sd) { - /* We can likely also do defensive sampling at deeper bounces, particularly + /* Defensive sampling. + * + * We can likely also do defensive sampling at deeper bounces, particularly * for cases like a perfect mirror but possibly also others. This will need * a good heuristic. */ - if (state->bounce + state->transparent_bounce == 0 && sd->num_closure > 1) { + if (INTEGRATOR_STATE(path, bounce) + INTEGRATOR_STATE(path, transparent_bounce) == 0 && + sd->num_closure > 1) { float sum = 0.0f; for (int i = 0; i < sd->num_closure; i++) { @@ -546,98 +125,119 @@ ccl_device_inline void shader_prepare_closures(ShaderData *sd, ccl_addr_space Pa } } } + + /* Filter glossy. 
+ * + * Blurring of bsdf after bounces, for rays that have a small likelihood + * of following this particular path (diffuse, rough glossy) */ + if (kernel_data.integrator.filter_glossy != FLT_MAX) { + float blur_pdf = kernel_data.integrator.filter_glossy * INTEGRATOR_STATE(path, min_ray_pdf); + + if (blur_pdf < 1.0f) { + float blur_roughness = sqrtf(1.0f - blur_pdf) * 0.5f; + + for (int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; + if (CLOSURE_IS_BSDF(sc->type)) { + bsdf_blur(kg, sc, blur_roughness); + } + } + } + } } /* BSDF */ -ccl_device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg, - ShaderData *sd, - const float3 omega_in, - float *pdf, - const ShaderClosure *skip_sc, - BsdfEval *result_eval, - float sum_pdf, - float sum_sample_weight) +ccl_device_inline bool shader_bsdf_is_transmission(const ShaderData *sd, const float3 omega_in) +{ + return dot(sd->N, omega_in) < 0.0f; +} + +ccl_device_forceinline bool _shader_bsdf_exclude(ClosureType type, uint light_shader_flags) +{ + if (!(light_shader_flags & SHADER_EXCLUDE_ANY)) { + return false; + } + if (light_shader_flags & SHADER_EXCLUDE_DIFFUSE) { + if (CLOSURE_IS_BSDF_DIFFUSE(type) || CLOSURE_IS_BSDF_BSSRDF(type)) { + return true; + } + } + if (light_shader_flags & SHADER_EXCLUDE_GLOSSY) { + if (CLOSURE_IS_BSDF_GLOSSY(type)) { + return true; + } + } + if (light_shader_flags & SHADER_EXCLUDE_TRANSMIT) { + if (CLOSURE_IS_BSDF_TRANSMISSION(type)) { + return true; + } + } + return false; +} + +ccl_device_inline float _shader_bsdf_multi_eval(const KernelGlobals *kg, + ShaderData *sd, + const float3 omega_in, + const bool is_transmission, + const ShaderClosure *skip_sc, + BsdfEval *result_eval, + float sum_pdf, + float sum_sample_weight, + const uint light_shader_flags) { /* this is the veach one-sample model with balance heuristic, some pdf * factors drop out when using balance heuristic weighting */ for (int i = 0; i < sd->num_closure; i++) { const ShaderClosure *sc = 
&sd->closure[i]; - if (sc != skip_sc && CLOSURE_IS_BSDF(sc->type)) { - float bsdf_pdf = 0.0f; - float3 eval = bsdf_eval(kg, sd, sc, omega_in, &bsdf_pdf); + if (sc == skip_sc) { + continue; + } + + if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) { + if (CLOSURE_IS_BSDF(sc->type) && !_shader_bsdf_exclude(sc->type, light_shader_flags)) { + float bsdf_pdf = 0.0f; + float3 eval = bsdf_eval(kg, sd, sc, omega_in, is_transmission, &bsdf_pdf); - if (bsdf_pdf != 0.0f) { - bsdf_eval_accum(result_eval, sc->type, eval * sc->weight, 1.0f); - sum_pdf += bsdf_pdf * sc->sample_weight; + if (bsdf_pdf != 0.0f) { + const bool is_diffuse = (CLOSURE_IS_BSDF_DIFFUSE(sc->type) || + CLOSURE_IS_BSDF_BSSRDF(sc->type)); + bsdf_eval_accum(result_eval, is_diffuse, eval * sc->weight, 1.0f); + sum_pdf += bsdf_pdf * sc->sample_weight; + } } sum_sample_weight += sc->sample_weight; } } - *pdf = (sum_sample_weight > 0.0f) ? sum_pdf / sum_sample_weight : 0.0f; -} - -#ifdef __BRANCHED_PATH__ -ccl_device_inline void _shader_bsdf_multi_eval_branched(KernelGlobals *kg, - ShaderData *sd, - const float3 omega_in, - BsdfEval *result_eval, - float light_pdf, - bool use_mis) -{ - for (int i = 0; i < sd->num_closure; i++) { - const ShaderClosure *sc = &sd->closure[i]; - if (CLOSURE_IS_BSDF(sc->type)) { - float bsdf_pdf = 0.0f; - float3 eval = bsdf_eval(kg, sd, sc, omega_in, &bsdf_pdf); - if (bsdf_pdf != 0.0f) { - float mis_weight = use_mis ? power_heuristic(light_pdf, bsdf_pdf) : 1.0f; - bsdf_eval_accum(result_eval, sc->type, eval * sc->weight, mis_weight); - } - } - } + return (sum_sample_weight > 0.0f) ? 
sum_pdf / sum_sample_weight : 0.0f; } -#endif /* __BRANCHED_PATH__ */ #ifndef __KERNEL_CUDA__ ccl_device #else ccl_device_inline #endif - void - shader_bsdf_eval(KernelGlobals *kg, + float + shader_bsdf_eval(const KernelGlobals *kg, ShaderData *sd, const float3 omega_in, - BsdfEval *eval, - float light_pdf, - bool use_mis) + const bool is_transmission, + BsdfEval *bsdf_eval, + const uint light_shader_flags) { - PROFILING_INIT(kg, PROFILING_CLOSURE_EVAL); - - bsdf_eval_init(eval, NBUILTIN_CLOSURES, zero_float3(), kernel_data.film.use_light_pass); + bsdf_eval_init(bsdf_eval, false, zero_float3()); -#ifdef __BRANCHED_PATH__ - if (kernel_data.integrator.branched) - _shader_bsdf_multi_eval_branched(kg, sd, omega_in, eval, light_pdf, use_mis); - else -#endif - { - float pdf; - _shader_bsdf_multi_eval(kg, sd, omega_in, &pdf, NULL, eval, 0.0f, 0.0f); - if (use_mis) { - float weight = power_heuristic(light_pdf, pdf); - bsdf_eval_mis(eval, weight); - } - } + return _shader_bsdf_multi_eval( + kg, sd, omega_in, is_transmission, NULL, bsdf_eval, 0.0f, 0.0f, light_shader_flags); } -ccl_device_inline const ShaderClosure *shader_bsdf_pick(ShaderData *sd, float *randu) +/* Randomly sample a BSSRDF or BSDF proportional to ShaderClosure.sample_weight. */ +ccl_device_inline const ShaderClosure *shader_bsdf_bssrdf_pick(const ShaderData *ccl_restrict sd, + float *randu) { - /* Note the sampling here must match shader_bssrdf_pick, - * since we reuse the same random number. */ int sampled = 0; if (sd->num_closure > 1) { @@ -674,106 +274,33 @@ ccl_device_inline const ShaderClosure *shader_bsdf_pick(ShaderData *sd, float *r } } - const ShaderClosure *sc = &sd->closure[sampled]; - return CLOSURE_IS_BSDF(sc->type) ? sc : NULL; + return &sd->closure[sampled]; } -ccl_device_inline const ShaderClosure *shader_bssrdf_pick(ShaderData *sd, - ccl_addr_space float3 *throughput, - float *randu) +/* Return weight for picked BSSRDF. 
*/ +ccl_device_inline float3 shader_bssrdf_sample_weight(const ShaderData *ccl_restrict sd, + const ShaderClosure *ccl_restrict bssrdf_sc) { - /* Note the sampling here must match shader_bsdf_pick, - * since we reuse the same random number. */ - int sampled = 0; + float3 weight = bssrdf_sc->weight; if (sd->num_closure > 1) { - /* Pick a BSDF or BSSRDF or based on sample weights. */ - float sum_bsdf = 0.0f; - float sum_bssrdf = 0.0f; - - for (int i = 0; i < sd->num_closure; i++) { - const ShaderClosure *sc = &sd->closure[i]; - - if (CLOSURE_IS_BSDF(sc->type)) { - sum_bsdf += sc->sample_weight; - } - else if (CLOSURE_IS_BSSRDF(sc->type)) { - sum_bssrdf += sc->sample_weight; - } - } - - float r = (*randu) * (sum_bsdf + sum_bssrdf); - float partial_sum = 0.0f; - + float sum = 0.0f; for (int i = 0; i < sd->num_closure; i++) { const ShaderClosure *sc = &sd->closure[i]; if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) { - float next_sum = partial_sum + sc->sample_weight; - - if (r < next_sum) { - if (CLOSURE_IS_BSDF(sc->type)) { - *throughput *= (sum_bsdf + sum_bssrdf) / sum_bsdf; - return NULL; - } - else { - *throughput *= (sum_bsdf + sum_bssrdf) / sum_bssrdf; - sampled = i; - - /* Rescale to reuse for direction sample, to better preserve stratification. */ - *randu = (r - partial_sum) / sc->sample_weight; - break; - } - } - - partial_sum = next_sum; + sum += sc->sample_weight; } } + weight *= sum / bssrdf_sc->sample_weight; } - const ShaderClosure *sc = &sd->closure[sampled]; - return CLOSURE_IS_BSSRDF(sc->type) ? sc : NULL; -} - -ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg, - ShaderData *sd, - float randu, - float randv, - BsdfEval *bsdf_eval, - float3 *omega_in, - differential3 *domega_in, - float *pdf) -{ - PROFILING_INIT(kg, PROFILING_CLOSURE_SAMPLE); - - const ShaderClosure *sc = shader_bsdf_pick(sd, &randu); - if (sc == NULL) { - *pdf = 0.0f; - return LABEL_NONE; - } - - /* BSSRDF should already have been handled elsewhere. 
*/ - kernel_assert(CLOSURE_IS_BSDF(sc->type)); - - int label; - float3 eval = zero_float3(); - - *pdf = 0.0f; - label = bsdf_sample(kg, sd, sc, randu, randv, &eval, omega_in, domega_in, pdf); - - if (*pdf != 0.0f) { - bsdf_eval_init(bsdf_eval, sc->type, eval * sc->weight, kernel_data.film.use_light_pass); - - if (sd->num_closure > 1) { - float sweight = sc->sample_weight; - _shader_bsdf_multi_eval(kg, sd, *omega_in, pdf, sc, bsdf_eval, *pdf * sweight, sweight); - } - } - - return label; + return weight; } -ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg, +/* Sample direction for picked BSDF, and return evaluation and pdf for all + * BSDFs combined using MIS. */ +ccl_device int shader_bsdf_sample_closure(const KernelGlobals *kg, ShaderData *sd, const ShaderClosure *sc, float randu, @@ -783,7 +310,8 @@ ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg, differential3 *domega_in, float *pdf) { - PROFILING_INIT(kg, PROFILING_CLOSURE_SAMPLE); + /* BSSRDF should already have been handled elsewhere. 
*/ + kernel_assert(CLOSURE_IS_BSDF(sc->type)); int label; float3 eval = zero_float3(); @@ -791,19 +319,29 @@ ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg, *pdf = 0.0f; label = bsdf_sample(kg, sd, sc, randu, randv, &eval, omega_in, domega_in, pdf); - if (*pdf != 0.0f) - bsdf_eval_init(bsdf_eval, sc->type, eval * sc->weight, kernel_data.film.use_light_pass); + if (*pdf != 0.0f) { + const bool is_diffuse = (CLOSURE_IS_BSDF_DIFFUSE(sc->type) || + CLOSURE_IS_BSDF_BSSRDF(sc->type)); + bsdf_eval_init(bsdf_eval, is_diffuse, eval * sc->weight); + + if (sd->num_closure > 1) { + const bool is_transmission = shader_bsdf_is_transmission(sd, *omega_in); + float sweight = sc->sample_weight; + *pdf = _shader_bsdf_multi_eval( + kg, sd, *omega_in, is_transmission, sc, bsdf_eval, *pdf * sweight, sweight, 0); + } + } return label; } -ccl_device float shader_bsdf_average_roughness(ShaderData *sd) +ccl_device float shader_bsdf_average_roughness(const ShaderData *sd) { float roughness = 0.0f; float sum_weight = 0.0f; for (int i = 0; i < sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; + const ShaderClosure *sc = &sd->closure[i]; if (CLOSURE_IS_BSDF(sc->type)) { /* sqrt once to undo the squaring from multiplying roughness on the @@ -817,17 +355,7 @@ ccl_device float shader_bsdf_average_roughness(ShaderData *sd) return (sum_weight > 0.0f) ? 
roughness / sum_weight : 0.0f; } -ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughness) -{ - for (int i = 0; i < sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; - - if (CLOSURE_IS_BSDF(sc->type)) - bsdf_blur(kg, sc, roughness); - } -} - -ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, const ShaderData *sd) +ccl_device float3 shader_bsdf_transparency(const KernelGlobals *kg, const ShaderData *sd) { if (sd->flag & SD_HAS_ONLY_VOLUME) { return one_float3(); @@ -840,7 +368,7 @@ ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, const ShaderData * } } -ccl_device void shader_bsdf_disable_transparency(KernelGlobals *kg, ShaderData *sd) +ccl_device void shader_bsdf_disable_transparency(const KernelGlobals *kg, ShaderData *sd) { if (sd->flag & SD_TRANSPARENT) { for (int i = 0; i < sd->num_closure; i++) { @@ -856,7 +384,7 @@ ccl_device void shader_bsdf_disable_transparency(KernelGlobals *kg, ShaderData * } } -ccl_device float3 shader_bsdf_alpha(KernelGlobals *kg, ShaderData *sd) +ccl_device float3 shader_bsdf_alpha(const KernelGlobals *kg, const ShaderData *sd) { float3 alpha = one_float3() - shader_bsdf_transparency(kg, sd); @@ -866,12 +394,12 @@ ccl_device float3 shader_bsdf_alpha(KernelGlobals *kg, ShaderData *sd) return alpha; } -ccl_device float3 shader_bsdf_diffuse(KernelGlobals *kg, ShaderData *sd) +ccl_device float3 shader_bsdf_diffuse(const KernelGlobals *kg, const ShaderData *sd) { float3 eval = zero_float3(); for (int i = 0; i < sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; + const ShaderClosure *sc = &sd->closure[i]; if (CLOSURE_IS_BSDF_DIFFUSE(sc->type) || CLOSURE_IS_BSSRDF(sc->type) || CLOSURE_IS_BSDF_BSSRDF(sc->type)) @@ -881,12 +409,12 @@ ccl_device float3 shader_bsdf_diffuse(KernelGlobals *kg, ShaderData *sd) return eval; } -ccl_device float3 shader_bsdf_glossy(KernelGlobals *kg, ShaderData *sd) +ccl_device float3 shader_bsdf_glossy(const KernelGlobals *kg, const 
ShaderData *sd) { float3 eval = zero_float3(); for (int i = 0; i < sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; + const ShaderClosure *sc = &sd->closure[i]; if (CLOSURE_IS_BSDF_GLOSSY(sc->type)) eval += sc->weight; @@ -895,12 +423,12 @@ ccl_device float3 shader_bsdf_glossy(KernelGlobals *kg, ShaderData *sd) return eval; } -ccl_device float3 shader_bsdf_transmission(KernelGlobals *kg, ShaderData *sd) +ccl_device float3 shader_bsdf_transmission(const KernelGlobals *kg, const ShaderData *sd) { float3 eval = zero_float3(); for (int i = 0; i < sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; + const ShaderClosure *sc = &sd->closure[i]; if (CLOSURE_IS_BSDF_TRANSMISSION(sc->type)) eval += sc->weight; @@ -909,12 +437,12 @@ ccl_device float3 shader_bsdf_transmission(KernelGlobals *kg, ShaderData *sd) return eval; } -ccl_device float3 shader_bsdf_average_normal(KernelGlobals *kg, ShaderData *sd) +ccl_device float3 shader_bsdf_average_normal(const KernelGlobals *kg, const ShaderData *sd) { float3 N = zero_float3(); for (int i = 0; i < sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; + const ShaderClosure *sc = &sd->closure[i]; if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) N += sc->N * fabsf(average(sc->weight)); } @@ -922,59 +450,44 @@ ccl_device float3 shader_bsdf_average_normal(KernelGlobals *kg, ShaderData *sd) return (is_zero(N)) ? sd->N : normalize(N); } -ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_factor, float3 *N_) +ccl_device float3 shader_bsdf_ao_normal(const KernelGlobals *kg, const ShaderData *sd) { - float3 eval = zero_float3(); float3 N = zero_float3(); for (int i = 0; i < sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; - + const ShaderClosure *sc = &sd->closure[i]; if (CLOSURE_IS_BSDF_DIFFUSE(sc->type)) { const DiffuseBsdf *bsdf = (const DiffuseBsdf *)sc; - eval += sc->weight * ao_factor; N += bsdf->N * fabsf(average(sc->weight)); } } - *N_ = (is_zero(N)) ? 
sd->N : normalize(N); - return eval; + return (is_zero(N)) ? sd->N : normalize(N); } #ifdef __SUBSURFACE__ -ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_blur_) +ccl_device float3 shader_bssrdf_normal(const ShaderData *sd) { - float3 eval = zero_float3(); float3 N = zero_float3(); - float texture_blur = 0.0f, weight_sum = 0.0f; for (int i = 0; i < sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; + const ShaderClosure *sc = &sd->closure[i]; if (CLOSURE_IS_BSSRDF(sc->type)) { const Bssrdf *bssrdf = (const Bssrdf *)sc; float avg_weight = fabsf(average(sc->weight)); N += bssrdf->N * avg_weight; - eval += sc->weight; - texture_blur += bssrdf->texture_blur * avg_weight; - weight_sum += avg_weight; } } - if (N_) - *N_ = (is_zero(N)) ? sd->N : normalize(N); - - if (texture_blur_) - *texture_blur_ = safe_divide(texture_blur, weight_sum); - - return eval; + return (is_zero(N)) ? sd->N : normalize(N); } #endif /* __SUBSURFACE__ */ /* Constant emission optimization */ -ccl_device bool shader_constant_emission_eval(KernelGlobals *kg, int shader, float3 *eval) +ccl_device bool shader_constant_emission_eval(const KernelGlobals *kg, int shader, float3 *eval) { int shader_index = shader & SHADER_MASK; int shader_flag = kernel_tex_fetch(__shaders, shader_index).flags; @@ -992,7 +505,7 @@ ccl_device bool shader_constant_emission_eval(KernelGlobals *kg, int shader, flo /* Background */ -ccl_device float3 shader_background_eval(ShaderData *sd) +ccl_device float3 shader_background_eval(const ShaderData *sd) { if (sd->flag & SD_EMISSION) { return sd->closure_emission_background; @@ -1004,7 +517,7 @@ ccl_device float3 shader_background_eval(ShaderData *sd) /* Emission */ -ccl_device float3 shader_emissive_eval(ShaderData *sd) +ccl_device float3 shader_emissive_eval(const ShaderData *sd) { if (sd->flag & SD_EMISSION) { return emissive_simple_eval(sd->Ng, sd->I) * sd->closure_emission_background; @@ -1016,7 +529,7 @@ ccl_device float3 
shader_emissive_eval(ShaderData *sd) /* Holdout */ -ccl_device float3 shader_holdout_apply(KernelGlobals *kg, ShaderData *sd) +ccl_device float3 shader_holdout_apply(const KernelGlobals *kg, ShaderData *sd) { float3 weight = zero_float3(); @@ -1041,7 +554,7 @@ ccl_device float3 shader_holdout_apply(KernelGlobals *kg, ShaderData *sd) } else { for (int i = 0; i < sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; + const ShaderClosure *sc = &sd->closure[i]; if (CLOSURE_IS_HOLDOUT(sc->type)) { weight += sc->weight; } @@ -1053,14 +566,12 @@ ccl_device float3 shader_holdout_apply(KernelGlobals *kg, ShaderData *sd) /* Surface Evaluation */ -ccl_device void shader_eval_surface(KernelGlobals *kg, - ShaderData *sd, - ccl_addr_space PathState *state, - ccl_global float *buffer, +template<uint node_feature_mask> +ccl_device void shader_eval_surface(INTEGRATOR_STATE_CONST_ARGS, + ShaderData *ccl_restrict sd, + ccl_global float *ccl_restrict buffer, int path_flag) { - PROFILING_INIT(kg, PROFILING_SHADER_EVAL); - /* If path is being terminated, we are tracing a shadow ray or evaluating * emission, then we don't need to store closures. The emission and shadow * shader data also do not have a closure array to save GPU memory. 
*/ @@ -1069,7 +580,7 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, max_closures = 0; } else { - max_closures = kernel_data.integrator.max_closures; + max_closures = kernel_data.max_closures; } sd->num_closure = 0; @@ -1078,17 +589,18 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, #ifdef __OSL__ if (kg->osl) { if (sd->object == OBJECT_NONE && sd->lamp == LAMP_NONE) { - OSLShader::eval_background(kg, sd, state, path_flag); + OSLShader::eval_background(INTEGRATOR_STATE_PASS, sd, path_flag); } else { - OSLShader::eval_surface(kg, sd, state, path_flag); + OSLShader::eval_surface(INTEGRATOR_STATE_PASS, sd, path_flag); } } else #endif { #ifdef __SVM__ - svm_eval_nodes(kg, sd, state, buffer, SHADER_TYPE_SURFACE, path_flag); + svm_eval_nodes<node_feature_mask, SHADER_TYPE_SURFACE>( + INTEGRATOR_STATE_PASS, sd, buffer, path_flag); #else if (sd->object == OBJECT_NONE) { sd->closure_emission_background = make_float3(0.8f, 0.8f, 0.8f); @@ -1105,8 +617,11 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, #endif } - if (sd->flag & SD_BSDF_NEEDS_LCG) { - sd->lcg_state = lcg_state_init_addrspace(state, 0xb4bc3953); + if (KERNEL_NODES_FEATURE(BSDF) && (sd->flag & SD_BSDF_NEEDS_LCG)) { + sd->lcg_state = lcg_state_init(INTEGRATOR_STATE(path, rng_hash), + INTEGRATOR_STATE(path, rng_offset), + INTEGRATOR_STATE(path, sample), + 0xb4bc3953); } } @@ -1114,48 +629,47 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, #ifdef __VOLUME__ -ccl_device_inline void _shader_volume_phase_multi_eval(const ShaderData *sd, - const float3 omega_in, - float *pdf, - int skip_phase, - BsdfEval *result_eval, - float sum_pdf, - float sum_sample_weight) +ccl_device_inline float _shader_volume_phase_multi_eval(const ShaderData *sd, + const ShaderVolumePhases *phases, + const float3 omega_in, + int skip_phase, + BsdfEval *result_eval, + float sum_pdf, + float sum_sample_weight) { - for (int i = 0; i < sd->num_closure; i++) { + for (int i = 0; i < phases->num_closure; i++) { 
if (i == skip_phase) continue; - const ShaderClosure *sc = &sd->closure[i]; - - if (CLOSURE_IS_PHASE(sc->type)) { - float phase_pdf = 0.0f; - float3 eval = volume_phase_eval(sd, sc, omega_in, &phase_pdf); + const ShaderVolumeClosure *svc = &phases->closure[i]; + float phase_pdf = 0.0f; + float3 eval = volume_phase_eval(sd, svc, omega_in, &phase_pdf); - if (phase_pdf != 0.0f) { - bsdf_eval_accum(result_eval, sc->type, eval, 1.0f); - sum_pdf += phase_pdf * sc->sample_weight; - } - - sum_sample_weight += sc->sample_weight; + if (phase_pdf != 0.0f) { + bsdf_eval_accum(result_eval, false, eval, 1.0f); + sum_pdf += phase_pdf * svc->sample_weight; } + + sum_sample_weight += svc->sample_weight; } - *pdf = (sum_sample_weight > 0.0f) ? sum_pdf / sum_sample_weight : 0.0f; + return (sum_sample_weight > 0.0f) ? sum_pdf / sum_sample_weight : 0.0f; } -ccl_device void shader_volume_phase_eval( - KernelGlobals *kg, const ShaderData *sd, const float3 omega_in, BsdfEval *eval, float *pdf) +ccl_device float shader_volume_phase_eval(const KernelGlobals *kg, + const ShaderData *sd, + const ShaderVolumePhases *phases, + const float3 omega_in, + BsdfEval *phase_eval) { - PROFILING_INIT(kg, PROFILING_CLOSURE_VOLUME_EVAL); + bsdf_eval_init(phase_eval, false, zero_float3()); - bsdf_eval_init(eval, NBUILTIN_CLOSURES, zero_float3(), kernel_data.film.use_light_pass); - - _shader_volume_phase_multi_eval(sd, omega_in, pdf, -1, eval, 0.0f, 0.0f); + return _shader_volume_phase_multi_eval(sd, phases, omega_in, -1, phase_eval, 0.0f, 0.0f); } -ccl_device int shader_volume_phase_sample(KernelGlobals *kg, +ccl_device int shader_volume_phase_sample(const KernelGlobals *kg, const ShaderData *sd, + const ShaderVolumePhases *phases, float randu, float randv, BsdfEval *phase_eval, @@ -1163,41 +677,34 @@ ccl_device int shader_volume_phase_sample(KernelGlobals *kg, differential3 *domega_in, float *pdf) { - PROFILING_INIT(kg, PROFILING_CLOSURE_VOLUME_SAMPLE); - int sampled = 0; - if (sd->num_closure > 1) { + if 
(phases->num_closure > 1) { /* pick a phase closure based on sample weights */ float sum = 0.0f; - for (sampled = 0; sampled < sd->num_closure; sampled++) { - const ShaderClosure *sc = &sd->closure[sampled]; - - if (CLOSURE_IS_PHASE(sc->type)) - sum += sc->sample_weight; + for (sampled = 0; sampled < phases->num_closure; sampled++) { + const ShaderVolumeClosure *svc = &phases->closure[sampled]; + sum += svc->sample_weight; } float r = randu * sum; float partial_sum = 0.0f; - for (sampled = 0; sampled < sd->num_closure; sampled++) { - const ShaderClosure *sc = &sd->closure[sampled]; + for (sampled = 0; sampled < phases->num_closure; sampled++) { + const ShaderVolumeClosure *svc = &phases->closure[sampled]; + float next_sum = partial_sum + svc->sample_weight; - if (CLOSURE_IS_PHASE(sc->type)) { - float next_sum = partial_sum + sc->sample_weight; - - if (r <= next_sum) { - /* Rescale to reuse for BSDF direction sample. */ - randu = (r - partial_sum) / sc->sample_weight; - break; - } - - partial_sum = next_sum; + if (r <= next_sum) { + /* Rescale to reuse for BSDF direction sample. 
*/ + randu = (r - partial_sum) / svc->sample_weight; + break; } + + partial_sum = next_sum; } - if (sampled == sd->num_closure) { + if (sampled == phases->num_closure) { *pdf = 0.0f; return LABEL_NONE; } @@ -1205,23 +712,23 @@ ccl_device int shader_volume_phase_sample(KernelGlobals *kg, /* todo: this isn't quite correct, we don't weight anisotropy properly * depending on color channels, even if this is perhaps not a common case */ - const ShaderClosure *sc = &sd->closure[sampled]; + const ShaderVolumeClosure *svc = &phases->closure[sampled]; int label; float3 eval = zero_float3(); *pdf = 0.0f; - label = volume_phase_sample(sd, sc, randu, randv, &eval, omega_in, domega_in, pdf); + label = volume_phase_sample(sd, svc, randu, randv, &eval, omega_in, domega_in, pdf); if (*pdf != 0.0f) { - bsdf_eval_init(phase_eval, sc->type, eval, kernel_data.film.use_light_pass); + bsdf_eval_init(phase_eval, false, eval); } return label; } -ccl_device int shader_phase_sample_closure(KernelGlobals *kg, +ccl_device int shader_phase_sample_closure(const KernelGlobals *kg, const ShaderData *sd, - const ShaderClosure *sc, + const ShaderVolumeClosure *sc, float randu, float randv, BsdfEval *phase_eval, @@ -1229,8 +736,6 @@ ccl_device int shader_phase_sample_closure(KernelGlobals *kg, differential3 *domega_in, float *pdf) { - PROFILING_INIT(kg, PROFILING_CLOSURE_VOLUME_SAMPLE); - int label; float3 eval = zero_float3(); @@ -1238,18 +743,18 @@ ccl_device int shader_phase_sample_closure(KernelGlobals *kg, label = volume_phase_sample(sd, sc, randu, randv, &eval, omega_in, domega_in, pdf); if (*pdf != 0.0f) - bsdf_eval_init(phase_eval, sc->type, eval, kernel_data.film.use_light_pass); + bsdf_eval_init(phase_eval, false, eval); return label; } /* Volume Evaluation */ -ccl_device_inline void shader_eval_volume(KernelGlobals *kg, - ShaderData *sd, - ccl_addr_space PathState *state, - ccl_addr_space VolumeStack *stack, - int path_flag) +template<typename StackReadOp> +ccl_device_inline void 
shader_eval_volume(INTEGRATOR_STATE_CONST_ARGS, + ShaderData *ccl_restrict sd, + const int path_flag, + StackReadOp stack_read) { /* If path is being terminated, we are tracing a shadow ray or evaluating * emission, then we don't need to store closures. The emission and shadow @@ -1259,7 +764,7 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg, max_closures = 0; } else { - max_closures = kernel_data.integrator.max_closures; + max_closures = kernel_data.max_closures; } /* reset closures once at the start, we will be accumulating the closures @@ -1268,14 +773,18 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg, sd->num_closure_left = max_closures; sd->flag = 0; sd->object_flag = 0; - sd->type = PRIMITIVE_VOLUME; - for (int i = 0; stack[i].shader != SHADER_NONE; i++) { + for (int i = 0;; i++) { + const VolumeStack entry = stack_read(i); + if (entry.shader == SHADER_NONE) { + break; + } + /* setup shaderdata from stack. it's mostly setup already in * shader_setup_from_volume, this switching should be quick */ - sd->object = stack[i].object; + sd->object = entry.object; sd->lamp = LAMP_NONE; - sd->shader = stack[i].shader; + sd->shader = entry.shader; sd->flag &= ~SD_SHADER_FLAGS; sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags; @@ -1295,18 +804,19 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg, # ifdef __SVM__ # ifdef __OSL__ if (kg->osl) { - OSLShader::eval_volume(kg, sd, state, path_flag); + OSLShader::eval_volume(INTEGRATOR_STATE_PASS, sd, path_flag); } else # endif { - svm_eval_nodes(kg, sd, state, NULL, SHADER_TYPE_VOLUME, path_flag); + svm_eval_nodes<KERNEL_FEATURE_NODE_MASK_VOLUME, SHADER_TYPE_VOLUME>( + INTEGRATOR_STATE_PASS, sd, NULL, path_flag); } # endif - /* merge closures to avoid exceeding number of closures limit */ + /* Merge closures to avoid exceeding number of closures limit. 
*/ if (i > 0) - shader_merge_closures(sd); + shader_merge_volume_closures(sd); } } @@ -1314,9 +824,7 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg, /* Displacement Evaluation */ -ccl_device void shader_eval_displacement(KernelGlobals *kg, - ShaderData *sd, - ccl_addr_space PathState *state) +ccl_device void shader_eval_displacement(INTEGRATOR_STATE_CONST_ARGS, ShaderData *sd) { sd->num_closure = 0; sd->num_closure_left = 0; @@ -1325,11 +833,12 @@ ccl_device void shader_eval_displacement(KernelGlobals *kg, #ifdef __SVM__ # ifdef __OSL__ if (kg->osl) - OSLShader::eval_displacement(kg, sd, state); + OSLShader::eval_displacement(INTEGRATOR_STATE_PASS, sd); else # endif { - svm_eval_nodes(kg, sd, state, NULL, SHADER_TYPE_DISPLACEMENT, 0); + svm_eval_nodes<KERNEL_FEATURE_NODE_MASK_DISPLACEMENT, SHADER_TYPE_DISPLACEMENT>( + INTEGRATOR_STATE_PASS, sd, NULL, 0); } #endif } @@ -1337,29 +846,13 @@ ccl_device void shader_eval_displacement(KernelGlobals *kg, /* Transparent Shadows */ #ifdef __TRANSPARENT_SHADOWS__ -ccl_device bool shader_transparent_shadow(KernelGlobals *kg, Intersection *isect) +ccl_device bool shader_transparent_shadow(const KernelGlobals *kg, Intersection *isect) { - int prim = kernel_tex_fetch(__prim_index, isect->prim); - int shader = 0; - -# ifdef __HAIR__ - if (isect->type & PRIMITIVE_ALL_TRIANGLE) { -# endif - shader = kernel_tex_fetch(__tri_shader, prim); -# ifdef __HAIR__ - } - else { - float4 str = kernel_tex_fetch(__curves, prim); - shader = __float_as_int(str.z); - } -# endif - int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags; - - return (flag & SD_HAS_TRANSPARENT_SHADOW) != 0; + return (intersection_get_shader_flags(kg, isect) & SD_HAS_TRANSPARENT_SHADOW) != 0; } #endif /* __TRANSPARENT_SHADOWS__ */ -ccl_device float shader_cryptomatte_id(KernelGlobals *kg, int shader) +ccl_device float shader_cryptomatte_id(const KernelGlobals *kg, int shader) { return kernel_tex_fetch(__shaders, (shader & 
SHADER_MASK)).cryptomatte_id; } diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h deleted file mode 100644 index 3b124122fba..00000000000 --- a/intern/cycles/kernel/kernel_shadow.h +++ /dev/null @@ -1,466 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -#ifdef __VOLUME__ -/* Get PathState ready for use for volume stack evaluation. */ -# ifdef __SPLIT_KERNEL__ -ccl_addr_space -# endif - ccl_device_inline PathState * - shadow_blocked_volume_path_state(KernelGlobals *kg, - VolumeState *volume_state, - ccl_addr_space PathState *state, - ShaderData *sd, - Ray *ray) -{ -# ifdef __SPLIT_KERNEL__ - ccl_addr_space PathState *ps = - &kernel_split_state.state_shadow[ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0)]; -# else - PathState *ps = &volume_state->ps; -# endif - *ps = *state; - /* We are checking for shadow on the "other" side of the surface, so need - * to discard volume we are currently at. - */ - if (dot(sd->Ng, ray->D) < 0.0f) { - kernel_volume_stack_enter_exit(kg, sd, ps->volume_stack); - } - return ps; -} -#endif /* __VOLUME__ */ - -/* Attenuate throughput accordingly to the given intersection event. - * Returns true if the throughput is zero and traversal can be aborted. 
- */ -ccl_device_forceinline bool shadow_handle_transparent_isect(KernelGlobals *kg, - ShaderData *shadow_sd, - ccl_addr_space PathState *state, -#ifdef __VOLUME__ - ccl_addr_space PathState *volume_state, -#endif - Intersection *isect, - Ray *ray, - float3 *throughput) -{ -#ifdef __VOLUME__ - /* Attenuation between last surface and next surface. */ - if (volume_state->volume_stack[0].shader != SHADER_NONE) { - Ray segment_ray = *ray; - segment_ray.t = isect->t; - kernel_volume_shadow(kg, shadow_sd, volume_state, &segment_ray, throughput); - } -#endif - /* Setup shader data at surface. */ - shader_setup_from_ray(kg, shadow_sd, isect, ray); - /* Attenuation from transparent surface. */ - if (!(shadow_sd->flag & SD_HAS_ONLY_VOLUME)) { - path_state_modify_bounce(state, true); - shader_eval_surface(kg, shadow_sd, state, NULL, PATH_RAY_SHADOW); - path_state_modify_bounce(state, false); - *throughput *= shader_bsdf_transparency(kg, shadow_sd); - } - /* Stop if all light is blocked. */ - if (is_zero(*throughput)) { - return true; - } -#ifdef __VOLUME__ - /* Exit/enter volume. */ - kernel_volume_stack_enter_exit(kg, shadow_sd, volume_state->volume_stack); -#endif - return false; -} - -/* Special version which only handles opaque shadows. */ -ccl_device bool shadow_blocked_opaque(KernelGlobals *kg, - ShaderData *shadow_sd, - ccl_addr_space PathState *state, - const uint visibility, - Ray *ray, - Intersection *isect, - float3 *shadow) -{ - const bool blocked = scene_intersect(kg, ray, visibility & PATH_RAY_SHADOW_OPAQUE, isect); -#ifdef __VOLUME__ - if (!blocked && state->volume_stack[0].shader != SHADER_NONE) { - /* Apply attenuation from current volume shader. */ - kernel_volume_shadow(kg, shadow_sd, state, ray, shadow); - } -#endif - return blocked; -} - -#ifdef __TRANSPARENT_SHADOWS__ -# ifdef __SHADOW_RECORD_ALL__ -/* Shadow function to compute how much light is blocked, - * - * We trace a single ray. 
If it hits any opaque surface, or more than a given - * number of transparent surfaces is hit, then we consider the geometry to be - * entirely blocked. If not, all transparent surfaces will be recorded and we - * will shade them one by one to determine how much light is blocked. This all - * happens in one scene intersection function. - * - * Recording all hits works well in some cases but may be slower in others. If - * we have many semi-transparent hairs, one intersection may be faster because - * you'd be reinteresecting the same hairs a lot with each step otherwise. If - * however there is mostly binary transparency then we may be recording many - * unnecessary intersections when one of the first surfaces blocks all light. - * - * From tests in real scenes it seems the performance loss is either minimal, - * or there is a performance increase anyway due to avoiding the need to send - * two rays with transparent shadows. - * - * On CPU it'll handle all transparent bounces (by allocating storage for - * intersections when they don't fit into the stack storage). - * - * On GPU it'll only handle SHADOW_STACK_MAX_HITS-1 intersections, so this - * is something to be kept an eye on. - */ - -# define SHADOW_STACK_MAX_HITS 64 - -/* Actual logic with traversal loop implementation which is free from device - * specific tweaks. - * - * Note that hits array should be as big as max_hits+1. - */ -ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg, - ShaderData *sd, - ShaderData *shadow_sd, - ccl_addr_space PathState *state, - const uint visibility, - Ray *ray, - Intersection *hits, - uint max_hits, - float3 *shadow) -{ - /* Intersect to find an opaque surface, or record all transparent - * surface hits. 
- */ - uint num_hits; - const bool blocked = scene_intersect_shadow_all(kg, ray, hits, visibility, max_hits, &num_hits); -# ifdef __VOLUME__ -# ifdef __KERNEL_OPTIX__ - VolumeState &volume_state = kg->volume_state; -# else - VolumeState volume_state; -# endif -# endif - /* If no opaque surface found but we did find transparent hits, - * shade them. - */ - if (!blocked && num_hits > 0) { - float3 throughput = one_float3(); - float3 Pend = ray->P + ray->D * ray->t; - float last_t = 0.0f; - int bounce = state->transparent_bounce; - Intersection *isect = hits; -# ifdef __VOLUME__ -# ifdef __SPLIT_KERNEL__ - ccl_addr_space -# endif - PathState *ps = shadow_blocked_volume_path_state(kg, &volume_state, state, sd, ray); -# endif - sort_intersections(hits, num_hits); - for (int hit = 0; hit < num_hits; hit++, isect++) { - /* Adjust intersection distance for moving ray forward. */ - float new_t = isect->t; - isect->t -= last_t; - /* Skip hit if we did not move forward, step by step raytracing - * would have skipped it as well then. - */ - if (last_t == new_t) { - continue; - } - last_t = new_t; - /* Attenuate the throughput. */ - if (shadow_handle_transparent_isect(kg, - shadow_sd, - state, -# ifdef __VOLUME__ - ps, -# endif - isect, - ray, - &throughput)) { - return true; - } - /* Move ray forward. */ - ray->P = shadow_sd->P; - if (ray->t != FLT_MAX) { - ray->D = normalize_len(Pend - ray->P, &ray->t); - } - bounce++; - } -# ifdef __VOLUME__ - /* Attenuation for last line segment towards light. */ - if (ps->volume_stack[0].shader != SHADER_NONE) { - kernel_volume_shadow(kg, shadow_sd, ps, ray, &throughput); - } -# endif - *shadow = throughput; - return is_zero(throughput); - } -# ifdef __VOLUME__ - if (!blocked && state->volume_stack[0].shader != SHADER_NONE) { - /* Apply attenuation from current volume shader. 
*/ -# ifdef __SPLIT_KERNEL__ - ccl_addr_space -# endif - PathState *ps = shadow_blocked_volume_path_state(kg, &volume_state, state, sd, ray); - kernel_volume_shadow(kg, shadow_sd, ps, ray, shadow); - } -# endif - return blocked; -} - -/* Here we do all device specific trickery before invoking actual traversal - * loop to help readability of the actual logic. - */ -ccl_device bool shadow_blocked_transparent_all(KernelGlobals *kg, - ShaderData *sd, - ShaderData *shadow_sd, - ccl_addr_space PathState *state, - const uint visibility, - Ray *ray, - uint max_hits, - float3 *shadow) -{ -# ifdef __SPLIT_KERNEL__ - Intersection hits_[SHADOW_STACK_MAX_HITS]; - Intersection *hits = &hits_[0]; -# elif defined(__KERNEL_CUDA__) - Intersection *hits = kg->hits_stack; -# else - Intersection hits_stack[SHADOW_STACK_MAX_HITS]; - Intersection *hits = hits_stack; -# endif -# ifndef __KERNEL_GPU__ - /* Prefer to use stack but use dynamic allocation if too deep max hits - * we need max_hits + 1 storage space due to the logic in - * scene_intersect_shadow_all which will first store and then check if - * the limit is exceeded. - * - * Ignore this on GPU because of slow/unavailable malloc(). - */ - if (max_hits + 1 > SHADOW_STACK_MAX_HITS) { - if (kg->transparent_shadow_intersections == NULL) { - const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce; - kg->transparent_shadow_intersections = (Intersection *)malloc(sizeof(Intersection) * - (transparent_max_bounce + 1)); - } - hits = kg->transparent_shadow_intersections; - } -# endif /* __KERNEL_GPU__ */ - /* Invoke actual traversal. */ - return shadow_blocked_transparent_all_loop( - kg, sd, shadow_sd, state, visibility, ray, hits, max_hits, shadow); -} -# endif /* __SHADOW_RECORD_ALL__ */ - -# if defined(__KERNEL_GPU__) || !defined(__SHADOW_RECORD_ALL__) -/* Shadow function to compute how much light is blocked, - * - * Here we raytrace from one transparent surface to the next step by step. 
- * To minimize overhead in cases where we don't need transparent shadows, we - * first trace a regular shadow ray. We check if the hit primitive was - * potentially transparent, and only in that case start marching. this gives - * one extra ray cast for the cases were we do want transparency. - */ - -/* This function is only implementing device-independent traversal logic - * which requires some precalculation done. - */ -ccl_device bool shadow_blocked_transparent_stepped_loop(KernelGlobals *kg, - ShaderData *sd, - ShaderData *shadow_sd, - ccl_addr_space PathState *state, - const uint visibility, - Ray *ray, - Intersection *isect, - const bool blocked, - const bool is_transparent_isect, - float3 *shadow) -{ -# ifdef __VOLUME__ -# ifdef __KERNEL_OPTIX__ - VolumeState &volume_state = kg->volume_state; -# else - VolumeState volume_state; -# endif -# endif - if (blocked && is_transparent_isect) { - float3 throughput = one_float3(); - float3 Pend = ray->P + ray->D * ray->t; - int bounce = state->transparent_bounce; -# ifdef __VOLUME__ -# ifdef __SPLIT_KERNEL__ - ccl_addr_space -# endif - PathState *ps = shadow_blocked_volume_path_state(kg, &volume_state, state, sd, ray); -# endif - for (;;) { - if (bounce >= kernel_data.integrator.transparent_max_bounce) { - return true; - } - if (!scene_intersect(kg, ray, visibility & PATH_RAY_SHADOW_TRANSPARENT, isect)) { - break; - } - if (!shader_transparent_shadow(kg, isect)) { - return true; - } - /* Attenuate the throughput. */ - if (shadow_handle_transparent_isect(kg, - shadow_sd, - state, -# ifdef __VOLUME__ - ps, -# endif - isect, - ray, - &throughput)) { - return true; - } - /* Move ray forward. */ - ray->P = ray_offset(shadow_sd->P, -shadow_sd->Ng); - if (ray->t != FLT_MAX) { - ray->D = normalize_len(Pend - ray->P, &ray->t); - } - bounce++; - } -# ifdef __VOLUME__ - /* Attenuation for last line segment towards light. 
*/ - if (ps->volume_stack[0].shader != SHADER_NONE) { - kernel_volume_shadow(kg, shadow_sd, ps, ray, &throughput); - } -# endif - *shadow *= throughput; - return is_zero(throughput); - } -# ifdef __VOLUME__ - if (!blocked && state->volume_stack[0].shader != SHADER_NONE) { - /* Apply attenuation from current volume shader. */ -# ifdef __SPLIT_KERNEL__ - ccl_addr_space -# endif - PathState *ps = shadow_blocked_volume_path_state(kg, &volume_state, state, sd, ray); - kernel_volume_shadow(kg, shadow_sd, ps, ray, shadow); - } -# endif - return blocked; -} - -ccl_device bool shadow_blocked_transparent_stepped(KernelGlobals *kg, - ShaderData *sd, - ShaderData *shadow_sd, - ccl_addr_space PathState *state, - const uint visibility, - Ray *ray, - Intersection *isect, - float3 *shadow) -{ - bool blocked = scene_intersect(kg, ray, visibility & PATH_RAY_SHADOW_OPAQUE, isect); - bool is_transparent_isect = blocked ? shader_transparent_shadow(kg, isect) : false; - return shadow_blocked_transparent_stepped_loop( - kg, sd, shadow_sd, state, visibility, ray, isect, blocked, is_transparent_isect, shadow); -} - -# endif /* __KERNEL_GPU__ || !__SHADOW_RECORD_ALL__ */ -#endif /* __TRANSPARENT_SHADOWS__ */ - -ccl_device_inline bool shadow_blocked(KernelGlobals *kg, - ShaderData *sd, - ShaderData *shadow_sd, - ccl_addr_space PathState *state, - Ray *ray, - float3 *shadow) -{ - *shadow = one_float3(); -#if !defined(__KERNEL_OPTIX__) - /* Some common early checks. - * Avoid conditional trace call in OptiX though, since those hurt performance there. - */ - if (ray->t == 0.0f) { - return false; - } -#endif -#ifdef __SHADOW_TRICKS__ - const uint visibility = (state->flag & PATH_RAY_SHADOW_CATCHER) ? PATH_RAY_SHADOW_NON_CATCHER : - PATH_RAY_SHADOW; -#else - const uint visibility = PATH_RAY_SHADOW; -#endif - /* Do actual shadow shading. - * First of all, we check if integrator requires transparent shadows. - * if not, we use simplest and fastest ever way to calculate occlusion. 
- * Do not do this in OptiX to avoid the additional trace call. - */ -#if !defined(__KERNEL_OPTIX__) || !defined(__TRANSPARENT_SHADOWS__) - Intersection isect; -# ifdef __TRANSPARENT_SHADOWS__ - if (!kernel_data.integrator.transparent_shadows) -# endif - { - return shadow_blocked_opaque(kg, shadow_sd, state, visibility, ray, &isect, shadow); - } -#endif -#ifdef __TRANSPARENT_SHADOWS__ -# ifdef __SHADOW_RECORD_ALL__ - /* For the transparent shadows we try to use record-all logic on the - * devices which supports this. - */ - const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce; - /* Check transparent bounces here, for volume scatter which can do - * lighting before surface path termination is checked. - */ - if (state->transparent_bounce >= transparent_max_bounce) { - return true; - } - uint max_hits = transparent_max_bounce - state->transparent_bounce - 1; -# if defined(__KERNEL_OPTIX__) - /* Always use record-all behavior in OptiX, but ensure there are no out of bounds - * accesses to the hit stack. - */ - max_hits = min(max_hits, SHADOW_STACK_MAX_HITS - 1); -# elif defined(__KERNEL_GPU__) - /* On GPU we do tricky with tracing opaque ray first, this avoids speed - * regressions in some files. - * - * TODO(sergey): Check why using record-all behavior causes slowdown in such - * cases. Could that be caused by a higher spill pressure? - */ - const bool blocked = scene_intersect(kg, ray, visibility & PATH_RAY_SHADOW_OPAQUE, &isect); - const bool is_transparent_isect = blocked ? 
shader_transparent_shadow(kg, &isect) : false; - if (!blocked || !is_transparent_isect || max_hits + 1 >= SHADOW_STACK_MAX_HITS) { - return shadow_blocked_transparent_stepped_loop( - kg, sd, shadow_sd, state, visibility, ray, &isect, blocked, is_transparent_isect, shadow); - } -# endif /* __KERNEL_GPU__ */ - return shadow_blocked_transparent_all( - kg, sd, shadow_sd, state, visibility, ray, max_hits, shadow); -# else /* __SHADOW_RECORD_ALL__ */ - /* Fallback to a slowest version which works on all devices. */ - return shadow_blocked_transparent_stepped( - kg, sd, shadow_sd, state, visibility, ray, &isect, shadow); -# endif /* __SHADOW_RECORD_ALL__ */ -#endif /* __TRANSPARENT_SHADOWS__ */ -} - -#undef SHADOW_STACK_MAX_HITS - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_shadow_catcher.h b/intern/cycles/kernel/kernel_shadow_catcher.h new file mode 100644 index 00000000000..824749818a4 --- /dev/null +++ b/intern/cycles/kernel/kernel_shadow_catcher.h @@ -0,0 +1,116 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "kernel/integrator/integrator_state_util.h" +#include "kernel/kernel_path_state.h" + +CCL_NAMESPACE_BEGIN + +/* Check whether current surface bounce is where path is to be split for the shadow catcher. 
*/ +ccl_device_inline bool kernel_shadow_catcher_is_path_split_bounce(INTEGRATOR_STATE_ARGS, + const int object_flag) +{ +#ifdef __SHADOW_CATCHER__ + if (!kernel_data.integrator.has_shadow_catcher) { + return false; + } + + /* Check the flag first, avoiding fetches form global memory. */ + if ((object_flag & SD_OBJECT_SHADOW_CATCHER) == 0) { + return false; + } + if (object_flag & SD_OBJECT_HOLDOUT_MASK) { + return false; + } + + const int path_flag = INTEGRATOR_STATE(path, flag); + + if ((path_flag & PATH_RAY_TRANSPARENT_BACKGROUND) == 0) { + /* Split only on primary rays, secondary bounces are to treat shadow catcher as a regular + * object. */ + return false; + } + + if (path_flag & PATH_RAY_SHADOW_CATCHER_PASS) { + return false; + } + + return true; +#else + (void)object_flag; + return false; +#endif +} + +/* Check whether the current path can still split. */ +ccl_device_inline bool kernel_shadow_catcher_path_can_split(INTEGRATOR_STATE_CONST_ARGS) +{ + if (INTEGRATOR_PATH_IS_TERMINATED && INTEGRATOR_SHADOW_PATH_IS_TERMINATED) { + return false; + } + + const int path_flag = INTEGRATOR_STATE(path, flag); + + if (path_flag & PATH_RAY_SHADOW_CATCHER_HIT) { + /* Shadow catcher was already hit and the state was split. No further split is allowed. */ + return false; + } + + return (path_flag & PATH_RAY_TRANSPARENT_BACKGROUND) != 0; +} + +/* NOTE: Leaves kernel scheduling information untouched. Use INIT semantic for one of the paths + * after this function. */ +ccl_device_inline bool kernel_shadow_catcher_split(INTEGRATOR_STATE_ARGS, const int object_flags) +{ +#ifdef __SHADOW_CATCHER__ + + if (!kernel_shadow_catcher_is_path_split_bounce(INTEGRATOR_STATE_PASS, object_flags)) { + return false; + } + + /* The split is to be done. Mark the current state as such, so that it stops contributing to the + * shadow catcher matte pass, but keeps contributing to the combined pass. 
*/ + INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_SHADOW_CATCHER_HIT; + + /* Split new state from the current one. This new state will only track contribution of shadow + * catcher objects ignoring non-catcher objects. */ + integrator_state_shadow_catcher_split(INTEGRATOR_STATE_PASS); + + return true; +#else + (void)object_flags; + return false; +#endif +} + +#ifdef __SHADOW_CATCHER__ + +ccl_device_forceinline bool kernel_shadow_catcher_is_matte_path(INTEGRATOR_STATE_CONST_ARGS) +{ + return (INTEGRATOR_STATE(path, flag) & PATH_RAY_SHADOW_CATCHER_HIT) == 0; +} + +ccl_device_forceinline bool kernel_shadow_catcher_is_object_pass(INTEGRATOR_STATE_CONST_ARGS) +{ + return INTEGRATOR_STATE(path, flag) & PATH_RAY_SHADOW_CATCHER_PASS; +} + +#endif /* __SHADOW_CATCHER__ */ + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h deleted file mode 100644 index 677504a4045..00000000000 --- a/intern/cycles/kernel/kernel_subsurface.h +++ /dev/null @@ -1,724 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -/* BSSRDF using disk based importance sampling. 
- * - * BSSRDF Importance Sampling, SIGGRAPH 2013 - * http://library.imageworks.com/pdfs/imageworks-library-BSSRDF-sampling.pdf - */ - -ccl_device_inline float3 -subsurface_scatter_eval(ShaderData *sd, const ShaderClosure *sc, float disk_r, float r, bool all) -{ - /* This is the Veach one-sample model with balance heuristic, some pdf - * factors drop out when using balance heuristic weighting. For branched - * path tracing (all) we sample all closure and don't use MIS. */ - float3 eval_sum = zero_float3(); - float pdf_sum = 0.0f; - float sample_weight_inv = 0.0f; - - if (!all) { - float sample_weight_sum = 0.0f; - - for (int i = 0; i < sd->num_closure; i++) { - sc = &sd->closure[i]; - - if (CLOSURE_IS_DISK_BSSRDF(sc->type)) { - sample_weight_sum += sc->sample_weight; - } - } - - sample_weight_inv = 1.0f / sample_weight_sum; - } - - for (int i = 0; i < sd->num_closure; i++) { - sc = &sd->closure[i]; - - if (CLOSURE_IS_DISK_BSSRDF(sc->type)) { - /* in case of branched path integrate we sample all bssrdf's once, - * for path trace we pick one, so adjust pdf for that */ - float sample_weight = (all) ? 1.0f : sc->sample_weight * sample_weight_inv; - - /* compute pdf */ - float3 eval = bssrdf_eval(sc, r); - float pdf = bssrdf_pdf(sc, disk_r); - - eval_sum += sc->weight * eval; - pdf_sum += sample_weight * pdf; - } - } - - return (pdf_sum > 0.0f) ? eval_sum / pdf_sum : zero_float3(); -} - -ccl_device_inline float3 subsurface_scatter_walk_eval(ShaderData *sd, - const ShaderClosure *sc, - float3 throughput, - bool all) -{ - /* This is the Veach one-sample model with balance heuristic, some pdf - * factors drop out when using balance heuristic weighting. For branched - * path tracing (all) we sample all closure and don't use MIS. 
*/ - if (!all) { - float bssrdf_weight = 0.0f; - float weight = sc->sample_weight; - - for (int i = 0; i < sd->num_closure; i++) { - sc = &sd->closure[i]; - - if (CLOSURE_IS_BSSRDF(sc->type)) { - bssrdf_weight += sc->sample_weight; - } - } - throughput *= bssrdf_weight / weight; - } - return throughput; -} - -/* replace closures with a single diffuse bsdf closure after scatter step */ -ccl_device void subsurface_scatter_setup_diffuse_bsdf( - KernelGlobals *kg, ShaderData *sd, ClosureType type, float roughness, float3 weight, float3 N) -{ - sd->flag &= ~SD_CLOSURE_FLAGS; - sd->num_closure = 0; - sd->num_closure_left = kernel_data.integrator.max_closures; - -#ifdef __PRINCIPLED__ - if (type == CLOSURE_BSSRDF_PRINCIPLED_ID || type == CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID) { - PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf *)bsdf_alloc( - sd, sizeof(PrincipledDiffuseBsdf), weight); - - if (bsdf) { - bsdf->N = N; - bsdf->roughness = roughness; - sd->flag |= bsdf_principled_diffuse_setup(bsdf); - - /* replace CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID with this special ID so render passes - * can recognize it as not being a regular Disney principled diffuse closure */ - bsdf->type = CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID; - } - } - else if (CLOSURE_IS_BSDF_BSSRDF(type) || CLOSURE_IS_BSSRDF(type)) -#endif /* __PRINCIPLED__ */ - { - DiffuseBsdf *bsdf = (DiffuseBsdf *)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight); - - if (bsdf) { - bsdf->N = N; - sd->flag |= bsdf_diffuse_setup(bsdf); - - /* replace CLOSURE_BSDF_DIFFUSE_ID with this special ID so render passes - * can recognize it as not being a regular diffuse closure */ - bsdf->type = CLOSURE_BSDF_BSSRDF_ID; - } - } -} - -/* optionally do blurring of color and/or bump mapping, at the cost of a shader evaluation */ -ccl_device float3 subsurface_color_pow(float3 color, float exponent) -{ - color = max(color, zero_float3()); - - if (exponent == 1.0f) { - /* nothing to do */ - } - else if (exponent == 0.5f) { - color.x = 
sqrtf(color.x); - color.y = sqrtf(color.y); - color.z = sqrtf(color.z); - } - else { - color.x = powf(color.x, exponent); - color.y = powf(color.y, exponent); - color.z = powf(color.z, exponent); - } - - return color; -} - -ccl_device void subsurface_color_bump_blur( - KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, float3 *eval, float3 *N) -{ - /* average color and texture blur at outgoing point */ - float texture_blur; - float3 out_color = shader_bssrdf_sum(sd, NULL, &texture_blur); - - /* do we have bump mapping? */ - bool bump = (sd->flag & SD_HAS_BSSRDF_BUMP) != 0; - - if (bump || texture_blur > 0.0f) { - /* average color and normal at incoming point */ - shader_eval_surface(kg, sd, state, NULL, state->flag); - float3 in_color = shader_bssrdf_sum(sd, (bump) ? N : NULL, NULL); - - /* we simply divide out the average color and multiply with the average - * of the other one. we could try to do this per closure but it's quite - * tricky to match closures between shader evaluations, their number and - * order may change, this is simpler */ - if (texture_blur > 0.0f) { - out_color = subsurface_color_pow(out_color, texture_blur); - in_color = subsurface_color_pow(in_color, texture_blur); - - *eval *= safe_divide_color(in_color, out_color); - } - } -} - -/* Subsurface scattering step, from a point on the surface to other - * nearby points on the same object. 
- */ -ccl_device_inline int subsurface_scatter_disk(KernelGlobals *kg, - LocalIntersection *ss_isect, - ShaderData *sd, - const ShaderClosure *sc, - uint *lcg_state, - float disk_u, - float disk_v, - bool all) -{ - /* pick random axis in local frame and point on disk */ - float3 disk_N, disk_T, disk_B; - float pick_pdf_N, pick_pdf_T, pick_pdf_B; - - disk_N = sd->Ng; - make_orthonormals(disk_N, &disk_T, &disk_B); - - if (disk_v < 0.5f) { - pick_pdf_N = 0.5f; - pick_pdf_T = 0.25f; - pick_pdf_B = 0.25f; - disk_v *= 2.0f; - } - else if (disk_v < 0.75f) { - float3 tmp = disk_N; - disk_N = disk_T; - disk_T = tmp; - pick_pdf_N = 0.25f; - pick_pdf_T = 0.5f; - pick_pdf_B = 0.25f; - disk_v = (disk_v - 0.5f) * 4.0f; - } - else { - float3 tmp = disk_N; - disk_N = disk_B; - disk_B = tmp; - pick_pdf_N = 0.25f; - pick_pdf_T = 0.25f; - pick_pdf_B = 0.5f; - disk_v = (disk_v - 0.75f) * 4.0f; - } - - /* sample point on disk */ - float phi = M_2PI_F * disk_v; - float disk_height, disk_r; - - bssrdf_sample(sc, disk_u, &disk_r, &disk_height); - - float3 disk_P = (disk_r * cosf(phi)) * disk_T + (disk_r * sinf(phi)) * disk_B; - - /* create ray */ -#ifdef __SPLIT_KERNEL__ - Ray ray_object = ss_isect->ray; - Ray *ray = &ray_object; -#else - Ray *ray = &ss_isect->ray; -#endif - ray->P = sd->P + disk_N * disk_height + disk_P; - ray->D = -disk_N; - ray->t = 2.0f * disk_height; - ray->dP = sd->dP; - ray->dD = differential3_zero(); - ray->time = sd->time; - - /* intersect with the same object. if multiple intersections are found it - * will use at most BSSRDF_MAX_HITS hits, a random subset of all hits */ - scene_intersect_local(kg, ray, ss_isect, sd->object, lcg_state, BSSRDF_MAX_HITS); - int num_eval_hits = min(ss_isect->num_hits, BSSRDF_MAX_HITS); - - for (int hit = 0; hit < num_eval_hits; hit++) { - /* Quickly retrieve P and Ng without setting up ShaderData. 
*/ - float3 hit_P; - if (sd->type & PRIMITIVE_TRIANGLE) { - hit_P = triangle_refine_local(kg, sd, &ss_isect->hits[hit], ray); - } -#ifdef __OBJECT_MOTION__ - else if (sd->type & PRIMITIVE_MOTION_TRIANGLE) { - float3 verts[3]; - motion_triangle_vertices(kg, - sd->object, - kernel_tex_fetch(__prim_index, ss_isect->hits[hit].prim), - sd->time, - verts); - hit_P = motion_triangle_refine_local(kg, sd, &ss_isect->hits[hit], ray, verts); - } -#endif /* __OBJECT_MOTION__ */ - else { - ss_isect->weight[hit] = zero_float3(); - continue; - } - - float3 hit_Ng = ss_isect->Ng[hit]; - if (ss_isect->hits[hit].object != OBJECT_NONE) { - object_normal_transform(kg, sd, &hit_Ng); - } - - /* Probability densities for local frame axes. */ - float pdf_N = pick_pdf_N * fabsf(dot(disk_N, hit_Ng)); - float pdf_T = pick_pdf_T * fabsf(dot(disk_T, hit_Ng)); - float pdf_B = pick_pdf_B * fabsf(dot(disk_B, hit_Ng)); - - /* Multiple importance sample between 3 axes, power heuristic - * found to be slightly better than balance heuristic. pdf_N - * in the MIS weight and denominator cancelled out. */ - float w = pdf_N / (sqr(pdf_N) + sqr(pdf_T) + sqr(pdf_B)); - if (ss_isect->num_hits > BSSRDF_MAX_HITS) { - w *= ss_isect->num_hits / (float)BSSRDF_MAX_HITS; - } - - /* Real distance to sampled point. */ - float r = len(hit_P - sd->P); - - /* Evaluate profiles. 
*/ - float3 eval = subsurface_scatter_eval(sd, sc, disk_r, r, all) * w; - - ss_isect->weight[hit] = eval; - } - -#ifdef __SPLIT_KERNEL__ - ss_isect->ray = *ray; -#endif - - return num_eval_hits; -} - -#if defined(__KERNEL_OPTIX__) && defined(__SHADER_RAYTRACE__) -ccl_device_inline void subsurface_scatter_multi_setup(KernelGlobals *kg, - LocalIntersection *ss_isect, - int hit, - ShaderData *sd, - ccl_addr_space PathState *state, - ClosureType type, - float roughness) -{ - optixDirectCall<void>(2, kg, ss_isect, hit, sd, state, type, roughness); -} -extern "C" __device__ void __direct_callable__subsurface_scatter_multi_setup( -#else -ccl_device_noinline void subsurface_scatter_multi_setup( -#endif - KernelGlobals *kg, - LocalIntersection *ss_isect, - int hit, - ShaderData *sd, - ccl_addr_space PathState *state, - ClosureType type, - float roughness) -{ -#ifdef __SPLIT_KERNEL__ - Ray ray_object = ss_isect->ray; - Ray *ray = &ray_object; -#else - Ray *ray = &ss_isect->ray; -#endif - - /* Workaround for AMD GPU OpenCL compiler. Most probably cache bypass issue. */ -#if defined(__SPLIT_KERNEL__) && defined(__KERNEL_OPENCL_AMD__) && defined(__KERNEL_GPU__) - kernel_split_params.dummy_sd_flag = sd->flag; -#endif - - /* Setup new shading point. */ - shader_setup_from_subsurface(kg, sd, &ss_isect->hits[hit], ray); - - /* Optionally blur colors and bump mapping. */ - float3 weight = ss_isect->weight[hit]; - float3 N = sd->N; - subsurface_color_bump_blur(kg, sd, state, &weight, &N); - - /* Setup diffuse BSDF. */ - subsurface_scatter_setup_diffuse_bsdf(kg, sd, type, roughness, weight, N); -} - -/* Random walk subsurface scattering. - * - * "Practical and Controllable Subsurface Scattering for Production Path - * Tracing". Matt Jen-Yuan Chiang, Peter Kutz, Brent Burley. SIGGRAPH 2016. */ - -ccl_device void subsurface_random_walk_remap(const float A, - const float d, - float *sigma_t, - float *alpha) -{ - /* Compute attenuation and scattering coefficients from albedo. 
*/ - *alpha = 1.0f - expf(A * (-5.09406f + A * (2.61188f - A * 4.31805f))); - const float s = 1.9f - A + 3.5f * sqr(A - 0.8f); - - *sigma_t = 1.0f / fmaxf(d * s, 1e-16f); -} - -ccl_device void subsurface_random_walk_coefficients(const ShaderClosure *sc, - float3 *sigma_t, - float3 *alpha, - float3 *weight) -{ - const Bssrdf *bssrdf = (const Bssrdf *)sc; - const float3 A = bssrdf->albedo; - const float3 d = bssrdf->radius; - float sigma_t_x, sigma_t_y, sigma_t_z; - float alpha_x, alpha_y, alpha_z; - - subsurface_random_walk_remap(A.x, d.x, &sigma_t_x, &alpha_x); - subsurface_random_walk_remap(A.y, d.y, &sigma_t_y, &alpha_y); - subsurface_random_walk_remap(A.z, d.z, &sigma_t_z, &alpha_z); - - *sigma_t = make_float3(sigma_t_x, sigma_t_y, sigma_t_z); - *alpha = make_float3(alpha_x, alpha_y, alpha_z); - - /* Closure mixing and Fresnel weights separate from albedo. */ - *weight = safe_divide_color(bssrdf->weight, A); -} - -/* References for Dwivedi sampling: - * - * [1] "A Zero-variance-based Sampling Scheme for Monte Carlo Subsurface Scattering" - * by Jaroslav KÅ™ivánek and Eugene d'Eon (SIGGRAPH 2014) - * https://cgg.mff.cuni.cz/~jaroslav/papers/2014-zerovar/ - * - * [2] "Improving the Dwivedi Sampling Scheme" - * by Johannes Meng, Johannes Hanika, and Carsten Dachsbacher (EGSR 2016) - * https://cg.ivd.kit.edu/1951.php - * - * [3] "Zero-Variance Theory for Efficient Subsurface Scattering" - * by Eugene d'Eon and Jaroslav KÅ™ivánek (SIGGRAPH 2020) - * https://iliyan.com/publications/RenderingCourse2020 - */ - -ccl_device_forceinline float eval_phase_dwivedi(float v, float phase_log, float cos_theta) -{ - /* Eq. 9 from [2] using precomputed log((v + 1) / (v - 1)) */ - return 1.0f / ((v - cos_theta) * phase_log); -} - -ccl_device_forceinline float sample_phase_dwivedi(float v, float phase_log, float rand) -{ - /* Based on Eq. 
10 from [2]: `v - (v + 1) * pow((v - 1) / (v + 1), rand)` - * Since we're already pre-computing `phase_log = log((v + 1) / (v - 1))` for the evaluation, - * we can implement the power function like this. */ - return v - (v + 1) * expf(-rand * phase_log); -} - -ccl_device_forceinline float diffusion_length_dwivedi(float alpha) -{ - /* Eq. 67 from [3] */ - return 1.0f / sqrtf(1.0f - powf(alpha, 2.44294f - 0.0215813f * alpha + 0.578637f / alpha)); -} - -ccl_device_forceinline float3 direction_from_cosine(float3 D, float cos_theta, float randv) -{ - float sin_theta = safe_sqrtf(1.0f - cos_theta * cos_theta); - float phi = M_2PI_F * randv; - float3 dir = make_float3(sin_theta * cosf(phi), sin_theta * sinf(phi), cos_theta); - - float3 T, B; - make_orthonormals(D, &T, &B); - return dir.x * T + dir.y * B + dir.z * D; -} - -ccl_device_forceinline float3 subsurface_random_walk_pdf(float3 sigma_t, - float t, - bool hit, - float3 *transmittance) -{ - float3 T = volume_color_transmittance(sigma_t, t); - if (transmittance) { - *transmittance = T; - } - return hit ? T : sigma_t * T; -} - -#ifdef __KERNEL_OPTIX__ -ccl_device_inline /* inline trace calls */ -#else -ccl_device_noinline -#endif - bool - subsurface_random_walk(KernelGlobals *kg, - LocalIntersection *ss_isect, - ShaderData *sd, - ccl_addr_space PathState *state, - const ShaderClosure *sc, - const float bssrdf_u, - const float bssrdf_v, - bool all) -{ - /* Sample diffuse surface scatter into the object. */ - float3 D; - float pdf; - sample_cos_hemisphere(-sd->N, bssrdf_u, bssrdf_v, &D, &pdf); - if (dot(-sd->Ng, D) <= 0.0f) { - return 0; - } - - /* Convert subsurface to volume coefficients. - * The single-scattering albedo is named alpha to avoid confusion with the surface albedo. 
*/ - float3 sigma_t, alpha; - float3 throughput = one_float3(); - subsurface_random_walk_coefficients(sc, &sigma_t, &alpha, &throughput); - float3 sigma_s = sigma_t * alpha; - - /* Theoretically it should be better to use the exact alpha for the channel we're sampling at - * each bounce, but in practice there doesn't seem to be a noticeable difference in exchange - * for making the code significantly more complex and slower (if direction sampling depends on - * the sampled channel, we need to compute its PDF per-channel and consider it for MIS later on). - * - * Since the strength of the guided sampling increases as alpha gets lower, using a value that - * is too low results in fireflies while one that's too high just gives a bit more noise. - * Therefore, the code here uses the highest of the three albedos to be safe. */ - float diffusion_length = diffusion_length_dwivedi(max3(alpha)); - /* Precompute term for phase sampling. */ - float phase_log = logf((diffusion_length + 1) / (diffusion_length - 1)); - - /* Setup ray. */ -#ifdef __SPLIT_KERNEL__ - Ray ray_object = ss_isect->ray; - Ray *ray = &ray_object; -#else - Ray *ray = &ss_isect->ray; -#endif - ray->P = ray_offset(sd->P, -sd->Ng); - ray->D = D; - ray->t = FLT_MAX; - ray->time = sd->time; - - /* Modify state for RNGs, decorrelated from other paths. */ - uint prev_rng_offset = state->rng_offset; - uint prev_rng_hash = state->rng_hash; - state->rng_hash = cmj_hash(state->rng_hash + state->rng_offset, 0xdeadbeef); - - /* Random walk until we hit the surface again. */ - bool hit = false; - bool have_opposite_interface = false; - float opposite_distance = 0.0f; - - /* Todo: Disable for alpha>0.999 or so? */ - const float guided_fraction = 0.75f; - - for (int bounce = 0; bounce < BSSRDF_MAX_BOUNCES; bounce++) { - /* Advance random number offset. */ - state->rng_offset += PRNG_BOUNCE_NUM; - - /* Sample color channel, use MIS with balance heuristic. 
*/ - float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL); - float3 channel_pdf; - int channel = kernel_volume_sample_channel(alpha, throughput, rphase, &channel_pdf); - float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel); - float randt = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE); - - /* We need the result of the raycast to compute the full guided PDF, so just remember the - * relevant terms to avoid recomputing them later. */ - float backward_fraction = 0.0f; - float forward_pdf_factor = 0.0f; - float forward_stretching = 1.0f; - float backward_pdf_factor = 0.0f; - float backward_stretching = 1.0f; - - /* For the initial ray, we already know the direction, so just do classic distance sampling. */ - if (bounce > 0) { - /* Decide whether we should use guided or classic sampling. */ - bool guided = (path_state_rng_1D(kg, state, PRNG_LIGHT_TERMINATE) < guided_fraction); - - /* Determine if we want to sample away from the incoming interface. - * This only happens if we found a nearby opposite interface, and the probability for it - * depends on how close we are to it already. - * This probability term comes from the recorded presentation of [3]. */ - bool guide_backward = false; - if (have_opposite_interface) { - /* Compute distance of the random walk between the tangent plane at the starting point - * and the assumed opposite interface (the parallel plane that contains the point we - * found in our ray query for the opposite side). */ - float x = clamp(dot(ray->P - sd->P, -sd->N), 0.0f, opposite_distance); - backward_fraction = 1.0f / (1.0f + expf((opposite_distance - 2 * x) / diffusion_length)); - guide_backward = path_state_rng_1D(kg, state, PRNG_TERMINATE) < backward_fraction; - } - - /* Sample scattering direction. 
*/ - float scatter_u, scatter_v; - path_state_rng_2D(kg, state, PRNG_BSDF_U, &scatter_u, &scatter_v); - float cos_theta; - if (guided) { - cos_theta = sample_phase_dwivedi(diffusion_length, phase_log, scatter_u); - /* The backwards guiding distribution is just mirrored along sd->N, so swapping the - * sign here is enough to sample from that instead. */ - if (guide_backward) { - cos_theta = -cos_theta; - } - } - else { - cos_theta = 2.0f * scatter_u - 1.0f; - } - ray->D = direction_from_cosine(sd->N, cos_theta, scatter_v); - - /* Compute PDF factor caused by phase sampling (as the ratio of guided / classic). - * Since phase sampling is channel-independent, we can get away with applying a factor - * to the guided PDF, which implicitly means pulling out the classic PDF term and letting - * it cancel with an equivalent term in the numerator of the full estimator. - * For the backward PDF, we again reuse the same probability distribution with a sign swap. - */ - forward_pdf_factor = 2.0f * eval_phase_dwivedi(diffusion_length, phase_log, cos_theta); - backward_pdf_factor = 2.0f * eval_phase_dwivedi(diffusion_length, phase_log, -cos_theta); - - /* Prepare distance sampling. - * For the backwards case, this also needs the sign swapped since now directions against - * sd->N (and therefore with negative cos_theta) are preferred. */ - forward_stretching = (1.0f - cos_theta / diffusion_length); - backward_stretching = (1.0f + cos_theta / diffusion_length); - if (guided) { - sample_sigma_t *= guide_backward ? backward_stretching : forward_stretching; - } - } - - /* Sample direction along ray. */ - float t = -logf(1.0f - randt) / sample_sigma_t; - - /* On the first bounce, we use the raycast to check if the opposite side is nearby. - * If yes, we will later use backwards guided sampling in order to have a decent - * chance of connecting to it. - * Todo: Maybe use less than 10 times the mean free path? */ - ray->t = (bounce == 0) ? 
max(t, 10.0f / (min3(sigma_t))) : t; - scene_intersect_local(kg, ray, ss_isect, sd->object, NULL, 1); - hit = (ss_isect->num_hits > 0); - - if (hit) { -#ifdef __KERNEL_OPTIX__ - /* t is always in world space with OptiX. */ - ray->t = ss_isect->hits[0].t; -#else - /* Compute world space distance to surface hit. */ - float3 D = ray->D; - object_inverse_dir_transform(kg, sd, &D); - D = normalize(D) * ss_isect->hits[0].t; - object_dir_transform(kg, sd, &D); - ray->t = len(D); -#endif - } - - if (bounce == 0) { - /* Check if we hit the opposite side. */ - if (hit) { - have_opposite_interface = true; - opposite_distance = dot(ray->P + ray->t * ray->D - sd->P, -sd->N); - } - /* Apart from the opposite side check, we were supposed to only trace up to distance t, - * so check if there would have been a hit in that case. */ - hit = ray->t < t; - } - - /* Use the distance to the exit point for the throughput update if we found one. */ - if (hit) { - t = ray->t; - } - else if (bounce == 0) { - /* Restore original position if nothing was hit after the first bounce, - * without the ray_offset() that was added to avoid self-intersection. - * Otherwise if that offset is relatively large compared to the scattering - * radius, we never go back up high enough to exit the surface. */ - ray->P = sd->P; - } - - /* Advance to new scatter location. */ - ray->P += t * ray->D; - - float3 transmittance; - float3 pdf = subsurface_random_walk_pdf(sigma_t, t, hit, &transmittance); - if (bounce > 0) { - /* Compute PDF just like we do for classic sampling, but with the stretched sigma_t. */ - float3 guided_pdf = subsurface_random_walk_pdf(forward_stretching * sigma_t, t, hit, NULL); - - if (have_opposite_interface) { - /* First step of MIS: Depending on geometry we might have two methods for guided - * sampling, so perform MIS between them. 
*/ - float3 back_pdf = subsurface_random_walk_pdf(backward_stretching * sigma_t, t, hit, NULL); - guided_pdf = mix( - guided_pdf * forward_pdf_factor, back_pdf * backward_pdf_factor, backward_fraction); - } - else { - /* Just include phase sampling factor otherwise. */ - guided_pdf *= forward_pdf_factor; - } - - /* Now we apply the MIS balance heuristic between the classic and guided sampling. */ - pdf = mix(pdf, guided_pdf, guided_fraction); - } - - /* Finally, we're applying MIS again to combine the three color channels. - * Altogether, the MIS computation combines up to nine different estimators: - * {classic, guided, backward_guided} x {r, g, b} */ - throughput *= (hit ? transmittance : sigma_s * transmittance) / dot(channel_pdf, pdf); - - if (hit) { - /* If we hit the surface, we are done. */ - break; - } - else if (throughput.x < VOLUME_THROUGHPUT_EPSILON && - throughput.y < VOLUME_THROUGHPUT_EPSILON && - throughput.z < VOLUME_THROUGHPUT_EPSILON) { - /* Avoid unnecessary work and precision issue when throughput gets really small. */ - break; - } - } - - kernel_assert(isfinite_safe(throughput.x) && isfinite_safe(throughput.y) && - isfinite_safe(throughput.z)); - - state->rng_offset = prev_rng_offset; - state->rng_hash = prev_rng_hash; - - /* Return number of hits in ss_isect. */ - if (!hit) { - return 0; - } - - /* TODO: gain back performance lost from merging with disk BSSRDF. We - * only need to return on hit so this indirect ray push/pop overhead - * is not actually needed, but it does keep the code simpler. 
*/ - ss_isect->weight[0] = subsurface_scatter_walk_eval(sd, sc, throughput, all); -#ifdef __SPLIT_KERNEL__ - ss_isect->ray = *ray; -#endif - - return 1; -} - -ccl_device_inline int subsurface_scatter_multi_intersect(KernelGlobals *kg, - LocalIntersection *ss_isect, - ShaderData *sd, - ccl_addr_space PathState *state, - const ShaderClosure *sc, - uint *lcg_state, - float bssrdf_u, - float bssrdf_v, - bool all) -{ - if (CLOSURE_IS_DISK_BSSRDF(sc->type)) { - return subsurface_scatter_disk(kg, ss_isect, sd, sc, lcg_state, bssrdf_u, bssrdf_v, all); - } - else { - return subsurface_random_walk(kg, ss_isect, sd, state, sc, bssrdf_u, bssrdf_v, all); - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h index c8e01677d09..bf9b94c1753 100644 --- a/intern/cycles/kernel/kernel_textures.h +++ b/intern/cycles/kernel/kernel_textures.h @@ -78,7 +78,7 @@ KERNEL_TEX(KernelShader, __shaders) KERNEL_TEX(float, __lookup_table) /* sobol */ -KERNEL_TEX(uint, __sample_pattern_lut) +KERNEL_TEX(float, __sample_pattern_lut) /* image textures */ KERNEL_TEX(TextureInfo, __texture_info) diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index 7cbe18acf28..927e60e8729 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -14,8 +14,7 @@ * limitations under the License. 
*/ -#ifndef __KERNEL_TYPES_H__ -#define __KERNEL_TYPES_H__ +#pragma once #if !defined(__KERNEL_GPU__) && defined(WITH_EMBREE) # include <embree3/rtcore.h> @@ -60,27 +59,9 @@ CCL_NAMESPACE_BEGIN #define PRIM_NONE (~0) #define LAMP_NONE (~0) #define ID_NONE (0.0f) +#define PASS_UNUSED (~0) -#define VOLUME_STACK_SIZE 32 - -/* Split kernel constants */ -#define WORK_POOL_SIZE_GPU 64 -#define WORK_POOL_SIZE_CPU 1 -#ifdef __KERNEL_GPU__ -# define WORK_POOL_SIZE WORK_POOL_SIZE_GPU -#else -# define WORK_POOL_SIZE WORK_POOL_SIZE_CPU -#endif - -#define SHADER_SORT_BLOCK_SIZE 2048 - -#ifdef __KERNEL_OPENCL__ -# define SHADER_SORT_LOCAL_SIZE 64 -#elif defined(__KERNEL_CUDA__) -# define SHADER_SORT_LOCAL_SIZE 32 -#else -# define SHADER_SORT_LOCAL_SIZE 1 -#endif +#define VOLUME_STACK_SIZE 4 /* Kernel features */ #define __SOBOL__ @@ -93,7 +74,7 @@ CCL_NAMESPACE_BEGIN #define __INTERSECTION_REFINE__ #define __CLAMP_SAMPLE__ #define __PATCH_EVAL__ -#define __SHADOW_TRICKS__ +#define __SHADOW_CATCHER__ #define __DENOISING_FEATURES__ #define __SHADER_RAYTRACE__ #define __AO__ @@ -102,7 +83,6 @@ CCL_NAMESPACE_BEGIN #define __SVM__ #define __EMISSION__ #define __HOLDOUT__ -#define __MULTI_CLOSURE__ #define __TRANSPARENT_SHADOWS__ #define __BACKGROUND_MIS__ #define __LAMP_MIS__ @@ -112,7 +92,6 @@ CCL_NAMESPACE_BEGIN #define __PRINCIPLED__ #define __SUBSURFACE__ #define __VOLUME__ -#define __VOLUME_SCATTER__ #define __CMJ__ #define __SHADOW_RECORD_ALL__ #define __BRANCHED_PATH__ @@ -122,106 +101,60 @@ CCL_NAMESPACE_BEGIN # ifdef WITH_OSL # define __OSL__ # endif -# define __VOLUME_DECOUPLED__ # define __VOLUME_RECORD_ALL__ #endif /* __KERNEL_CPU__ */ -#ifdef __KERNEL_CUDA__ -# ifdef __SPLIT_KERNEL__ -# undef __BRANCHED_PATH__ -# endif -#endif /* __KERNEL_CUDA__ */ - #ifdef __KERNEL_OPTIX__ # undef __BAKING__ -# undef __BRANCHED_PATH__ #endif /* __KERNEL_OPTIX__ */ -#ifdef __KERNEL_OPENCL__ -#endif /* __KERNEL_OPENCL__ */ - /* Scene-based selective features compilation. 
*/ -#ifdef __NO_CAMERA_MOTION__ -# undef __CAMERA_MOTION__ -#endif -#ifdef __NO_OBJECT_MOTION__ -# undef __OBJECT_MOTION__ -#endif -#ifdef __NO_HAIR__ -# undef __HAIR__ -#endif -#ifdef __NO_VOLUME__ -# undef __VOLUME__ -# undef __VOLUME_SCATTER__ -#endif -#ifdef __NO_SUBSURFACE__ -# undef __SUBSURFACE__ -#endif -#ifdef __NO_BAKING__ -# undef __BAKING__ -#endif -#ifdef __NO_BRANCHED_PATH__ -# undef __BRANCHED_PATH__ -#endif -#ifdef __NO_PATCH_EVAL__ -# undef __PATCH_EVAL__ -#endif -#ifdef __NO_TRANSPARENT__ -# undef __TRANSPARENT_SHADOWS__ -#endif -#ifdef __NO_SHADOW_TRICKS__ -# undef __SHADOW_TRICKS__ -#endif -#ifdef __NO_PRINCIPLED__ -# undef __PRINCIPLED__ -#endif -#ifdef __NO_DENOISING__ -# undef __DENOISING_FEATURES__ -#endif -#ifdef __NO_SHADER_RAYTRACE__ -# undef __SHADER_RAYTRACE__ +#ifdef __KERNEL_FEATURES__ +# if !(__KERNEL_FEATURES & KERNEL_FEATURE_CAMERA_MOTION) +# undef __CAMERA_MOTION__ +# endif +# if !(__KERNEL_FEATURES & KERNEL_FEATURE_OBJECT_MOTION) +# undef __OBJECT_MOTION__ +# endif +# if !(__KERNEL_FEATURES & KERNEL_FEATURE_HAIR) +# undef __HAIR__ +# endif +# if !(__KERNEL_FEATURES & KERNEL_FEATURE_VOLUME) +# undef __VOLUME__ +# endif +# if !(__KERNEL_FEATURES & KERNEL_FEATURE_SUBSURFACE) +# undef __SUBSURFACE__ +# endif +# if !(__KERNEL_FEATURES & KERNEL_FEATURE_BAKING) +# undef __BAKING__ +# endif +# if !(__KERNEL_FEATURES & KERNEL_FEATURE_PATCH_EVALUATION) +# undef __PATCH_EVAL__ +# endif +# if !(__KERNEL_FEATURES & KERNEL_FEATURE_TRANSPARENT) +# undef __TRANSPARENT_SHADOWS__ +# endif +# if !(__KERNEL_FEATURES & KERNEL_FEATURE_SHADOW_CATCHER) +# undef __SHADOW_CATCHER__ +# endif +# if !(__KERNEL_FEATURES & KERNEL_FEATURE_PRINCIPLED) +# undef __PRINCIPLED__ +# endif +# if !(__KERNEL_FEATURES & KERNEL_FEATURE_DENOISING) +# undef __DENOISING_FEATURES__ +# endif #endif #ifdef WITH_CYCLES_DEBUG_NAN # define __KERNEL_DEBUG_NAN__ #endif +/* Features that enable others */ + #if defined(__SUBSURFACE__) || defined(__SHADER_RAYTRACE__) # define 
__BVH_LOCAL__ #endif -/* Shader Evaluation */ - -typedef enum ShaderEvalType { - SHADER_EVAL_DISPLACE, - SHADER_EVAL_BACKGROUND, - /* bake types */ - SHADER_EVAL_BAKE, /* no real shade, it's used in the code to - * differentiate the type of shader eval from the above - */ - /* data passes */ - SHADER_EVAL_NORMAL, - SHADER_EVAL_UV, - SHADER_EVAL_ROUGHNESS, - SHADER_EVAL_DIFFUSE_COLOR, - SHADER_EVAL_GLOSSY_COLOR, - SHADER_EVAL_TRANSMISSION_COLOR, - SHADER_EVAL_EMISSION, - SHADER_EVAL_AOV_COLOR, - SHADER_EVAL_AOV_VALUE, - - /* light passes */ - SHADER_EVAL_AO, - SHADER_EVAL_COMBINED, - SHADER_EVAL_SHADOW, - SHADER_EVAL_DIFFUSE, - SHADER_EVAL_GLOSSY, - SHADER_EVAL_TRANSMISSION, - - /* extra */ - SHADER_EVAL_ENVIRONMENT, -} ShaderEvalType; - /* Path Tracing * note we need to keep the u/v pairs at even values */ @@ -252,8 +185,7 @@ enum PathTraceDimension { enum SamplingPattern { SAMPLING_PATTERN_SOBOL = 0, - SAMPLING_PATTERN_CMJ = 1, - SAMPLING_PATTERN_PMJ = 2, + SAMPLING_PATTERN_PMJ = 1, SAMPLING_NUM_PATTERNS, }; @@ -261,7 +193,12 @@ enum SamplingPattern { /* these flags values correspond to raytypes in osl.cpp, so keep them in sync! */ enum PathRayFlag { - /* Ray visibility. */ + /* -------------------------------------------------------------------- + * Ray visibility. + * + * NOTE: Recalculated after a surface bounce. + */ + PATH_RAY_CAMERA = (1 << 0), PATH_RAY_REFLECT = (1 << 1), PATH_RAY_TRANSMIT = (1 << 2), @@ -269,57 +206,106 @@ enum PathRayFlag { PATH_RAY_GLOSSY = (1 << 4), PATH_RAY_SINGULAR = (1 << 5), PATH_RAY_TRANSPARENT = (1 << 6), + PATH_RAY_VOLUME_SCATTER = (1 << 7), /* Shadow ray visibility. 
*/ - PATH_RAY_SHADOW_OPAQUE_NON_CATCHER = (1 << 7), - PATH_RAY_SHADOW_OPAQUE_CATCHER = (1 << 8), - PATH_RAY_SHADOW_OPAQUE = (PATH_RAY_SHADOW_OPAQUE_NON_CATCHER | PATH_RAY_SHADOW_OPAQUE_CATCHER), - PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER = (1 << 9), - PATH_RAY_SHADOW_TRANSPARENT_CATCHER = (1 << 10), - PATH_RAY_SHADOW_TRANSPARENT = (PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER | - PATH_RAY_SHADOW_TRANSPARENT_CATCHER), - PATH_RAY_SHADOW_NON_CATCHER = (PATH_RAY_SHADOW_OPAQUE_NON_CATCHER | - PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER), + PATH_RAY_SHADOW_OPAQUE = (1 << 8), + PATH_RAY_SHADOW_TRANSPARENT = (1 << 9), PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE | PATH_RAY_SHADOW_TRANSPARENT), - /* Unused, free to reuse. */ - PATH_RAY_UNUSED = (1 << 11), + /* Special flag to tag unaligned BVH nodes. + * Only set and used in BVH nodes to distinguish how to interpret bounding box information stored + * in the node (either it should be intersected as AABB or as OBB). */ + PATH_RAY_NODE_UNALIGNED = (1 << 10), - /* Ray visibility for volume scattering. */ - PATH_RAY_VOLUME_SCATTER = (1 << 12), - - /* Special flag to tag unaligned BVH nodes. */ - PATH_RAY_NODE_UNALIGNED = (1 << 13), + /* Subset of flags used for ray visibility for intersection. + * + * NOTE: SHADOW_CATCHER macros below assume there are no more than + * 16 visibility bits. */ + PATH_RAY_ALL_VISIBILITY = ((1 << 11) - 1), - PATH_RAY_ALL_VISIBILITY = ((1 << 14) - 1), + /* -------------------------------------------------------------------- + * Path flags. + */ /* Don't apply multiple importance sampling weights to emission from * lamp or surface hits, because they were not direct light sampled. */ - PATH_RAY_MIS_SKIP = (1 << 14), + PATH_RAY_MIS_SKIP = (1 << 11), + /* Diffuse bounce earlier in the path, skip SSS to improve performance * and avoid branching twice with disk sampling SSS. */ - PATH_RAY_DIFFUSE_ANCESTOR = (1 << 15), + PATH_RAY_DIFFUSE_ANCESTOR = (1 << 12), + /* Single pass has been written. 
*/ - PATH_RAY_SINGLE_PASS_DONE = (1 << 16), - /* Ray is behind a shadow catcher. */ - PATH_RAY_SHADOW_CATCHER = (1 << 17), - /* Store shadow data for shadow catcher or denoising. */ - PATH_RAY_STORE_SHADOW_INFO = (1 << 18), + PATH_RAY_SINGLE_PASS_DONE = (1 << 13), + /* Zero background alpha, for camera or transparent glass rays. */ - PATH_RAY_TRANSPARENT_BACKGROUND = (1 << 19), + PATH_RAY_TRANSPARENT_BACKGROUND = (1 << 14), + /* Terminate ray immediately at next bounce. */ - PATH_RAY_TERMINATE_IMMEDIATE = (1 << 20), + PATH_RAY_TERMINATE_ON_NEXT_SURFACE = (1 << 15), + PATH_RAY_TERMINATE_IN_NEXT_VOLUME = (1 << 16), + /* Ray is to be terminated, but continue with transparent bounces and * emission as long as we encounter them. This is required to make the * MIS between direct and indirect light rays match, as shadow rays go * through transparent surfaces to reach emission too. */ - PATH_RAY_TERMINATE_AFTER_TRANSPARENT = (1 << 21), + PATH_RAY_TERMINATE_AFTER_TRANSPARENT = (1 << 17), + + /* Terminate ray immediately after volume shading. */ + PATH_RAY_TERMINATE_AFTER_VOLUME = (1 << 18), + /* Ray is to be terminated. */ - PATH_RAY_TERMINATE = (PATH_RAY_TERMINATE_IMMEDIATE | PATH_RAY_TERMINATE_AFTER_TRANSPARENT), + PATH_RAY_TERMINATE = (PATH_RAY_TERMINATE_ON_NEXT_SURFACE | PATH_RAY_TERMINATE_IN_NEXT_VOLUME | + PATH_RAY_TERMINATE_AFTER_TRANSPARENT | PATH_RAY_TERMINATE_AFTER_VOLUME), + /* Path and shader is being evaluated for direct lighting emission. */ - PATH_RAY_EMISSION = (1 << 22) + PATH_RAY_EMISSION = (1 << 19), + + /* Perform subsurface scattering. */ + PATH_RAY_SUBSURFACE = (1 << 20), + + /* Contribute to denoising features. */ + PATH_RAY_DENOISING_FEATURES = (1 << 21), + + /* Render pass categories. */ + PATH_RAY_REFLECT_PASS = (1 << 22), + PATH_RAY_TRANSMISSION_PASS = (1 << 23), + PATH_RAY_VOLUME_PASS = (1 << 24), + PATH_RAY_ANY_PASS = (PATH_RAY_REFLECT_PASS | PATH_RAY_TRANSMISSION_PASS | PATH_RAY_VOLUME_PASS), + + /* Shadow ray is for a light or surface. 
*/ + PATH_RAY_SHADOW_FOR_LIGHT = (1 << 25), + + /* A shadow catcher object was hit and the path was split into two. */ + PATH_RAY_SHADOW_CATCHER_HIT = (1 << 26), + + /* A shadow catcher object was hit and this path traces only shadow catchers, writing them into + * their dedicated pass for later division. + * + * NOTE: Is not covered with `PATH_RAY_ANY_PASS` because shadow catcher does special handling + * which is separate from the light passes. */ + PATH_RAY_SHADOW_CATCHER_PASS = (1 << 27), + + /* Path is evaluating background for an approximate shadow catcher with non-transparent film. */ + PATH_RAY_SHADOW_CATCHER_BACKGROUND = (1 << 28), }; +/* Configure ray visibility bits for rays and objects respectively, + * to make shadow catchers work. + * + * On shadow catcher paths we want to ignore any intersections with non-catchers, + * whereas on regular paths we want to intersect all objects. */ + +#define SHADOW_CATCHER_VISIBILITY_SHIFT(visibility) ((visibility) << 16) + +#define SHADOW_CATCHER_PATH_VISIBILITY(path_flag, visibility) \ + (((path_flag)&PATH_RAY_SHADOW_CATCHER_PASS) ? SHADOW_CATCHER_VISIBILITY_SHIFT(visibility) : \ + (visibility)) + +#define SHADOW_CATCHER_OBJECT_VISIBILITY(is_shadow_catcher, visibility) \ + (((is_shadow_catcher) ? SHADOW_CATCHER_VISIBILITY_SHIFT(visibility) : 0) | (visibility)) + /* Closure Label */ typedef enum ClosureLabel { @@ -332,6 +318,7 @@ typedef enum ClosureLabel { LABEL_TRANSPARENT = 32, LABEL_VOLUME_SCATTER = 64, LABEL_TRANSMIT_TRANSPARENT = 128, + LABEL_SUBSURFACE_SCATTER = 256, } ClosureLabel; /* Render Passes */ @@ -339,17 +326,35 @@ typedef enum ClosureLabel { #define PASS_NAME_JOIN(a, b) a##_##b #define PASSMASK(pass) (1 << ((PASS_NAME_JOIN(PASS, pass)) % 32)) -#define PASSMASK_COMPONENT(comp) \ - (PASSMASK(PASS_NAME_JOIN(comp, DIRECT)) | PASSMASK(PASS_NAME_JOIN(comp, INDIRECT)) | \ - PASSMASK(PASS_NAME_JOIN(comp, COLOR))) - +// NOTE: Keep in sync with `Pass::get_type_enum()`. 
typedef enum PassType { PASS_NONE = 0, - /* Main passes */ + /* Light Passes */ PASS_COMBINED = 1, - PASS_DEPTH, + PASS_EMISSION, + PASS_BACKGROUND, + PASS_AO, + PASS_SHADOW, + PASS_DIFFUSE, + PASS_DIFFUSE_DIRECT, + PASS_DIFFUSE_INDIRECT, + PASS_GLOSSY, + PASS_GLOSSY_DIRECT, + PASS_GLOSSY_INDIRECT, + PASS_TRANSMISSION, + PASS_TRANSMISSION_DIRECT, + PASS_TRANSMISSION_INDIRECT, + PASS_VOLUME, + PASS_VOLUME_DIRECT, + PASS_VOLUME_INDIRECT, + PASS_CATEGORY_LIGHT_END = 31, + + /* Data passes */ + PASS_DEPTH = 32, + PASS_POSITION, PASS_NORMAL, + PASS_ROUGHNESS, PASS_UV, PASS_OBJECT_ID, PASS_MATERIAL_ID, @@ -361,31 +366,35 @@ typedef enum PassType { PASS_AOV_VALUE, PASS_ADAPTIVE_AUX_BUFFER, PASS_SAMPLE_COUNT, - PASS_CATEGORY_MAIN_END = 31, - - PASS_MIST = 32, - PASS_EMISSION, - PASS_BACKGROUND, - PASS_AO, - PASS_SHADOW, - PASS_LIGHT, /* no real pass, used to force use_light_pass */ - PASS_DIFFUSE_DIRECT, - PASS_DIFFUSE_INDIRECT, PASS_DIFFUSE_COLOR, - PASS_GLOSSY_DIRECT, - PASS_GLOSSY_INDIRECT, PASS_GLOSSY_COLOR, - PASS_TRANSMISSION_DIRECT, - PASS_TRANSMISSION_INDIRECT, PASS_TRANSMISSION_COLOR, - PASS_VOLUME_DIRECT = 50, - PASS_VOLUME_INDIRECT, /* No Scatter color since it's tricky to define what it would even mean. */ - PASS_CATEGORY_LIGHT_END = 63, + PASS_MIST, + PASS_DENOISING_NORMAL, + PASS_DENOISING_ALBEDO, + + /* PASS_SHADOW_CATCHER accumulates contribution of shadow catcher object which is not affected by + * any other object. The pass accessor will divide the combined pass by the shadow catcher. The + * result of this division is then to be multiplied with the backdrop. The alpha channel of this + * pass contains number of samples which contributed to the color components of the pass. + * + * PASS_SHADOW_CATCHER_SAMPLE_COUNT contains number of samples for which the path split + * happenned. + * + * PASS_SHADOW_CATCHER_MATTE contains pass which contains non-catcher objects. This pass is to be + * alpha-overed onto the backdrop (after multiplication). 
*/ + PASS_SHADOW_CATCHER, + PASS_SHADOW_CATCHER_SAMPLE_COUNT, + PASS_SHADOW_CATCHER_MATTE, + + PASS_CATEGORY_DATA_END = 63, PASS_BAKE_PRIMITIVE, PASS_BAKE_DIFFERENTIAL, - PASS_CATEGORY_BAKE_END = 95 + PASS_CATEGORY_BAKE_END = 95, + + PASS_NUM, } PassType; #define PASS_ANY (~0) @@ -398,158 +407,9 @@ typedef enum CryptomatteType { CRYPT_ACCURATE = (1 << 3), } CryptomatteType; -typedef enum DenoisingPassOffsets { - DENOISING_PASS_NORMAL = 0, - DENOISING_PASS_NORMAL_VAR = 3, - DENOISING_PASS_ALBEDO = 6, - DENOISING_PASS_ALBEDO_VAR = 9, - DENOISING_PASS_DEPTH = 12, - DENOISING_PASS_DEPTH_VAR = 13, - DENOISING_PASS_SHADOW_A = 14, - DENOISING_PASS_SHADOW_B = 17, - DENOISING_PASS_COLOR = 20, - DENOISING_PASS_COLOR_VAR = 23, - DENOISING_PASS_CLEAN = 26, - - DENOISING_PASS_PREFILTERED_DEPTH = 0, - DENOISING_PASS_PREFILTERED_NORMAL = 1, - DENOISING_PASS_PREFILTERED_SHADOWING = 4, - DENOISING_PASS_PREFILTERED_ALBEDO = 5, - DENOISING_PASS_PREFILTERED_COLOR = 8, - DENOISING_PASS_PREFILTERED_VARIANCE = 11, - DENOISING_PASS_PREFILTERED_INTENSITY = 14, - - DENOISING_PASS_SIZE_BASE = 26, - DENOISING_PASS_SIZE_CLEAN = 3, - DENOISING_PASS_SIZE_PREFILTERED = 15, -} DenoisingPassOffsets; - -typedef enum eBakePassFilter { - BAKE_FILTER_NONE = 0, - BAKE_FILTER_DIRECT = (1 << 0), - BAKE_FILTER_INDIRECT = (1 << 1), - BAKE_FILTER_COLOR = (1 << 2), - BAKE_FILTER_DIFFUSE = (1 << 3), - BAKE_FILTER_GLOSSY = (1 << 4), - BAKE_FILTER_TRANSMISSION = (1 << 5), - BAKE_FILTER_EMISSION = (1 << 6), - BAKE_FILTER_AO = (1 << 7), -} eBakePassFilter; - -typedef enum BakePassFilterCombos { - BAKE_FILTER_COMBINED = (BAKE_FILTER_DIRECT | BAKE_FILTER_INDIRECT | BAKE_FILTER_DIFFUSE | - BAKE_FILTER_GLOSSY | BAKE_FILTER_TRANSMISSION | BAKE_FILTER_EMISSION | - BAKE_FILTER_AO), - BAKE_FILTER_DIFFUSE_DIRECT = (BAKE_FILTER_DIRECT | BAKE_FILTER_DIFFUSE), - BAKE_FILTER_GLOSSY_DIRECT = (BAKE_FILTER_DIRECT | BAKE_FILTER_GLOSSY), - BAKE_FILTER_TRANSMISSION_DIRECT = (BAKE_FILTER_DIRECT | BAKE_FILTER_TRANSMISSION), - 
BAKE_FILTER_DIFFUSE_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_DIFFUSE), - BAKE_FILTER_GLOSSY_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_GLOSSY), - BAKE_FILTER_TRANSMISSION_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_TRANSMISSION), -} BakePassFilterCombos; - -typedef enum DenoiseFlag { - DENOISING_CLEAN_DIFFUSE_DIR = (1 << 0), - DENOISING_CLEAN_DIFFUSE_IND = (1 << 1), - DENOISING_CLEAN_GLOSSY_DIR = (1 << 2), - DENOISING_CLEAN_GLOSSY_IND = (1 << 3), - DENOISING_CLEAN_TRANSMISSION_DIR = (1 << 4), - DENOISING_CLEAN_TRANSMISSION_IND = (1 << 5), - DENOISING_CLEAN_ALL_PASSES = (1 << 6) - 1, -} DenoiseFlag; - -typedef ccl_addr_space struct PathRadianceState { -#ifdef __PASSES__ - float3 diffuse; - float3 glossy; - float3 transmission; - float3 volume; - - float3 direct; -#endif -} PathRadianceState; - -typedef ccl_addr_space struct PathRadiance { -#ifdef __PASSES__ - int use_light_pass; -#endif - - float transparent; - float3 emission; -#ifdef __PASSES__ - float3 background; - float3 ao; - - float3 indirect; - float3 direct_emission; - - float3 color_diffuse; - float3 color_glossy; - float3 color_transmission; - - float3 direct_diffuse; - float3 direct_glossy; - float3 direct_transmission; - float3 direct_volume; - - float3 indirect_diffuse; - float3 indirect_glossy; - float3 indirect_transmission; - float3 indirect_volume; - - float3 shadow; - float mist; -#endif - - struct PathRadianceState state; - -#ifdef __SHADOW_TRICKS__ - /* Total light reachable across the path, ignoring shadow blocked queries. */ - float3 path_total; - /* Total light reachable across the path with shadow blocked queries - * applied here. - * - * Dividing this figure by path_total will give estimate of shadow pass. - */ - float3 path_total_shaded; - - /* Color of the background on which shadow is alpha-overed. */ - float3 shadow_background_color; - - /* Path radiance sum and throughput at the moment when ray hits shadow - * catcher object. 
- */ - float shadow_throughput; - - /* Accumulated transparency along the path after shadow catcher bounce. */ - float shadow_transparency; - - /* Indicate if any shadow catcher data is set. */ - int has_shadow_catcher; -#endif - -#ifdef __DENOISING_FEATURES__ - float3 denoising_normal; - float3 denoising_albedo; - float denoising_depth; -#endif /* __DENOISING_FEATURES__ */ -} PathRadiance; - typedef struct BsdfEval { -#ifdef __PASSES__ - int use_light_pass; -#endif - float3 diffuse; -#ifdef __PASSES__ float3 glossy; - float3 transmission; - float3 transparent; - float3 volume; -#endif -#ifdef __SHADOW_TRICKS__ - float3 sum_no_mis; -#endif } BsdfEval; /* Shader Flag */ @@ -564,8 +424,10 @@ typedef enum ShaderFlag { SHADER_EXCLUDE_TRANSMIT = (1 << 25), SHADER_EXCLUDE_CAMERA = (1 << 24), SHADER_EXCLUDE_SCATTER = (1 << 23), + SHADER_EXCLUDE_SHADOW_CATCHER = (1 << 22), SHADER_EXCLUDE_ANY = (SHADER_EXCLUDE_DIFFUSE | SHADER_EXCLUDE_GLOSSY | SHADER_EXCLUDE_TRANSMIT | - SHADER_EXCLUDE_CAMERA | SHADER_EXCLUDE_SCATTER), + SHADER_EXCLUDE_CAMERA | SHADER_EXCLUDE_SCATTER | + SHADER_EXCLUDE_SHADOW_CATCHER), SHADER_MASK = ~(SHADER_SMOOTH_NORMAL | SHADER_CAST_SHADOW | SHADER_AREA_LIGHT | SHADER_USE_MIS | SHADER_EXCLUDE_ANY) @@ -612,29 +474,14 @@ typedef struct differential { /* Ray */ typedef struct Ray { -/* TODO(sergey): This is only needed because current AMD - * compiler has hard time building the kernel with this - * reshuffle. And at the same time reshuffle will cause - * less optimal CPU code in certain places. - * - * We'll get rid of this nasty exception once AMD compiler - * is fixed. 
- */ -#ifndef __KERNEL_OPENCL_AMD__ float3 P; /* origin */ float3 D; /* direction */ float t; /* length of the ray */ float time; /* time (for motion blur) */ -#else - float t; /* length of the ray */ - float time; /* time (for motion blur) */ - float3 P; /* origin */ - float3 D; /* direction */ -#endif #ifdef __RAY_DIFFERENTIALS__ - differential3 dP; - differential3 dD; + float dP; + float dD; #endif } Ray; @@ -661,9 +508,6 @@ typedef enum PrimitiveType { PRIMITIVE_CURVE_RIBBON = (1 << 4), PRIMITIVE_MOTION_CURVE_RIBBON = (1 << 5), PRIMITIVE_VOLUME = (1 << 6), - /* Lamp primitive is not included below on purpose, - * since it is no real traceable primitive. - */ PRIMITIVE_LAMP = (1 << 7), PRIMITIVE_ALL_TRIANGLE = (PRIMITIVE_TRIANGLE | PRIMITIVE_MOTION_TRIANGLE), @@ -672,16 +516,14 @@ typedef enum PrimitiveType { PRIMITIVE_ALL_VOLUME = (PRIMITIVE_VOLUME), PRIMITIVE_ALL_MOTION = (PRIMITIVE_MOTION_TRIANGLE | PRIMITIVE_MOTION_CURVE_THICK | PRIMITIVE_MOTION_CURVE_RIBBON), - PRIMITIVE_ALL = (PRIMITIVE_ALL_TRIANGLE | PRIMITIVE_ALL_CURVE | PRIMITIVE_ALL_VOLUME), + PRIMITIVE_ALL = (PRIMITIVE_ALL_TRIANGLE | PRIMITIVE_ALL_CURVE | PRIMITIVE_ALL_VOLUME | + PRIMITIVE_LAMP), - /* Total number of different traceable primitives. - * NOTE: This is an actual value, not a bitflag. 
- */ - PRIMITIVE_NUM_TOTAL = 7, + PRIMITIVE_NUM = 8, } PrimitiveType; -#define PRIMITIVE_PACK_SEGMENT(type, segment) ((segment << PRIMITIVE_NUM_TOTAL) | (type)) -#define PRIMITIVE_UNPACK_SEGMENT(type) (type >> PRIMITIVE_NUM_TOTAL) +#define PRIMITIVE_PACK_SEGMENT(type, segment) ((segment << PRIMITIVE_NUM) | (type)) +#define PRIMITIVE_UNPACK_SEGMENT(type) (type >> PRIMITIVE_NUM) typedef enum CurveShapeType { CURVE_RIBBON = 0, @@ -760,20 +602,14 @@ typedef struct AttributeDescriptor { /* Closure data */ -#ifdef __MULTI_CLOSURE__ -# ifdef __SPLIT_KERNEL__ -# define MAX_CLOSURE 1 -# else -# ifndef __MAX_CLOSURE__ -# define MAX_CLOSURE 64 -# else -# define MAX_CLOSURE __MAX_CLOSURE__ -# endif -# endif +#ifndef __MAX_CLOSURE__ +# define MAX_CLOSURE 64 #else -# define MAX_CLOSURE 1 +# define MAX_CLOSURE __MAX_CLOSURE__ #endif +#define MAX_VOLUME_CLOSURE 8 + /* This struct is the base class for all closures. The common members are * duplicated in all derived classes since we don't have C++ in the kernel * yet, and because it lets us lay out the members to minimize padding. The @@ -866,11 +702,14 @@ enum ShaderDataFlag { SD_NEED_VOLUME_ATTRIBUTES = (1 << 28), /* Shader has emission */ SD_HAS_EMISSION = (1 << 29), + /* Shader has raytracing */ + SD_HAS_RAYTRACE = (1 << 30), SD_SHADER_FLAGS = (SD_USE_MIS | SD_HAS_TRANSPARENT_SHADOW | SD_HAS_VOLUME | SD_HAS_ONLY_VOLUME | SD_HETEROGENEOUS_VOLUME | SD_HAS_BSSRDF_BUMP | SD_VOLUME_EQUIANGULAR | SD_VOLUME_MIS | SD_VOLUME_CUBIC | SD_HAS_BUMP | SD_HAS_DISPLACEMENT | - SD_HAS_CONSTANT_EMISSION | SD_NEED_VOLUME_ATTRIBUTES) + SD_HAS_CONSTANT_EMISSION | SD_NEED_VOLUME_ATTRIBUTES | SD_HAS_EMISSION | + SD_HAS_RAYTRACE) }; /* Object flags. 
*/ @@ -955,19 +794,19 @@ typedef ccl_addr_space struct ccl_align(16) ShaderData #endif #ifdef __OBJECT_MOTION__ - /* object <-> world space transformations, cached to avoid - * re-interpolating them constantly for shading */ - Transform ob_tfm; - Transform ob_itfm; + /* Object <-> world space transformations for motion blur, cached to avoid + * re-interpolating them constantly for shading. */ + Transform ob_tfm_motion; + Transform ob_itfm_motion; #endif /* ray start position, only set for backgrounds */ float3 ray_P; - differential3 ray_dP; + float ray_dP; #ifdef __OSL__ - struct KernelGlobals *osl_globals; - struct PathState *osl_path_state; + const struct KernelGlobals *osl_globals; + const struct IntegratorStateCPU *osl_path_state; #endif /* LCG state for closures that require additional random numbers. */ @@ -976,7 +815,6 @@ typedef ccl_addr_space struct ccl_align(16) ShaderData /* Closure data, we store a fixed array of closures */ int num_closure; int num_closure_left; - float randb_closure; float3 svm_closure_weight; /* Closure weights summed directly, so we can evaluate @@ -998,7 +836,22 @@ typedef ccl_addr_space struct ccl_align(16) ShaderDataTinyStorage ShaderDataTinyStorage; #define AS_SHADER_DATA(shader_data_tiny_storage) ((ShaderData *)shader_data_tiny_storage) -/* Path State */ +/* Compact volume closures storage. + * + * Used for decoupled direct/indirect light closure storage. 
*/ + +ccl_addr_space struct ShaderVolumeClosure { + float3 weight; + float sample_weight; + float g; +}; + +ccl_addr_space struct ShaderVolumePhases { + ShaderVolumeClosure closure[MAX_VOLUME_CLOSURE]; + int num_closure; +}; + +/* Volume Stack */ #ifdef __VOLUME__ typedef struct VolumeStack { @@ -1007,53 +860,6 @@ typedef struct VolumeStack { } VolumeStack; #endif -typedef struct PathState { - /* see enum PathRayFlag */ - int flag; - - /* random number generator state */ - uint rng_hash; /* per pixel hash */ - int rng_offset; /* dimension offset */ - int sample; /* path sample number */ - int num_samples; /* total number of times this path will be sampled */ - float branch_factor; /* number of branches in indirect paths */ - - /* bounce counting */ - int bounce; - int diffuse_bounce; - int glossy_bounce; - int transmission_bounce; - int transparent_bounce; - -#ifdef __DENOISING_FEATURES__ - float denoising_feature_weight; - float3 denoising_feature_throughput; -#endif /* __DENOISING_FEATURES__ */ - - /* multiple importance sampling */ - float min_ray_pdf; /* smallest bounce pdf over entire path up to now */ - float ray_pdf; /* last bounce pdf */ -#ifdef __LAMP_MIS__ - float ray_t; /* accumulated distance through transparent surfaces */ -#endif - - /* volume rendering */ -#ifdef __VOLUME__ - int volume_bounce; - int volume_bounds_bounce; - VolumeStack volume_stack[VOLUME_STACK_SIZE]; -#endif -} PathState; - -#ifdef __VOLUME__ -typedef struct VolumeState { -# ifdef __SPLIT_KERNEL__ -# else - PathState ps; -# endif -} VolumeState; -#endif - /* Struct to gather multiple nearby intersections. */ typedef struct LocalIntersection { Ray ray; @@ -1064,20 +870,6 @@ typedef struct LocalIntersection { float3 Ng[LOCAL_MAX_HITS]; } LocalIntersection; -/* Subsurface */ - -/* Struct to gather SSS indirect rays and delay tracing them. 
*/ -typedef struct SubsurfaceIndirectRays { - PathState state[BSSRDF_MAX_HITS]; - - int num_rays; - - struct Ray rays[BSSRDF_MAX_HITS]; - float3 throughputs[BSSRDF_MAX_HITS]; - struct PathRadianceState L_state[BSSRDF_MAX_HITS]; -} SubsurfaceIndirectRays; -static_assert(BSSRDF_MAX_HITS <= LOCAL_MAX_HITS, "BSSRDF hits too high."); - /* Constant Kernel Data * * These structs are passed from CPU to various devices, and the struct layout @@ -1128,7 +920,7 @@ typedef struct KernelCamera { /* render size */ float width, height; - int resolution; + int pad1; /* anamorphic lens bokeh */ float inv_aperture_ratio; @@ -1169,11 +961,12 @@ typedef struct KernelFilm { int light_pass_flag; int pass_stride; - int use_light_pass; int pass_combined; int pass_depth; + int pass_position; int pass_normal; + int pass_roughness; int pass_motion; int pass_motion_weight; @@ -1202,7 +995,13 @@ typedef struct KernelFilm { int pass_shadow; float pass_shadow_scale; + + int pass_shadow_catcher; + int pass_shadow_catcher_sample_count; + int pass_shadow_catcher_matte; + int filter_table_offset; + int cryptomatte_passes; int cryptomatte_depth; int pass_cryptomatte; @@ -1215,15 +1014,11 @@ typedef struct KernelFilm { float mist_inv_depth; float mist_falloff; - int pass_denoising_data; - int pass_denoising_clean; - int denoising_flags; + int pass_denoising_normal; + int pass_denoising_albedo; int pass_aov_color; int pass_aov_value; - int pass_aov_color_num; - int pass_aov_value_num; - int pad1, pad2, pad3; /* XYZ to rendering color space transform. float4 instead of float3 to * ensure consistent padding/alignment across devices. 
*/ @@ -1234,19 +1029,54 @@ typedef struct KernelFilm { int pass_bake_primitive; int pass_bake_differential; - int pad; - /* viewport rendering options */ - int display_pass_stride; - int display_pass_components; - int display_divide_pass_stride; - int use_display_exposure; - int use_display_pass_alpha; + int use_approximate_shadow_catcher; - int pad4, pad5, pad6; + int pad1, pad2, pad3; } KernelFilm; static_assert_align(KernelFilm, 16); +typedef struct KernelFilmConvert { + int pass_offset; + int pass_stride; + + int pass_use_exposure; + int pass_use_filter; + + int pass_divide; + int pass_indirect; + + int pass_combined; + int pass_sample_count; + int pass_adaptive_aux_buffer; + int pass_motion_weight; + int pass_shadow_catcher; + int pass_shadow_catcher_sample_count; + int pass_shadow_catcher_matte; + int pass_background; + + float scale; + float exposure; + float scale_exposure; + + int use_approximate_shadow_catcher; + int use_approximate_shadow_catcher_background; + int show_active_pixels; + + /* Number of components to write to. */ + int num_components; + + /* Number of floats per pixel. When zero is the same as `num_components`. + * NOTE: Is ignored for half4 destination. */ + int pixel_stride; + + int is_denoised; + + /* Padding. 
*/ + int pad1; +} KernelFilmConvert; +static_assert_align(KernelFilmConvert, 16); + typedef struct KernelBackground { /* only shader index */ int surface_shader; @@ -1255,11 +1085,6 @@ typedef struct KernelBackground { int transparent; float transparent_roughness_squared_threshold; - /* ambient occlusion */ - float ao_factor; - float ao_distance; - float ao_bounces_factor; - /* portal sampling */ float portal_weight; int num_portals; @@ -1277,13 +1102,15 @@ typedef struct KernelBackground { int map_res_y; int use_mis; + + /* Padding */ + int pad1, pad2, pad3; } KernelBackground; static_assert_align(KernelBackground, 16); typedef struct KernelIntegrator { /* emission */ int use_direct_light; - int use_ambient_occlusion; int num_distribution; int num_all_lights; float pdf_triangles; @@ -1299,7 +1126,10 @@ typedef struct KernelIntegrator { int max_transmission_bounce; int max_volume_bounce; + /* AO bounces */ int ao_bounces; + float ao_bounces_distance; + float ao_bounces_factor; /* transparent */ int transparent_min_bounce; @@ -1318,39 +1148,20 @@ typedef struct KernelIntegrator { float sample_clamp_direct; float sample_clamp_indirect; - /* branched path */ - int branched; - int volume_decoupled; - int diffuse_samples; - int glossy_samples; - int transmission_samples; - int ao_samples; - int mesh_light_samples; - int subsurface_samples; - int sample_all_lights_direct; - int sample_all_lights_indirect; - /* mis */ int use_lamp_mis; /* sampler */ int sampling_pattern; - int aa_samples; - int adaptive_min_samples; - int adaptive_step; - int adaptive_stop_per_sample; - float adaptive_threshold; /* volume render */ int use_volumes; int volume_max_steps; float volume_step_rate; - int volume_samples; - - int start_sample; - int max_closures; + int has_shadow_catcher; + /* padding */ int pad1, pad2; } KernelIntegrator; static_assert_align(KernelIntegrator, 16); @@ -1401,14 +1212,19 @@ typedef struct KernelTables { static_assert_align(KernelTables, 16); typedef struct 
KernelBake { + int use; int object_index; int tri_offset; - int type; - int pass_filter; + int pad1; } KernelBake; static_assert_align(KernelBake, 16); typedef struct KernelData { + uint kernel_features; + uint max_closures; + uint max_shaders; + uint pad; + KernelCamera cam; KernelFilm film; KernelBackground background; @@ -1485,11 +1301,10 @@ typedef struct KernelLight { int type; float co[3]; int shader_id; - int samples; float max_bounces; float random; float strength[3]; - float pad1; + float pad1, pad2; Transform tfm; Transform itfm; union { @@ -1539,110 +1354,6 @@ typedef struct KernelShader { } KernelShader; static_assert_align(KernelShader, 16); -/* Declarations required for split kernel */ - -/* Macro for queues */ -/* Value marking queue's empty slot */ -#define QUEUE_EMPTY_SLOT -1 - -/* - * Queue 1 - Active rays - * Queue 2 - Background queue - * Queue 3 - Shadow ray cast kernel - AO - * Queue 4 - Shadow ray cast kernel - direct lighting - */ - -/* Queue names */ -enum QueueNumber { - /* All active rays and regenerated rays are enqueued here. */ - QUEUE_ACTIVE_AND_REGENERATED_RAYS = 0, - - /* All - * 1. Background-hit rays, - * 2. Rays that has exited path-iteration but needs to update output buffer - * 3. Rays to be regenerated - * are enqueued here. - */ - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - - /* All rays for which a shadow ray should be cast to determine radiance - * contribution for AO are enqueued here. - */ - QUEUE_SHADOW_RAY_CAST_AO_RAYS, - - /* All rays for which a shadow ray should be cast to determine radiance - * contributing for direct lighting are enqueued here. - */ - QUEUE_SHADOW_RAY_CAST_DL_RAYS, - - /* Rays sorted according to shader->id */ - QUEUE_SHADER_SORTED_RAYS, - -#ifdef __BRANCHED_PATH__ - /* All rays moving to next iteration of the indirect loop for light */ - QUEUE_LIGHT_INDIRECT_ITER, - /* Queue of all inactive rays. 
These are candidates for sharing work of indirect loops */ - QUEUE_INACTIVE_RAYS, -# ifdef __VOLUME__ - /* All rays moving to next iteration of the indirect loop for volumes */ - QUEUE_VOLUME_INDIRECT_ITER, -# endif -# ifdef __SUBSURFACE__ - /* All rays moving to next iteration of the indirect loop for subsurface */ - QUEUE_SUBSURFACE_INDIRECT_ITER, -# endif -#endif /* __BRANCHED_PATH__ */ - - NUM_QUEUES -}; - -/* We use RAY_STATE_MASK to get ray_state */ -#define RAY_STATE_MASK 0x0F -#define RAY_FLAG_MASK 0xF0 -enum RayState { - RAY_INVALID = 0, - /* Denotes ray is actively involved in path-iteration. */ - RAY_ACTIVE, - /* Denotes ray has completed processing all samples and is inactive. */ - RAY_INACTIVE, - /* Denotes ray has exited path-iteration and needs to update output buffer. */ - RAY_UPDATE_BUFFER, - /* Denotes ray needs to skip most surface shader work. */ - RAY_HAS_ONLY_VOLUME, - /* Denotes ray has hit background */ - RAY_HIT_BACKGROUND, - /* Denotes ray has to be regenerated */ - RAY_TO_REGENERATE, - /* Denotes ray has been regenerated */ - RAY_REGENERATED, - /* Denotes ray is moving to next iteration of the branched indirect loop */ - RAY_LIGHT_INDIRECT_NEXT_ITER, - RAY_VOLUME_INDIRECT_NEXT_ITER, - RAY_SUBSURFACE_INDIRECT_NEXT_ITER, - - /* Ray flags */ - - /* Flags to denote that the ray is currently evaluating the branched indirect loop */ - RAY_BRANCHED_LIGHT_INDIRECT = (1 << 4), - RAY_BRANCHED_VOLUME_INDIRECT = (1 << 5), - RAY_BRANCHED_SUBSURFACE_INDIRECT = (1 << 6), - RAY_BRANCHED_INDIRECT = (RAY_BRANCHED_LIGHT_INDIRECT | RAY_BRANCHED_VOLUME_INDIRECT | - RAY_BRANCHED_SUBSURFACE_INDIRECT), - - /* Ray is evaluating an iteration of an indirect loop for another thread */ - RAY_BRANCHED_INDIRECT_SHARED = (1 << 7), -}; - -#define ASSIGN_RAY_STATE(ray_state, ray_index, state) \ - (ray_state[ray_index] = ((ray_state[ray_index] & RAY_FLAG_MASK) | state)) -#define IS_STATE(ray_state, ray_index, state) \ - ((ray_index) != QUEUE_EMPTY_SLOT && 
((ray_state)[(ray_index)] & RAY_STATE_MASK) == (state)) -#define ADD_RAY_FLAG(ray_state, ray_index, flag) \ - (ray_state[ray_index] = (ray_state[ray_index] | flag)) -#define REMOVE_RAY_FLAG(ray_state, ray_index, flag) \ - (ray_state[ray_index] = (ray_state[ray_index] & (~flag))) -#define IS_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] & flag) - /* Patches */ #define PATCH_MAX_CONTROL_VERTS 16 @@ -1655,7 +1366,7 @@ enum RayState { /* Work Tiles */ -typedef struct WorkTile { +typedef struct KernelWorkTile { uint x, y, w, h; uint start_sample; @@ -1664,13 +1375,172 @@ typedef struct WorkTile { int offset; uint stride; - ccl_global float *buffer; -} WorkTile; + /* Precalculated parameters used by init_from_camera kernel on GPU. */ + int path_index_offset; + int work_size; +} KernelWorkTile; + +/* Shader Evaluation. + * + * Position on a primitive on an object at which we want to evaluate the + * shader for e.g. mesh displacement or light importance map. */ + +typedef struct KernelShaderEvalInput { + int object; + int prim; + float u, v; +} KernelShaderEvalInput; +static_assert_align(KernelShaderEvalInput, 16); /* Pre-computed sample table sizes for PMJ02 sampler. */ -#define NUM_PMJ_SAMPLES (64 * 64) -#define NUM_PMJ_PATTERNS 48 +#define NUM_PMJ_DIVISIONS 32 +#define NUM_PMJ_SAMPLES ((NUM_PMJ_DIVISIONS) * (NUM_PMJ_DIVISIONS)) +#define NUM_PMJ_PATTERNS 1 -CCL_NAMESPACE_END +/* Device kernels. + * + * Identifier for kernels that can be executed in device queues. + * + * Some implementation details. + * + * If the kernel uses shared CUDA memory, `CUDADeviceQueue::enqueue` is to be modified. + * The path iteration kernels are handled in `PathTraceWorkGPU::enqueue_path_iteration`. 
*/ + +typedef enum DeviceKernel { + DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA = 0, + DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE, + DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST, + DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, + DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE, + DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK, + DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND, + DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT, + DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, + DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, + DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME, + DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW, + DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL, + + DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY, + DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY, + DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY, + DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY, + DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, + DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY, + DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES, + DEVICE_KERNEL_INTEGRATOR_RESET, + DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS, + + DEVICE_KERNEL_SHADER_EVAL_DISPLACE, + DEVICE_KERNEL_SHADER_EVAL_BACKGROUND, + +#define DECLARE_FILM_CONVERT_KERNEL(variant) \ + DEVICE_KERNEL_FILM_CONVERT_##variant, DEVICE_KERNEL_FILM_CONVERT_##variant##_HALF_RGBA + + DECLARE_FILM_CONVERT_KERNEL(DEPTH), + DECLARE_FILM_CONVERT_KERNEL(MIST), + DECLARE_FILM_CONVERT_KERNEL(SAMPLE_COUNT), + DECLARE_FILM_CONVERT_KERNEL(FLOAT), + DECLARE_FILM_CONVERT_KERNEL(LIGHT_PATH), + DECLARE_FILM_CONVERT_KERNEL(FLOAT3), + DECLARE_FILM_CONVERT_KERNEL(MOTION), + DECLARE_FILM_CONVERT_KERNEL(CRYPTOMATTE), + DECLARE_FILM_CONVERT_KERNEL(SHADOW_CATCHER), + DECLARE_FILM_CONVERT_KERNEL(SHADOW_CATCHER_MATTE_WITH_SHADOW), + DECLARE_FILM_CONVERT_KERNEL(COMBINED), + DECLARE_FILM_CONVERT_KERNEL(FLOAT4), + +#undef DECLARE_FILM_CONVERT_KERNEL + + DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK, + DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X, + DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y, + + 
DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS, + DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO, + DEVICE_KERNEL_FILTER_COLOR_PREPROCESS, + DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS, + + DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS, + + DEVICE_KERNEL_PREFIX_SUM, + + DEVICE_KERNEL_NUM, +} DeviceKernel; + +enum { + DEVICE_KERNEL_INTEGRATOR_NUM = DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL + 1, +}; + +/* Kernel Features */ + +enum KernelFeatureFlag : unsigned int { + /* Shader nodes. */ + KERNEL_FEATURE_NODE_BSDF = (1U << 0U), + KERNEL_FEATURE_NODE_EMISSION = (1U << 1U), + KERNEL_FEATURE_NODE_VOLUME = (1U << 2U), + KERNEL_FEATURE_NODE_HAIR = (1U << 3U), + KERNEL_FEATURE_NODE_BUMP = (1U << 4U), + KERNEL_FEATURE_NODE_BUMP_STATE = (1U << 5U), + KERNEL_FEATURE_NODE_VORONOI_EXTRA = (1U << 6U), + KERNEL_FEATURE_NODE_RAYTRACE = (1U << 7U), + + /* Use denoising kernels and output denoising passes. */ + KERNEL_FEATURE_DENOISING = (1U << 8U), + + /* Use path tracing kernels. */ + KERNEL_FEATURE_PATH_TRACING = (1U << 9U), -#endif /* __KERNEL_TYPES_H__ */ + /* BVH/sampling kernel features. */ + KERNEL_FEATURE_HAIR = (1U << 10U), + KERNEL_FEATURE_HAIR_THICK = (1U << 11U), + KERNEL_FEATURE_OBJECT_MOTION = (1U << 12U), + KERNEL_FEATURE_CAMERA_MOTION = (1U << 13U), + + /* Denotes whether baking functionality is needed. */ + KERNEL_FEATURE_BAKING = (1U << 14U), + + /* Use subsurface scattering materials. */ + KERNEL_FEATURE_SUBSURFACE = (1U << 15U), + + /* Use volume materials. */ + KERNEL_FEATURE_VOLUME = (1U << 16U), + + /* Use OpenSubdiv patch evaluation */ + KERNEL_FEATURE_PATCH_EVALUATION = (1U << 17U), + + /* Use Transparent shadows */ + KERNEL_FEATURE_TRANSPARENT = (1U << 18U), + + /* Use shadow catcher. */ + KERNEL_FEATURE_SHADOW_CATCHER = (1U << 19U), + + /* Per-uber shader usage flags. */ + KERNEL_FEATURE_PRINCIPLED = (1U << 20U), + + /* Light render passes. */ + KERNEL_FEATURE_LIGHT_PASSES = (1U << 21U), + + /* Shadow render pass. 
*/ + KERNEL_FEATURE_SHADOW_PASS = (1U << 22U), +}; + +/* Shader node feature mask, to specialize shader evaluation for kernels. */ + +#define KERNEL_FEATURE_NODE_MASK_SURFACE_LIGHT \ + (KERNEL_FEATURE_NODE_EMISSION | KERNEL_FEATURE_NODE_VORONOI_EXTRA) +#define KERNEL_FEATURE_NODE_MASK_SURFACE_SHADOW \ + (KERNEL_FEATURE_NODE_BSDF | KERNEL_FEATURE_NODE_EMISSION | KERNEL_FEATURE_NODE_VOLUME | \ + KERNEL_FEATURE_NODE_HAIR | KERNEL_FEATURE_NODE_BUMP | KERNEL_FEATURE_NODE_BUMP_STATE | \ + KERNEL_FEATURE_NODE_VORONOI_EXTRA) +#define KERNEL_FEATURE_NODE_MASK_SURFACE \ + (KERNEL_FEATURE_NODE_MASK_SURFACE_SHADOW | KERNEL_FEATURE_NODE_RAYTRACE) +#define KERNEL_FEATURE_NODE_MASK_VOLUME \ + (KERNEL_FEATURE_NODE_EMISSION | KERNEL_FEATURE_NODE_VOLUME | KERNEL_FEATURE_NODE_VORONOI_EXTRA) +#define KERNEL_FEATURE_NODE_MASK_DISPLACEMENT \ + (KERNEL_FEATURE_NODE_VORONOI_EXTRA | KERNEL_FEATURE_NODE_BUMP | KERNEL_FEATURE_NODE_BUMP_STATE) +#define KERNEL_FEATURE_NODE_MASK_BUMP KERNEL_FEATURE_NODE_MASK_DISPLACEMENT + +#define KERNEL_NODES_FEATURE(feature) ((node_feature_mask & (KERNEL_FEATURE_NODE_##feature)) != 0U) + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h deleted file mode 100644 index f6b34be040e..00000000000 --- a/intern/cycles/kernel/kernel_volume.h +++ /dev/null @@ -1,1440 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -CCL_NAMESPACE_BEGIN - -/* Ignore paths that have volume throughput below this value, to avoid unnecessary work - * and precision issues. - * todo: this value could be tweaked or turned into a probability to avoid unnecessary - * work in volumes and subsurface scattering. */ -#define VOLUME_THROUGHPUT_EPSILON 1e-6f - -/* Events for probalistic scattering */ - -typedef enum VolumeIntegrateResult { - VOLUME_PATH_SCATTERED = 0, - VOLUME_PATH_ATTENUATED = 1, - VOLUME_PATH_MISSED = 2 -} VolumeIntegrateResult; - -/* Volume shader properties - * - * extinction coefficient = absorption coefficient + scattering coefficient - * sigma_t = sigma_a + sigma_s */ - -typedef struct VolumeShaderCoefficients { - float3 sigma_t; - float3 sigma_s; - float3 emission; -} VolumeShaderCoefficients; - -#ifdef __VOLUME__ - -/* evaluate shader to get extinction coefficient at P */ -ccl_device_inline bool volume_shader_extinction_sample(KernelGlobals *kg, - ShaderData *sd, - ccl_addr_space PathState *state, - float3 P, - float3 *extinction) -{ - sd->P = P; - shader_eval_volume(kg, sd, state, state->volume_stack, PATH_RAY_SHADOW); - - if (sd->flag & SD_EXTINCTION) { - const float density = object_volume_density(kg, sd->object); - *extinction = sd->closure_transparent_extinction * density; - return true; - } - else { - return false; - } -} - -/* evaluate shader to get absorption, scattering and emission at P */ -ccl_device_inline bool volume_shader_sample(KernelGlobals *kg, - ShaderData *sd, - ccl_addr_space PathState *state, - float3 P, - VolumeShaderCoefficients *coeff) -{ - sd->P = P; - shader_eval_volume(kg, sd, state, state->volume_stack, state->flag); - - if (!(sd->flag & (SD_EXTINCTION | SD_SCATTER | SD_EMISSION))) - return false; - - coeff->sigma_s = zero_float3(); - coeff->sigma_t = (sd->flag & SD_EXTINCTION) ? sd->closure_transparent_extinction : zero_float3(); - coeff->emission = (sd->flag & SD_EMISSION) ? 
sd->closure_emission_background : zero_float3(); - - if (sd->flag & SD_SCATTER) { - for (int i = 0; i < sd->num_closure; i++) { - const ShaderClosure *sc = &sd->closure[i]; - - if (CLOSURE_IS_VOLUME(sc->type)) - coeff->sigma_s += sc->weight; - } - } - - const float density = object_volume_density(kg, sd->object); - coeff->sigma_s *= density; - coeff->sigma_t *= density; - coeff->emission *= density; - - return true; -} - -#endif /* __VOLUME__ */ - -ccl_device float3 volume_color_transmittance(float3 sigma, float t) -{ - return exp3(-sigma * t); -} - -ccl_device float kernel_volume_channel_get(float3 value, int channel) -{ - return (channel == 0) ? value.x : ((channel == 1) ? value.y : value.z); -} - -#ifdef __VOLUME__ - -ccl_device float volume_stack_step_size(KernelGlobals *kg, ccl_addr_space VolumeStack *stack) -{ - float step_size = FLT_MAX; - - for (int i = 0; stack[i].shader != SHADER_NONE; i++) { - int shader_flag = kernel_tex_fetch(__shaders, (stack[i].shader & SHADER_MASK)).flags; - - bool heterogeneous = false; - - if (shader_flag & SD_HETEROGENEOUS_VOLUME) { - heterogeneous = true; - } - else if (shader_flag & SD_NEED_VOLUME_ATTRIBUTES) { - /* We want to render world or objects without any volume grids - * as homogeneous, but can only verify this at run-time since other - * heterogeneous volume objects may be using the same shader. 
*/ - int object = stack[i].object; - if (object != OBJECT_NONE) { - int object_flag = kernel_tex_fetch(__object_flag, object); - if (object_flag & SD_OBJECT_HAS_VOLUME_ATTRIBUTES) { - heterogeneous = true; - } - } - } - - if (heterogeneous) { - float object_step_size = object_volume_step_size(kg, stack[i].object); - object_step_size *= kernel_data.integrator.volume_step_rate; - step_size = fminf(object_step_size, step_size); - } - } - - return step_size; -} - -ccl_device int volume_stack_sampling_method(KernelGlobals *kg, VolumeStack *stack) -{ - if (kernel_data.integrator.num_all_lights == 0) - return 0; - - int method = -1; - - for (int i = 0; stack[i].shader != SHADER_NONE; i++) { - int shader_flag = kernel_tex_fetch(__shaders, (stack[i].shader & SHADER_MASK)).flags; - - if (shader_flag & SD_VOLUME_MIS) { - return SD_VOLUME_MIS; - } - else if (shader_flag & SD_VOLUME_EQUIANGULAR) { - if (method == 0) - return SD_VOLUME_MIS; - - method = SD_VOLUME_EQUIANGULAR; - } - else { - if (method == SD_VOLUME_EQUIANGULAR) - return SD_VOLUME_MIS; - - method = 0; - } - } - - return method; -} - -ccl_device_inline void kernel_volume_step_init(KernelGlobals *kg, - ccl_addr_space PathState *state, - const float object_step_size, - float t, - float *step_size, - float *step_shade_offset, - float *steps_offset) -{ - const int max_steps = kernel_data.integrator.volume_max_steps; - float step = min(object_step_size, t); - - /* compute exact steps in advance for malloc */ - if (t > max_steps * step) { - step = t / (float)max_steps; - } - - *step_size = step; - - /* Perform shading at this offset within a step, to integrate over - * over the entire step segment. */ - *step_shade_offset = path_state_rng_1D_hash(kg, state, 0x1e31d8a4); - - /* Shift starting point of all segment by this random amount to avoid - * banding artifacts from the volume bounding shape. 
*/ - *steps_offset = path_state_rng_1D_hash(kg, state, 0x3d22c7b3); -} - -/* Volume Shadows - * - * These functions are used to attenuate shadow rays to lights. Both absorption - * and scattering will block light, represented by the extinction coefficient. */ - -/* homogeneous volume: assume shader evaluation at the starts gives - * the extinction coefficient for the entire line segment */ -ccl_device void kernel_volume_shadow_homogeneous(KernelGlobals *kg, - ccl_addr_space PathState *state, - Ray *ray, - ShaderData *sd, - float3 *throughput) -{ - float3 sigma_t = zero_float3(); - - if (volume_shader_extinction_sample(kg, sd, state, ray->P, &sigma_t)) - *throughput *= volume_color_transmittance(sigma_t, ray->t); -} - -/* heterogeneous volume: integrate stepping through the volume until we - * reach the end, get absorbed entirely, or run out of iterations */ -ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, - ccl_addr_space PathState *state, - Ray *ray, - ShaderData *sd, - float3 *throughput, - const float object_step_size) -{ - float3 tp = *throughput; - - /* Prepare for stepping. - * For shadows we do not offset all segments, since the starting point is - * already a random distance inside the volume. It also appears to create - * banding artifacts for unknown reasons. 
*/ - int max_steps = kernel_data.integrator.volume_max_steps; - float step_size, step_shade_offset, unused; - kernel_volume_step_init( - kg, state, object_step_size, ray->t, &step_size, &step_shade_offset, &unused); - const float steps_offset = 1.0f; - - /* compute extinction at the start */ - float t = 0.0f; - - float3 sum = zero_float3(); - - for (int i = 0; i < max_steps; i++) { - /* advance to new position */ - float new_t = min(ray->t, (i + steps_offset) * step_size); - float dt = new_t - t; - - float3 new_P = ray->P + ray->D * (t + dt * step_shade_offset); - float3 sigma_t = zero_float3(); - - /* compute attenuation over segment */ - if (volume_shader_extinction_sample(kg, sd, state, new_P, &sigma_t)) { - /* Compute expf() only for every Nth step, to save some calculations - * because exp(a)*exp(b) = exp(a+b), also do a quick VOLUME_THROUGHPUT_EPSILON - * check then. */ - sum += (-sigma_t * dt); - if ((i & 0x07) == 0) { /* ToDo: Other interval? */ - tp = *throughput * exp3(sum); - - /* stop if nearly all light is blocked */ - if (tp.x < VOLUME_THROUGHPUT_EPSILON && tp.y < VOLUME_THROUGHPUT_EPSILON && - tp.z < VOLUME_THROUGHPUT_EPSILON) - break; - } - } - - /* stop if at the end of the volume */ - t = new_t; - if (t == ray->t) { - /* Update throughput in case we haven't done it above */ - tp = *throughput * exp3(sum); - break; - } - } - - *throughput = tp; -} - -/* get the volume attenuation over line segment defined by ray, with the - * assumption that there are no surfaces blocking light between the endpoints */ -# if defined(__KERNEL_OPTIX__) && defined(__SHADER_RAYTRACE__) -ccl_device_inline void kernel_volume_shadow(KernelGlobals *kg, - ShaderData *shadow_sd, - ccl_addr_space PathState *state, - Ray *ray, - float3 *throughput) -{ - optixDirectCall<void>(1, kg, shadow_sd, state, ray, throughput); -} -extern "C" __device__ void __direct_callable__kernel_volume_shadow( -# else -ccl_device_noinline void kernel_volume_shadow( -# endif - KernelGlobals *kg, - 
ShaderData *shadow_sd, - ccl_addr_space PathState *state, - Ray *ray, - float3 *throughput) -{ - shader_setup_from_volume(kg, shadow_sd, ray); - - float step_size = volume_stack_step_size(kg, state->volume_stack); - if (step_size != FLT_MAX) - kernel_volume_shadow_heterogeneous(kg, state, ray, shadow_sd, throughput, step_size); - else - kernel_volume_shadow_homogeneous(kg, state, ray, shadow_sd, throughput); -} - -#endif /* __VOLUME__ */ - -/* Equi-angular sampling as in: - * "Importance Sampling Techniques for Path Tracing in Participating Media" */ - -ccl_device float kernel_volume_equiangular_sample(Ray *ray, float3 light_P, float xi, float *pdf) -{ - float t = ray->t; - - float delta = dot((light_P - ray->P), ray->D); - float D = safe_sqrtf(len_squared(light_P - ray->P) - delta * delta); - if (UNLIKELY(D == 0.0f)) { - *pdf = 0.0f; - return 0.0f; - } - float theta_a = -atan2f(delta, D); - float theta_b = atan2f(t - delta, D); - float t_ = D * tanf((xi * theta_b) + (1 - xi) * theta_a); - if (UNLIKELY(theta_b == theta_a)) { - *pdf = 0.0f; - return 0.0f; - } - *pdf = D / ((theta_b - theta_a) * (D * D + t_ * t_)); - - return min(t, delta + t_); /* min is only for float precision errors */ -} - -ccl_device float kernel_volume_equiangular_pdf(Ray *ray, float3 light_P, float sample_t) -{ - float delta = dot((light_P - ray->P), ray->D); - float D = safe_sqrtf(len_squared(light_P - ray->P) - delta * delta); - if (UNLIKELY(D == 0.0f)) { - return 0.0f; - } - - float t = ray->t; - float t_ = sample_t - delta; - - float theta_a = -atan2f(delta, D); - float theta_b = atan2f(t - delta, D); - if (UNLIKELY(theta_b == theta_a)) { - return 0.0f; - } - - float pdf = D / ((theta_b - theta_a) * (D * D + t_ * t_)); - - return pdf; -} - -/* Distance sampling */ - -ccl_device float kernel_volume_distance_sample( - float max_t, float3 sigma_t, int channel, float xi, float3 *transmittance, float3 *pdf) -{ - /* xi is [0, 1[ so log(0) should never happen, division by zero is - * avoided 
because sample_sigma_t > 0 when SD_SCATTER is set */ - float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel); - float3 full_transmittance = volume_color_transmittance(sigma_t, max_t); - float sample_transmittance = kernel_volume_channel_get(full_transmittance, channel); - - float sample_t = min(max_t, -logf(1.0f - xi * (1.0f - sample_transmittance)) / sample_sigma_t); - - *transmittance = volume_color_transmittance(sigma_t, sample_t); - *pdf = safe_divide_color(sigma_t * *transmittance, one_float3() - full_transmittance); - - /* todo: optimization: when taken together with hit/miss decision, - * the full_transmittance cancels out drops out and xi does not - * need to be remapped */ - - return sample_t; -} - -ccl_device float3 kernel_volume_distance_pdf(float max_t, float3 sigma_t, float sample_t) -{ - float3 full_transmittance = volume_color_transmittance(sigma_t, max_t); - float3 transmittance = volume_color_transmittance(sigma_t, sample_t); - - return safe_divide_color(sigma_t * transmittance, one_float3() - full_transmittance); -} - -/* Emission */ - -ccl_device float3 kernel_volume_emission_integrate(VolumeShaderCoefficients *coeff, - int closure_flag, - float3 transmittance, - float t) -{ - /* integral E * exp(-sigma_t * t) from 0 to t = E * (1 - exp(-sigma_t * t))/sigma_t - * this goes to E * t as sigma_t goes to zero - * - * todo: we should use an epsilon to avoid precision issues near zero sigma_t */ - float3 emission = coeff->emission; - - if (closure_flag & SD_EXTINCTION) { - float3 sigma_t = coeff->sigma_t; - - emission.x *= (sigma_t.x > 0.0f) ? (1.0f - transmittance.x) / sigma_t.x : t; - emission.y *= (sigma_t.y > 0.0f) ? (1.0f - transmittance.y) / sigma_t.y : t; - emission.z *= (sigma_t.z > 0.0f) ? 
(1.0f - transmittance.z) / sigma_t.z : t; - } - else - emission *= t; - - return emission; -} - -/* Volume Path */ - -ccl_device int kernel_volume_sample_channel(float3 albedo, - float3 throughput, - float rand, - float3 *pdf) -{ - /* Sample color channel proportional to throughput and single scattering - * albedo, to significantly reduce noise with many bounce, following: - * - * "Practical and Controllable Subsurface Scattering for Production Path - * Tracing". Matt Jen-Yuan Chiang, Peter Kutz, Brent Burley. SIGGRAPH 2016. */ - float3 weights = fabs(throughput * albedo); - float sum_weights = weights.x + weights.y + weights.z; - float3 weights_pdf; - - if (sum_weights > 0.0f) { - weights_pdf = weights / sum_weights; - } - else { - weights_pdf = make_float3(1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f); - } - - *pdf = weights_pdf; - - /* OpenCL does not support -> on float3, so don't use pdf->x. */ - if (rand < weights_pdf.x) { - return 0; - } - else if (rand < weights_pdf.x + weights_pdf.y) { - return 1; - } - else { - return 2; - } -} - -#ifdef __VOLUME__ - -/* homogeneous volume: assume shader evaluation at the start gives - * the volume shading coefficient for the entire line segment */ -ccl_device VolumeIntegrateResult -kernel_volume_integrate_homogeneous(KernelGlobals *kg, - ccl_addr_space PathState *state, - Ray *ray, - ShaderData *sd, - PathRadiance *L, - ccl_addr_space float3 *throughput, - bool probalistic_scatter) -{ - VolumeShaderCoefficients coeff ccl_optional_struct_init; - - if (!volume_shader_sample(kg, sd, state, ray->P, &coeff)) - return VOLUME_PATH_MISSED; - - int closure_flag = sd->flag; - float t = ray->t; - float3 new_tp; - -# ifdef __VOLUME_SCATTER__ - /* randomly scatter, and if we do t is shortened */ - if (closure_flag & SD_SCATTER) { - /* Sample channel, use MIS with balance heuristic. 
*/ - float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL); - float3 albedo = safe_divide_color(coeff.sigma_s, coeff.sigma_t); - float3 channel_pdf; - int channel = kernel_volume_sample_channel(albedo, *throughput, rphase, &channel_pdf); - - /* decide if we will hit or miss */ - bool scatter = true; - float xi = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE); - - if (probalistic_scatter) { - float sample_sigma_t = kernel_volume_channel_get(coeff.sigma_t, channel); - float sample_transmittance = expf(-sample_sigma_t * t); - - if (1.0f - xi >= sample_transmittance) { - scatter = true; - - /* rescale random number so we can reuse it */ - xi = 1.0f - (1.0f - xi - sample_transmittance) / (1.0f - sample_transmittance); - } - else - scatter = false; - } - - if (scatter) { - /* scattering */ - float3 pdf; - float3 transmittance; - float sample_t; - - /* distance sampling */ - sample_t = kernel_volume_distance_sample( - ray->t, coeff.sigma_t, channel, xi, &transmittance, &pdf); - - /* modify pdf for hit/miss decision */ - if (probalistic_scatter) - pdf *= one_float3() - volume_color_transmittance(coeff.sigma_t, t); - - new_tp = *throughput * coeff.sigma_s * transmittance / dot(channel_pdf, pdf); - t = sample_t; - } - else { - /* no scattering */ - float3 transmittance = volume_color_transmittance(coeff.sigma_t, t); - float pdf = dot(channel_pdf, transmittance); - new_tp = *throughput * transmittance / pdf; - } - } - else -# endif - if (closure_flag & SD_EXTINCTION) { - /* absorption only, no sampling needed */ - float3 transmittance = volume_color_transmittance(coeff.sigma_t, t); - new_tp = *throughput * transmittance; - } - else { - new_tp = *throughput; - } - - /* integrate emission attenuated by extinction */ - if (L && (closure_flag & SD_EMISSION)) { - float3 transmittance = volume_color_transmittance(coeff.sigma_t, ray->t); - float3 emission = kernel_volume_emission_integrate( - &coeff, closure_flag, transmittance, ray->t); - 
path_radiance_accum_emission(kg, L, state, *throughput, emission); - } - - /* modify throughput */ - if (closure_flag & SD_EXTINCTION) { - *throughput = new_tp; - - /* prepare to scatter to new direction */ - if (t < ray->t) { - /* adjust throughput and move to new location */ - sd->P = ray->P + t * ray->D; - - return VOLUME_PATH_SCATTERED; - } - } - - return VOLUME_PATH_ATTENUATED; -} - -/* heterogeneous volume distance sampling: integrate stepping through the - * volume until we reach the end, get absorbed entirely, or run out of - * iterations. this does probabilistically scatter or get transmitted through - * for path tracing where we don't want to branch. */ -ccl_device VolumeIntegrateResult -kernel_volume_integrate_heterogeneous_distance(KernelGlobals *kg, - ccl_addr_space PathState *state, - Ray *ray, - ShaderData *sd, - PathRadiance *L, - ccl_addr_space float3 *throughput, - const float object_step_size) -{ - float3 tp = *throughput; - - /* Prepare for stepping. - * Using a different step offset for the first step avoids banding artifacts. 
*/ - int max_steps = kernel_data.integrator.volume_max_steps; - float step_size, step_shade_offset, steps_offset; - kernel_volume_step_init( - kg, state, object_step_size, ray->t, &step_size, &step_shade_offset, &steps_offset); - - /* compute coefficients at the start */ - float t = 0.0f; - float3 accum_transmittance = one_float3(); - - /* pick random color channel, we use the Veach one-sample - * model with balance heuristic for the channels */ - float xi = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE); - float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL); - bool has_scatter = false; - - for (int i = 0; i < max_steps; i++) { - /* advance to new position */ - float new_t = min(ray->t, (i + steps_offset) * step_size); - float dt = new_t - t; - - float3 new_P = ray->P + ray->D * (t + dt * step_shade_offset); - VolumeShaderCoefficients coeff ccl_optional_struct_init; - - /* compute segment */ - if (volume_shader_sample(kg, sd, state, new_P, &coeff)) { - int closure_flag = sd->flag; - float3 new_tp; - float3 transmittance; - bool scatter = false; - - /* distance sampling */ -# ifdef __VOLUME_SCATTER__ - if ((closure_flag & SD_SCATTER) || (has_scatter && (closure_flag & SD_EXTINCTION))) { - has_scatter = true; - - /* Sample channel, use MIS with balance heuristic. 
*/ - float3 albedo = safe_divide_color(coeff.sigma_s, coeff.sigma_t); - float3 channel_pdf; - int channel = kernel_volume_sample_channel(albedo, tp, rphase, &channel_pdf); - - /* compute transmittance over full step */ - transmittance = volume_color_transmittance(coeff.sigma_t, dt); - - /* decide if we will scatter or continue */ - float sample_transmittance = kernel_volume_channel_get(transmittance, channel); - - if (1.0f - xi >= sample_transmittance) { - /* compute sampling distance */ - float sample_sigma_t = kernel_volume_channel_get(coeff.sigma_t, channel); - float new_dt = -logf(1.0f - xi) / sample_sigma_t; - new_t = t + new_dt; - - /* transmittance and pdf */ - float3 new_transmittance = volume_color_transmittance(coeff.sigma_t, new_dt); - float3 pdf = coeff.sigma_t * new_transmittance; - - /* throughput */ - new_tp = tp * coeff.sigma_s * new_transmittance / dot(channel_pdf, pdf); - scatter = true; - } - else { - /* throughput */ - float pdf = dot(channel_pdf, transmittance); - new_tp = tp * transmittance / pdf; - - /* remap xi so we can reuse it and keep thing stratified */ - xi = 1.0f - (1.0f - xi) / sample_transmittance; - } - } - else -# endif - if (closure_flag & SD_EXTINCTION) { - /* absorption only, no sampling needed */ - transmittance = volume_color_transmittance(coeff.sigma_t, dt); - new_tp = tp * transmittance; - } - else { - transmittance = zero_float3(); - new_tp = tp; - } - - /* integrate emission attenuated by absorption */ - if (L && (closure_flag & SD_EMISSION)) { - float3 emission = kernel_volume_emission_integrate( - &coeff, closure_flag, transmittance, dt); - path_radiance_accum_emission(kg, L, state, tp, emission); - } - - /* modify throughput */ - if (closure_flag & SD_EXTINCTION) { - tp = new_tp; - - /* stop if nearly all light blocked */ - if (tp.x < VOLUME_THROUGHPUT_EPSILON && tp.y < VOLUME_THROUGHPUT_EPSILON && - tp.z < VOLUME_THROUGHPUT_EPSILON) { - tp = zero_float3(); - break; - } - } - - /* prepare to scatter to new direction */ 
- if (scatter) { - /* adjust throughput and move to new location */ - sd->P = ray->P + new_t * ray->D; - *throughput = tp; - - return VOLUME_PATH_SCATTERED; - } - else { - /* accumulate transmittance */ - accum_transmittance *= transmittance; - } - } - - /* stop if at the end of the volume */ - t = new_t; - if (t == ray->t) - break; - } - - *throughput = tp; - - return VOLUME_PATH_ATTENUATED; -} - -/* get the volume attenuation and emission over line segment defined by - * ray, with the assumption that there are no surfaces blocking light - * between the endpoints. distance sampling is used to decide if we will - * scatter or not. */ -ccl_device_noinline_cpu VolumeIntegrateResult -kernel_volume_integrate(KernelGlobals *kg, - ccl_addr_space PathState *state, - ShaderData *sd, - Ray *ray, - PathRadiance *L, - ccl_addr_space float3 *throughput, - float step_size) -{ - shader_setup_from_volume(kg, sd, ray); - - if (step_size != FLT_MAX) - return kernel_volume_integrate_heterogeneous_distance( - kg, state, ray, sd, L, throughput, step_size); - else - return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, true); -} - -# ifndef __SPLIT_KERNEL__ -/* Decoupled Volume Sampling - * - * VolumeSegment is list of coefficients and transmittance stored at all steps - * through a volume. This can then later be used for decoupled sampling as in: - * "Importance Sampling Techniques for Path Tracing in Participating Media" - * - * On the GPU this is only supported (but currently not enabled) - * for homogeneous volumes (1 step), due to - * no support for malloc/free and too much stack usage with a fix size array. 
*/ - -typedef struct VolumeStep { - float3 sigma_s; /* scatter coefficient */ - float3 sigma_t; /* extinction coefficient */ - float3 accum_transmittance; /* accumulated transmittance including this step */ - float3 cdf_distance; /* cumulative density function for distance sampling */ - float t; /* distance at end of this step */ - float shade_t; /* jittered distance where shading was done in step */ - int closure_flag; /* shader evaluation closure flags */ -} VolumeStep; - -typedef struct VolumeSegment { - VolumeStep stack_step; /* stack storage for homogeneous step, to avoid malloc */ - VolumeStep *steps; /* recorded steps */ - int numsteps; /* number of steps */ - int closure_flag; /* accumulated closure flags from all steps */ - - float3 accum_emission; /* accumulated emission at end of segment */ - float3 accum_transmittance; /* accumulated transmittance at end of segment */ - float3 accum_albedo; /* accumulated average albedo over segment */ - - int sampling_method; /* volume sampling method */ -} VolumeSegment; - -/* record volume steps to the end of the volume. - * - * it would be nice if we could only record up to the point that we need to scatter, - * but the entire segment is needed to do always scattering, rather than probabilistically - * hitting or missing the volume. 
if we don't know the transmittance at the end of the - * volume we can't generate stratified distance samples up to that transmittance */ -# ifdef __VOLUME_DECOUPLED__ -ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, - PathState *state, - Ray *ray, - ShaderData *sd, - VolumeSegment *segment, - const float object_step_size) -{ - /* prepare for volume stepping */ - int max_steps; - float step_size, step_shade_offset, steps_offset; - - if (object_step_size != FLT_MAX) { - max_steps = kernel_data.integrator.volume_max_steps; - kernel_volume_step_init( - kg, state, object_step_size, ray->t, &step_size, &step_shade_offset, &steps_offset); - -# ifdef __KERNEL_CPU__ - /* NOTE: For the branched path tracing it's possible to have direct - * and indirect light integration both having volume segments allocated. - * We detect this using index in the pre-allocated memory. Currently we - * only support two segments allocated at a time, if more needed some - * modifications to the KernelGlobals will be needed. - * - * This gives us restrictions that decoupled record should only happen - * in the stack manner, meaning if there's subsequent call of decoupled - * record it'll need to free memory before its caller frees memory. 
- */ - const int index = kg->decoupled_volume_steps_index; - assert(index < sizeof(kg->decoupled_volume_steps) / sizeof(*kg->decoupled_volume_steps)); - if (kg->decoupled_volume_steps[index] == NULL) { - kg->decoupled_volume_steps[index] = (VolumeStep *)malloc(sizeof(VolumeStep) * max_steps); - } - segment->steps = kg->decoupled_volume_steps[index]; - ++kg->decoupled_volume_steps_index; -# else - segment->steps = (VolumeStep *)malloc(sizeof(VolumeStep) * max_steps); -# endif - } - else { - max_steps = 1; - step_size = ray->t; - step_shade_offset = 0.0f; - steps_offset = 1.0f; - segment->steps = &segment->stack_step; - } - - /* init accumulation variables */ - float3 accum_emission = zero_float3(); - float3 accum_transmittance = one_float3(); - float3 accum_albedo = zero_float3(); - float3 cdf_distance = zero_float3(); - float t = 0.0f; - - segment->numsteps = 0; - segment->closure_flag = 0; - bool is_last_step_empty = false; - - VolumeStep *step = segment->steps; - - for (int i = 0; i < max_steps; i++, step++) { - /* advance to new position */ - float new_t = min(ray->t, (i + steps_offset) * step_size); - float dt = new_t - t; - - float3 new_P = ray->P + ray->D * (t + dt * step_shade_offset); - VolumeShaderCoefficients coeff ccl_optional_struct_init; - - /* compute segment */ - if (volume_shader_sample(kg, sd, state, new_P, &coeff)) { - int closure_flag = sd->flag; - float3 sigma_t = coeff.sigma_t; - - /* compute average albedo for channel sampling */ - if (closure_flag & SD_SCATTER) { - accum_albedo += (dt / ray->t) * safe_divide_color(coeff.sigma_s, sigma_t); - } - - /* compute accumulated transmittance */ - float3 transmittance = volume_color_transmittance(sigma_t, dt); - - /* compute emission attenuated by absorption */ - if (closure_flag & SD_EMISSION) { - float3 emission = kernel_volume_emission_integrate( - &coeff, closure_flag, transmittance, dt); - accum_emission += accum_transmittance * emission; - } - - accum_transmittance *= transmittance; - - /* 
compute pdf for distance sampling */ - float3 pdf_distance = dt * accum_transmittance * coeff.sigma_s; - cdf_distance = cdf_distance + pdf_distance; - - /* write step data */ - step->sigma_t = sigma_t; - step->sigma_s = coeff.sigma_s; - step->closure_flag = closure_flag; - - segment->closure_flag |= closure_flag; - - is_last_step_empty = false; - segment->numsteps++; - } - else { - if (is_last_step_empty) { - /* consecutive empty step, merge */ - step--; - } - else { - /* store empty step */ - step->sigma_t = zero_float3(); - step->sigma_s = zero_float3(); - step->closure_flag = 0; - - segment->numsteps++; - is_last_step_empty = true; - } - } - - step->accum_transmittance = accum_transmittance; - step->cdf_distance = cdf_distance; - step->t = new_t; - step->shade_t = t + dt * step_shade_offset; - - /* stop if at the end of the volume */ - t = new_t; - if (t == ray->t) - break; - - /* stop if nearly all light blocked */ - if (accum_transmittance.x < VOLUME_THROUGHPUT_EPSILON && - accum_transmittance.y < VOLUME_THROUGHPUT_EPSILON && - accum_transmittance.z < VOLUME_THROUGHPUT_EPSILON) - break; - } - - /* store total emission and transmittance */ - segment->accum_emission = accum_emission; - segment->accum_transmittance = accum_transmittance; - segment->accum_albedo = accum_albedo; - - /* normalize cumulative density function for distance sampling */ - VolumeStep *last_step = segment->steps + segment->numsteps - 1; - - if (!is_zero(last_step->cdf_distance)) { - VolumeStep *step = &segment->steps[0]; - int numsteps = segment->numsteps; - float3 inv_cdf_distance_sum = safe_invert_color(last_step->cdf_distance); - - for (int i = 0; i < numsteps; i++, step++) - step->cdf_distance *= inv_cdf_distance_sum; - } -} - -ccl_device void kernel_volume_decoupled_free(KernelGlobals *kg, VolumeSegment *segment) -{ - if (segment->steps != &segment->stack_step) { -# ifdef __KERNEL_CPU__ - /* NOTE: We only allow free last allocated segment. 
- * No random order of alloc/free is supported. - */ - assert(kg->decoupled_volume_steps_index > 0); - assert(segment->steps == kg->decoupled_volume_steps[kg->decoupled_volume_steps_index - 1]); - --kg->decoupled_volume_steps_index; -# else - free(segment->steps); -# endif - } -} -# endif /* __VOLUME_DECOUPLED__ */ - -/* scattering for homogeneous and heterogeneous volumes, using decoupled ray - * marching. - * - * function is expected to return VOLUME_PATH_SCATTERED when probalistic_scatter is false */ -ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(KernelGlobals *kg, - PathState *state, - Ray *ray, - ShaderData *sd, - float3 *throughput, - float rphase, - float rscatter, - const VolumeSegment *segment, - const float3 *light_P, - bool probalistic_scatter) -{ - kernel_assert(segment->closure_flag & SD_SCATTER); - - /* Sample color channel, use MIS with balance heuristic. */ - float3 channel_pdf; - int channel = kernel_volume_sample_channel( - segment->accum_albedo, *throughput, rphase, &channel_pdf); - - float xi = rscatter; - - /* probabilistic scattering decision based on transmittance */ - if (probalistic_scatter) { - float sample_transmittance = kernel_volume_channel_get(segment->accum_transmittance, channel); - - if (1.0f - xi >= sample_transmittance) { - /* rescale random number so we can reuse it */ - xi = 1.0f - (1.0f - xi - sample_transmittance) / (1.0f - sample_transmittance); - } - else { - *throughput /= sample_transmittance; - return VOLUME_PATH_MISSED; - } - } - - VolumeStep *step; - float3 transmittance; - float pdf, sample_t; - float mis_weight = 1.0f; - bool distance_sample = true; - bool use_mis = false; - - if (segment->sampling_method && light_P) { - if (segment->sampling_method == SD_VOLUME_MIS) { - /* multiple importance sample: randomly pick between - * equiangular and distance sampling strategy */ - if (xi < 0.5f) { - xi *= 2.0f; - } - else { - xi = (xi - 0.5f) * 2.0f; - distance_sample = false; - } - - use_mis = true; - } 
- else { - /* only equiangular sampling */ - distance_sample = false; - } - } - - /* distance sampling */ - if (distance_sample) { - /* find step in cdf */ - step = segment->steps; - - float prev_t = 0.0f; - float3 step_pdf_distance = one_float3(); - - if (segment->numsteps > 1) { - float prev_cdf = 0.0f; - float step_cdf = 1.0f; - float3 prev_cdf_distance = zero_float3(); - - for (int i = 0;; i++, step++) { - /* todo: optimize using binary search */ - step_cdf = kernel_volume_channel_get(step->cdf_distance, channel); - - if (xi < step_cdf || i == segment->numsteps - 1) - break; - - prev_cdf = step_cdf; - prev_t = step->t; - prev_cdf_distance = step->cdf_distance; - } - - /* remap xi so we can reuse it */ - xi = (xi - prev_cdf) / (step_cdf - prev_cdf); - - /* pdf for picking step */ - step_pdf_distance = step->cdf_distance - prev_cdf_distance; - } - - /* determine range in which we will sample */ - float step_t = step->t - prev_t; - - /* sample distance and compute transmittance */ - float3 distance_pdf; - sample_t = prev_t + kernel_volume_distance_sample( - step_t, step->sigma_t, channel, xi, &transmittance, &distance_pdf); - - /* modify pdf for hit/miss decision */ - if (probalistic_scatter) - distance_pdf *= one_float3() - segment->accum_transmittance; - - pdf = dot(channel_pdf, distance_pdf * step_pdf_distance); - - /* multiple importance sampling */ - if (use_mis) { - float equi_pdf = kernel_volume_equiangular_pdf(ray, *light_P, sample_t); - mis_weight = 2.0f * power_heuristic(pdf, equi_pdf); - } - } - /* equi-angular sampling */ - else { - /* sample distance */ - sample_t = kernel_volume_equiangular_sample(ray, *light_P, xi, &pdf); - - /* find step in which sampled distance is located */ - step = segment->steps; - - float prev_t = 0.0f; - float3 step_pdf_distance = one_float3(); - - if (segment->numsteps > 1) { - float3 prev_cdf_distance = zero_float3(); - - int numsteps = segment->numsteps; - int high = numsteps - 1; - int low = 0; - int mid; - - while (low 
< high) { - mid = (low + high) >> 1; - - if (sample_t < step[mid].t) - high = mid; - else if (sample_t >= step[mid + 1].t) - low = mid + 1; - else { - /* found our interval in step[mid] .. step[mid+1] */ - prev_t = step[mid].t; - prev_cdf_distance = step[mid].cdf_distance; - step += mid + 1; - break; - } - } - - if (low >= numsteps - 1) { - prev_t = step[numsteps - 1].t; - prev_cdf_distance = step[numsteps - 1].cdf_distance; - step += numsteps - 1; - } - - /* pdf for picking step with distance sampling */ - step_pdf_distance = step->cdf_distance - prev_cdf_distance; - } - - /* determine range in which we will sample */ - float step_t = step->t - prev_t; - float step_sample_t = sample_t - prev_t; - - /* compute transmittance */ - transmittance = volume_color_transmittance(step->sigma_t, step_sample_t); - - /* multiple importance sampling */ - if (use_mis) { - float3 distance_pdf3 = kernel_volume_distance_pdf(step_t, step->sigma_t, step_sample_t); - float distance_pdf = dot(channel_pdf, distance_pdf3 * step_pdf_distance); - mis_weight = 2.0f * power_heuristic(pdf, distance_pdf); - } - } - if (sample_t < 0.0f || pdf == 0.0f) { - return VOLUME_PATH_MISSED; - } - - /* compute transmittance up to this step */ - if (step != segment->steps) - transmittance *= (step - 1)->accum_transmittance; - - /* modify throughput */ - *throughput *= step->sigma_s * transmittance * (mis_weight / pdf); - - /* evaluate shader to create closures at shading point */ - if (segment->numsteps > 1) { - sd->P = ray->P + step->shade_t * ray->D; - - VolumeShaderCoefficients coeff; - volume_shader_sample(kg, sd, state, sd->P, &coeff); - } - - /* move to new position */ - sd->P = ray->P + sample_t * ray->D; - - return VOLUME_PATH_SCATTERED; -} -# endif /* __SPLIT_KERNEL */ - -/* decide if we need to use decoupled or not */ -ccl_device bool kernel_volume_use_decoupled(KernelGlobals *kg, - bool heterogeneous, - bool direct, - int sampling_method) -{ - /* decoupled ray marching for heterogeneous volumes 
not supported on the GPU, - * which also means equiangular and multiple importance sampling is not - * support for that case */ - if (!kernel_data.integrator.volume_decoupled) - return false; - -# ifdef __KERNEL_GPU__ - if (heterogeneous) - return false; -# endif - - /* equiangular and multiple importance sampling only implemented for decoupled */ - if (sampling_method != 0) - return true; - - /* for all light sampling use decoupled, reusing shader evaluations is - * typically faster in that case */ - if (direct) - return kernel_data.integrator.sample_all_lights_direct; - else - return kernel_data.integrator.sample_all_lights_indirect; -} - -/* Volume Stack - * - * This is an array of object/shared ID's that the current segment of the path - * is inside of. */ - -ccl_device void kernel_volume_stack_init(KernelGlobals *kg, - ShaderData *stack_sd, - ccl_addr_space const PathState *state, - ccl_addr_space const Ray *ray, - ccl_addr_space VolumeStack *stack) -{ - /* NULL ray happens in the baker, does it need proper initialization of - * camera in volume? - */ - if (!kernel_data.cam.is_inside_volume || ray == NULL) { - /* Camera is guaranteed to be in the air, only take background volume - * into account in this case. 
- */ - if (kernel_data.background.volume_shader != SHADER_NONE) { - stack[0].shader = kernel_data.background.volume_shader; - stack[0].object = PRIM_NONE; - stack[1].shader = SHADER_NONE; - } - else { - stack[0].shader = SHADER_NONE; - } - return; - } - - kernel_assert(state->flag & PATH_RAY_CAMERA); - - Ray volume_ray = *ray; - volume_ray.t = FLT_MAX; - - const uint visibility = (state->flag & PATH_RAY_ALL_VISIBILITY); - int stack_index = 0, enclosed_index = 0; - -# ifdef __VOLUME_RECORD_ALL__ - Intersection hits[2 * VOLUME_STACK_SIZE + 1]; - uint num_hits = scene_intersect_volume_all( - kg, &volume_ray, hits, 2 * VOLUME_STACK_SIZE, visibility); - if (num_hits > 0) { - int enclosed_volumes[VOLUME_STACK_SIZE]; - Intersection *isect = hits; - - qsort(hits, num_hits, sizeof(Intersection), intersections_compare); - - for (uint hit = 0; hit < num_hits; ++hit, ++isect) { - shader_setup_from_ray(kg, stack_sd, isect, &volume_ray); - if (stack_sd->flag & SD_BACKFACING) { - bool need_add = true; - for (int i = 0; i < enclosed_index && need_add; ++i) { - /* If ray exited the volume and never entered to that volume - * it means that camera is inside such a volume. - */ - if (enclosed_volumes[i] == stack_sd->object) { - need_add = false; - } - } - for (int i = 0; i < stack_index && need_add; ++i) { - /* Don't add intersections twice. */ - if (stack[i].object == stack_sd->object) { - need_add = false; - break; - } - } - if (need_add && stack_index < VOLUME_STACK_SIZE - 1) { - stack[stack_index].object = stack_sd->object; - stack[stack_index].shader = stack_sd->shader; - ++stack_index; - } - } - else { - /* If ray from camera enters the volume, this volume shouldn't - * be added to the stack on exit. 
- */ - enclosed_volumes[enclosed_index++] = stack_sd->object; - } - } - } -# else - int enclosed_volumes[VOLUME_STACK_SIZE]; - int step = 0; - - while (stack_index < VOLUME_STACK_SIZE - 1 && enclosed_index < VOLUME_STACK_SIZE - 1 && - step < 2 * VOLUME_STACK_SIZE) { - Intersection isect; - if (!scene_intersect_volume(kg, &volume_ray, &isect, visibility)) { - break; - } - - shader_setup_from_ray(kg, stack_sd, &isect, &volume_ray); - if (stack_sd->flag & SD_BACKFACING) { - /* If ray exited the volume and never entered to that volume - * it means that camera is inside such a volume. - */ - bool need_add = true; - for (int i = 0; i < enclosed_index && need_add; ++i) { - /* If ray exited the volume and never entered to that volume - * it means that camera is inside such a volume. - */ - if (enclosed_volumes[i] == stack_sd->object) { - need_add = false; - } - } - for (int i = 0; i < stack_index && need_add; ++i) { - /* Don't add intersections twice. */ - if (stack[i].object == stack_sd->object) { - need_add = false; - break; - } - } - if (need_add) { - stack[stack_index].object = stack_sd->object; - stack[stack_index].shader = stack_sd->shader; - ++stack_index; - } - } - else { - /* If ray from camera enters the volume, this volume shouldn't - * be added to the stack on exit. - */ - enclosed_volumes[enclosed_index++] = stack_sd->object; - } - - /* Move ray forward. */ - volume_ray.P = ray_offset(stack_sd->P, -stack_sd->Ng); - ++step; - } -# endif - /* stack_index of 0 means quick checks outside of the kernel gave false - * positive, nothing to worry about, just we've wasted quite a few of - * ticks just to come into conclusion that camera is in the air. - * - * In this case we're doing the same above -- check whether background has - * volume. 
- */ - if (stack_index == 0 && kernel_data.background.volume_shader == SHADER_NONE) { - stack[0].shader = kernel_data.background.volume_shader; - stack[0].object = OBJECT_NONE; - stack[1].shader = SHADER_NONE; - } - else { - stack[stack_index].shader = SHADER_NONE; - } -} - -ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, - ShaderData *sd, - ccl_addr_space VolumeStack *stack) -{ - /* todo: we should have some way for objects to indicate if they want the - * world shader to work inside them. excluding it by default is problematic - * because non-volume objects can't be assumed to be closed manifolds */ - - if (!(sd->flag & SD_HAS_VOLUME)) - return; - - if (sd->flag & SD_BACKFACING) { - /* exit volume object: remove from stack */ - for (int i = 0; stack[i].shader != SHADER_NONE; i++) { - if (stack[i].object == sd->object) { - /* shift back next stack entries */ - do { - stack[i] = stack[i + 1]; - i++; - } while (stack[i].shader != SHADER_NONE); - - return; - } - } - } - else { - /* enter volume object: add to stack */ - int i; - - for (i = 0; stack[i].shader != SHADER_NONE; i++) { - /* already in the stack? 
then we have nothing to do */ - if (stack[i].object == sd->object) - return; - } - - /* if we exceed the stack limit, ignore */ - if (i >= VOLUME_STACK_SIZE - 1) - return; - - /* add to the end of the stack */ - stack[i].shader = sd->shader; - stack[i].object = sd->object; - stack[i + 1].shader = SHADER_NONE; - } -} - -# ifdef __SUBSURFACE__ -ccl_device void kernel_volume_stack_update_for_subsurface(KernelGlobals *kg, - ShaderData *stack_sd, - Ray *ray, - ccl_addr_space VolumeStack *stack) -{ - kernel_assert(kernel_data.integrator.use_volumes); - - Ray volume_ray = *ray; - -# ifdef __VOLUME_RECORD_ALL__ - Intersection hits[2 * VOLUME_STACK_SIZE + 1]; - uint num_hits = scene_intersect_volume_all( - kg, &volume_ray, hits, 2 * VOLUME_STACK_SIZE, PATH_RAY_ALL_VISIBILITY); - if (num_hits > 0) { - Intersection *isect = hits; - - qsort(hits, num_hits, sizeof(Intersection), intersections_compare); - - for (uint hit = 0; hit < num_hits; ++hit, ++isect) { - shader_setup_from_ray(kg, stack_sd, isect, &volume_ray); - kernel_volume_stack_enter_exit(kg, stack_sd, stack); - } - } -# else - Intersection isect; - int step = 0; - float3 Pend = ray->P + ray->D * ray->t; - while (step < 2 * VOLUME_STACK_SIZE && - scene_intersect_volume(kg, &volume_ray, &isect, PATH_RAY_ALL_VISIBILITY)) { - shader_setup_from_ray(kg, stack_sd, &isect, &volume_ray); - kernel_volume_stack_enter_exit(kg, stack_sd, stack); - - /* Move ray forward. */ - volume_ray.P = ray_offset(stack_sd->P, -stack_sd->Ng); - if (volume_ray.t != FLT_MAX) { - volume_ray.D = normalize_len(Pend - volume_ray.P, &volume_ray.t); - } - ++step; - } -# endif -} -# endif - -/* Clean stack after the last bounce. - * - * It is expected that all volumes are closed manifolds, so at the time when ray - * hits nothing (for example, it is a last bounce which goes to environment) the - * only expected volume in the stack is the world's one. All the rest volume - * entries should have been exited already. 
- * - * This isn't always true because of ray intersection precision issues, which - * could lead us to an infinite non-world volume in the stack, causing render - * artifacts. - * - * Use this function after the last bounce to get rid of all volumes apart from - * the world's one after the last bounce to avoid render artifacts. - */ -ccl_device_inline void kernel_volume_clean_stack(KernelGlobals *kg, - ccl_addr_space VolumeStack *volume_stack) -{ - if (kernel_data.background.volume_shader != SHADER_NONE) { - /* Keep the world's volume in stack. */ - volume_stack[1].shader = SHADER_NONE; - } - else { - volume_stack[0].shader = SHADER_NONE; - } -} - -#endif /* __VOLUME__ */ - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h index d1602744f1d..fab0915c38e 100644 --- a/intern/cycles/kernel/kernel_work_stealing.h +++ b/intern/cycles/kernel/kernel_work_stealing.h @@ -14,8 +14,7 @@ * limitations under the License. */ -#ifndef __KERNEL_WORK_STEALING_H__ -#define __KERNEL_WORK_STEALING_H__ +#pragma once CCL_NAMESPACE_BEGIN @@ -24,21 +23,24 @@ CCL_NAMESPACE_BEGIN */ /* Map global work index to tile, pixel X/Y and sample. */ -ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile, +ccl_device_inline void get_work_pixel(ccl_global const KernelWorkTile *tile, uint global_work_index, ccl_private uint *x, ccl_private uint *y, ccl_private uint *sample) { -#ifdef __KERNEL_CUDA__ - /* Keeping threads for the same pixel together improves performance on CUDA. */ - uint sample_offset = global_work_index % tile->num_samples; - uint pixel_offset = global_work_index / tile->num_samples; -#else /* __KERNEL_CUDA__ */ +#if 0 + /* Keep threads for the same sample together. 
*/ uint tile_pixels = tile->w * tile->h; uint sample_offset = global_work_index / tile_pixels; uint pixel_offset = global_work_index - sample_offset * tile_pixels; -#endif /* __KERNEL_CUDA__ */ +#else + /* Keeping threads for the same pixel together. + * Appears to improve performance by a few % on CUDA and OptiX. */ + uint sample_offset = global_work_index % tile->num_samples; + uint pixel_offset = global_work_index / tile->num_samples; +#endif + uint y_offset = pixel_offset / tile->w; uint x_offset = pixel_offset - y_offset * tile->w; @@ -47,71 +49,4 @@ ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile, *sample = tile->start_sample + sample_offset; } -#ifdef __KERNEL_OPENCL__ -# pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable -#endif - -#ifdef __SPLIT_KERNEL__ -/* Returns true if there is work */ -ccl_device bool get_next_work_item(KernelGlobals *kg, - ccl_global uint *work_pools, - uint total_work_size, - uint ray_index, - ccl_private uint *global_work_index) -{ - /* With a small amount of work there may be more threads than work due to - * rounding up of global size, stop such threads immediately. */ - if (ray_index >= total_work_size) { - return false; - } - - /* Increase atomic work index counter in pool. */ - uint pool = ray_index / WORK_POOL_SIZE; - uint work_index = atomic_fetch_and_inc_uint32(&work_pools[pool]); - - /* Map per-pool work index to a global work index. */ - uint global_size = ccl_global_size(0) * ccl_global_size(1); - kernel_assert(global_size % WORK_POOL_SIZE == 0); - kernel_assert(ray_index < global_size); - - *global_work_index = (work_index / WORK_POOL_SIZE) * global_size + (pool * WORK_POOL_SIZE) + - (work_index % WORK_POOL_SIZE); - - /* Test if all work for this pool is done. 
*/ - return (*global_work_index < total_work_size); -} - -ccl_device bool get_next_work(KernelGlobals *kg, - ccl_global uint *work_pools, - uint total_work_size, - uint ray_index, - ccl_private uint *global_work_index) -{ - bool got_work = false; - if (kernel_data.film.pass_adaptive_aux_buffer) { - do { - got_work = get_next_work_item(kg, work_pools, total_work_size, ray_index, global_work_index); - if (got_work) { - ccl_global WorkTile *tile = &kernel_split_params.tile; - uint x, y, sample; - get_work_pixel(tile, *global_work_index, &x, &y, &sample); - uint buffer_offset = (tile->offset + x + y * tile->stride) * kernel_data.film.pass_stride; - ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset; - ccl_global float4 *aux = (ccl_global float4 *)(buffer + - kernel_data.film.pass_adaptive_aux_buffer); - if ((*aux).w == 0.0f) { - break; - } - } - } while (got_work); - } - else { - got_work = get_next_work_item(kg, work_pools, total_work_size, ray_index, global_work_index); - } - return got_work; -} -#endif - CCL_NAMESPACE_END - -#endif /* __KERNEL_WORK_STEALING_H__ */ diff --git a/intern/cycles/kernel/kernel_write_passes.h b/intern/cycles/kernel/kernel_write_passes.h index 410218d91d4..9d379495629 100644 --- a/intern/cycles/kernel/kernel_write_passes.h +++ b/intern/cycles/kernel/kernel_write_passes.h @@ -14,23 +14,25 @@ * limitations under the License. 
*/ -#if defined(__SPLIT_KERNEL__) || defined(__KERNEL_CUDA__) +#pragma once + +#ifdef __KERNEL_GPU__ # define __ATOMIC_PASS_WRITE__ #endif CCL_NAMESPACE_BEGIN -ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, float value) +ccl_device_inline void kernel_write_pass_float(ccl_global float *ccl_restrict buffer, float value) { - ccl_global float *buf = buffer; #ifdef __ATOMIC_PASS_WRITE__ - atomic_add_and_fetch_float(buf, value); + atomic_add_and_fetch_float(buffer, value); #else - *buf += value; + *buffer += value; #endif } -ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, float3 value) +ccl_device_inline void kernel_write_pass_float3(ccl_global float *ccl_restrict buffer, + float3 value) { #ifdef __ATOMIC_PASS_WRITE__ ccl_global float *buf_x = buffer + 0; @@ -41,12 +43,14 @@ ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, float3 atomic_add_and_fetch_float(buf_y, value.y); atomic_add_and_fetch_float(buf_z, value.z); #else - ccl_global float3 *buf = (ccl_global float3 *)buffer; - *buf += value; + buffer[0] += value.x; + buffer[1] += value.y; + buffer[2] += value.z; #endif } -ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, float4 value) +ccl_device_inline void kernel_write_pass_float4(ccl_global float *ccl_restrict buffer, + float4 value) { #ifdef __ATOMIC_PASS_WRITE__ ccl_global float *buf_x = buffer + 0; @@ -59,37 +63,26 @@ ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, float4 atomic_add_and_fetch_float(buf_z, value.z); atomic_add_and_fetch_float(buf_w, value.w); #else - ccl_global float4 *buf = (ccl_global float4 *)buffer; - *buf += value; + buffer[0] += value.x; + buffer[1] += value.y; + buffer[2] += value.z; + buffer[3] += value.w; #endif } -#ifdef __DENOISING_FEATURES__ -ccl_device_inline void kernel_write_pass_float_variance(ccl_global float *buffer, float value) +ccl_device_inline float kernel_read_pass_float(ccl_global float 
*ccl_restrict buffer) { - kernel_write_pass_float(buffer, value); - - /* The online one-pass variance update that's used for the megakernel can't easily be implemented - * with atomics, so for the split kernel the E[x^2] - 1/N * (E[x])^2 fallback is used. */ - kernel_write_pass_float(buffer + 1, value * value); + return *buffer; } -# ifdef __ATOMIC_PASS_WRITE__ -# define kernel_write_pass_float3_unaligned kernel_write_pass_float3 -# else -ccl_device_inline void kernel_write_pass_float3_unaligned(ccl_global float *buffer, float3 value) +ccl_device_inline float3 kernel_read_pass_float3(ccl_global float *ccl_restrict buffer) { - buffer[0] += value.x; - buffer[1] += value.y; - buffer[2] += value.z; + return make_float3(buffer[0], buffer[1], buffer[2]); } -# endif -ccl_device_inline void kernel_write_pass_float3_variance(ccl_global float *buffer, float3 value) +ccl_device_inline float4 kernel_read_pass_float4(ccl_global float *ccl_restrict buffer) { - kernel_write_pass_float3_unaligned(buffer, value); - kernel_write_pass_float3_unaligned(buffer + 3, value * value); + return make_float4(buffer[0], buffer[1], buffer[2], buffer[3]); } -#endif /* __DENOISING_FEATURES__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/filter.cpp b/intern/cycles/kernel/kernels/cpu/filter.cpp deleted file mode 100644 index 145a6b6ac40..00000000000 --- a/intern/cycles/kernel/kernels/cpu/filter.cpp +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* CPU kernel entry points */ - -/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this - * one with SSE2 intrinsics. - */ -#if defined(__x86_64__) || defined(_M_X64) -# define __KERNEL_SSE2__ -#endif - -/* When building kernel for native machine detect kernel features from the flags - * set by compiler. - */ -#ifdef WITH_KERNEL_NATIVE -# ifdef __SSE2__ -# ifndef __KERNEL_SSE2__ -# define __KERNEL_SSE2__ -# endif -# endif -# ifdef __SSE3__ -# define __KERNEL_SSE3__ -# endif -# ifdef __SSSE3__ -# define __KERNEL_SSSE3__ -# endif -# ifdef __SSE4_1__ -# define __KERNEL_SSE41__ -# endif -# ifdef __AVX__ -# define __KERNEL_SSE__ -# define __KERNEL_AVX__ -# endif -# ifdef __AVX2__ -# define __KERNEL_SSE__ -# define __KERNEL_AVX2__ -# endif -#endif - -/* quiet unused define warnings */ -#if defined(__KERNEL_SSE2__) -/* do nothing */ -#endif - -#include "kernel/filter/filter.h" -#define KERNEL_ARCH cpu -#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp deleted file mode 100644 index 012daba62d8..00000000000 --- a/intern/cycles/kernel/kernels/cpu/filter_avx.cpp +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* Optimized CPU kernel entry points. This file is compiled with AVX - * optimization flags and nearly all functions inlined, while kernel.cpp - * is compiled without for other CPU's. */ - -#include "util/util_optimization.h" - -#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX -# define KERNEL_STUB -#else -/* SSE optimization disabled for now on 32 bit, see bug T36316. */ -# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE__ -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -# define __KERNEL_AVX__ -# endif -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ - -#include "kernel/filter/filter.h" -#define KERNEL_ARCH cpu_avx -#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp deleted file mode 100644 index 16351a7f949..00000000000 --- a/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* Optimized CPU kernel entry points. This file is compiled with AVX2 - * optimization flags and nearly all functions inlined, while kernel.cpp - * is compiled without for other CPU's. 
*/ - -#include "util/util_optimization.h" - -#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 -# define KERNEL_STUB -#else -/* SSE optimization disabled for now on 32 bit, see bug T36316. */ -# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE__ -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -# define __KERNEL_AVX__ -# define __KERNEL_AVX2__ -# endif -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ - -#include "kernel/filter/filter.h" -#define KERNEL_ARCH cpu_avx2 -#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h deleted file mode 100644 index 1423b182ab8..00000000000 --- a/intern/cycles/kernel/kernels/cpu/filter_cpu.h +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* Templated common declaration part of all CPU kernels. 
*/ - -void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, - TileInfo *tile_info, - int x, - int y, - float *unfilteredA, - float *unfilteredB, - float *sampleV, - float *sampleVV, - float *bufferV, - int *prefilter_rect, - int buffer_pass_stride, - int buffer_denoising_offset); - -void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, - TileInfo *tile_info, - int m_offset, - int v_offset, - int x, - int y, - float *mean, - float *variance, - float scale, - int *prefilter_rect, - int buffer_pass_stride, - int buffer_denoising_offset); - -void KERNEL_FUNCTION_FULL_NAME(filter_write_feature)(int sample, - int x, - int y, - int *buffer_params, - float *from, - float *buffer, - int out_offset, - int *prefilter_rect); - -void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, - int y, - ccl_global float *image, - ccl_global float *variance, - ccl_global float *depth, - ccl_global float *output, - int *rect, - int pass_stride); - -void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)( - int x, int y, float *mean, float *variance, float *a, float *b, int *prefilter_rect, int r); - -void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float *buffer, - TileInfo *tiles, - int x, - int y, - int storage_ofs, - float *transform, - int *rank, - int *rect, - int pass_stride, - int frame_stride, - bool use_time, - int radius, - float pca_threshold); - -void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, - int dy, - float *weight_image, - float *variance_image, - float *scale_image, - float *difference_image, - int *rect, - int stride, - int channel_offset, - int frame_offset, - float a, - float k_2); - -void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)( - float *difference_image, float *out_image, int *rect, int stride, int f); - -void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)( - float *difference_image, float *out_image, int *rect, int stride, int f); - -void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, - int dy, 
- float *difference_image, - float *image, - float *temp_image, - float *out_image, - float *accum_image, - int *rect, - int channel_offset, - int stride, - int f); - -void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, - int dy, - int t, - float *difference_image, - float *buffer, - float *transform, - int *rank, - float *XtWX, - float3 *XtWY, - int *rect, - int *filter_window, - int stride, - int f, - int pass_stride, - int frame_offset, - bool use_time); - -void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image, - float *accum_image, - int *rect, - int stride); - -void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x, - int y, - int storage_ofs, - float *buffer, - int *rank, - float *XtWX, - float3 *XtWY, - int *buffer_params, - int sample); - -#undef KERNEL_ARCH diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h deleted file mode 100644 index 3d4cb87e104..00000000000 --- a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h +++ /dev/null @@ -1,331 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* Templated common implementation part of all CPU kernels. - * - * The idea is that particular .cpp files sets needed optimization flags and - * simply includes this file without worry of copying actual implementation over. 
- */ - -#include "kernel/kernel_compat_cpu.h" - -#include "kernel/filter/filter_kernel.h" - -#ifdef KERNEL_STUB -# define STUB_ASSERT(arch, name) \ - assert(!(#name " kernel stub for architecture " #arch " was called!")) -#endif - -CCL_NAMESPACE_BEGIN - -/* Denoise filter */ - -void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, - TileInfo *tile_info, - int x, - int y, - float *unfilteredA, - float *unfilteredB, - float *sampleVariance, - float *sampleVarianceV, - float *bufferVariance, - int *prefilter_rect, - int buffer_pass_stride, - int buffer_denoising_offset) -{ -#ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, filter_divide_shadow); -#else - kernel_filter_divide_shadow(sample, - tile_info, - x, - y, - unfilteredA, - unfilteredB, - sampleVariance, - sampleVarianceV, - bufferVariance, - load_int4(prefilter_rect), - buffer_pass_stride, - buffer_denoising_offset); -#endif -} - -void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, - TileInfo *tile_info, - int m_offset, - int v_offset, - int x, - int y, - float *mean, - float *variance, - float scale, - int *prefilter_rect, - int buffer_pass_stride, - int buffer_denoising_offset) -{ -#ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, filter_get_feature); -#else - kernel_filter_get_feature(sample, - tile_info, - m_offset, - v_offset, - x, - y, - mean, - variance, - scale, - load_int4(prefilter_rect), - buffer_pass_stride, - buffer_denoising_offset); -#endif -} - -void KERNEL_FUNCTION_FULL_NAME(filter_write_feature)(int sample, - int x, - int y, - int *buffer_params, - float *from, - float *buffer, - int out_offset, - int *prefilter_rect) -{ -#ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, filter_write_feature); -#else - kernel_filter_write_feature( - sample, x, y, load_int4(buffer_params), from, buffer, out_offset, load_int4(prefilter_rect)); -#endif -} - -void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, - int y, - ccl_global float *image, - ccl_global float *variance, - ccl_global float 
*depth, - ccl_global float *output, - int *rect, - int pass_stride) -{ -#ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, filter_detect_outliers); -#else - kernel_filter_detect_outliers( - x, y, image, variance, depth, output, load_int4(rect), pass_stride); -#endif -} - -void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)( - int x, int y, float *mean, float *variance, float *a, float *b, int *prefilter_rect, int r) -{ -#ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, filter_combine_halves); -#else - kernel_filter_combine_halves(x, y, mean, variance, a, b, load_int4(prefilter_rect), r); -#endif -} - -void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float *buffer, - TileInfo *tile_info, - int x, - int y, - int storage_ofs, - float *transform, - int *rank, - int *prefilter_rect, - int pass_stride, - int frame_stride, - bool use_time, - int radius, - float pca_threshold) -{ -#ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, filter_construct_transform); -#else - rank += storage_ofs; - transform += storage_ofs * TRANSFORM_SIZE; - kernel_filter_construct_transform(buffer, - tile_info, - x, - y, - load_int4(prefilter_rect), - pass_stride, - frame_stride, - use_time, - transform, - rank, - radius, - pca_threshold); -#endif -} - -void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, - int dy, - float *weight_image, - float *variance_image, - float *scale_image, - float *difference_image, - int *rect, - int stride, - int channel_offset, - int frame_offset, - float a, - float k_2) -{ -#ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference); -#else - kernel_filter_nlm_calc_difference(dx, - dy, - weight_image, - variance_image, - scale_image, - difference_image, - load_int4(rect), - stride, - channel_offset, - frame_offset, - a, - k_2); -#endif -} - -void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)( - float *difference_image, float *out_image, int *rect, int stride, int f) -{ -#ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, filter_nlm_blur); -#else - 
kernel_filter_nlm_blur(difference_image, out_image, load_int4(rect), stride, f); -#endif -} - -void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)( - float *difference_image, float *out_image, int *rect, int stride, int f) -{ -#ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_weight); -#else - kernel_filter_nlm_calc_weight(difference_image, out_image, load_int4(rect), stride, f); -#endif -} - -void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, - int dy, - float *difference_image, - float *image, - float *temp_image, - float *out_image, - float *accum_image, - int *rect, - int channel_offset, - int stride, - int f) -{ -#ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output); -#else - kernel_filter_nlm_update_output(dx, - dy, - difference_image, - image, - temp_image, - out_image, - accum_image, - load_int4(rect), - channel_offset, - stride, - f); -#endif -} - -void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, - int dy, - int t, - float *difference_image, - float *buffer, - float *transform, - int *rank, - float *XtWX, - float3 *XtWY, - int *rect, - int *filter_window, - int stride, - int f, - int pass_stride, - int frame_offset, - bool use_time) -{ -#ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian); -#else - kernel_filter_nlm_construct_gramian(dx, - dy, - t, - difference_image, - buffer, - transform, - rank, - XtWX, - XtWY, - load_int4(rect), - load_int4(filter_window), - stride, - f, - pass_stride, - frame_offset, - use_time); -#endif -} - -void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image, - float *accum_image, - int *rect, - int stride) -{ -#ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, filter_nlm_normalize); -#else - kernel_filter_nlm_normalize(out_image, accum_image, load_int4(rect), stride); -#endif -} - -void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x, - int y, - int storage_ofs, - float *buffer, - int *rank, - float *XtWX, - float3 *XtWY, - int 
*buffer_params, - int sample) -{ -#ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, filter_finalize); -#else - XtWX += storage_ofs * XTWX_SIZE; - XtWY += storage_ofs * XTWY_SIZE; - rank += storage_ofs; - kernel_filter_finalize(x, y, buffer, rank, 1, XtWX, XtWY, load_int4(buffer_params), sample); -#endif -} - -#undef KERNEL_STUB -#undef STUB_ASSERT -#undef KERNEL_ARCH - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp deleted file mode 100644 index 75833d83648..00000000000 --- a/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* Optimized CPU kernel entry points. This file is compiled with SSE2 - * optimization flags and nearly all functions inlined, while kernel.cpp - * is compiled without for other CPU's. */ - -#include "util/util_optimization.h" - -#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 -# define KERNEL_STUB -#else -/* SSE optimization disabled for now on 32 bit, see bug T36316. 
*/ -# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# endif -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ - -#include "kernel/filter/filter.h" -#define KERNEL_ARCH cpu_sse2 -#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp deleted file mode 100644 index c998cd54d3a..00000000000 --- a/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3 - * optimization flags and nearly all functions inlined, while kernel.cpp - * is compiled without for other CPU's. */ - -#include "util/util_optimization.h" - -#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 -# define KERNEL_STUB -#else -/* SSE optimization disabled for now on 32 bit, see bug T36316. 
*/ -# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# endif -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ - -#include "kernel/filter/filter.h" -#define KERNEL_ARCH cpu_sse3 -#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp deleted file mode 100644 index fc4ef1fca5b..00000000000 --- a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3 - * optimization flags and nearly all functions inlined, while kernel.cpp - * is compiled without for other CPU's. */ - -#include "util/util_optimization.h" - -#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 -# define KERNEL_STUB -#else -/* SSE optimization disabled for now on 32 bit, see bug T36316. 
*/ -# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE__ -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -# endif -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ - -#include "kernel/filter/filter.h" -#define KERNEL_ARCH cpu_sse41 -#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h deleted file mode 100644 index ea3103f12c3..00000000000 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* Templated common declaration part of all CPU kernels. 
*/ - -void KERNEL_FUNCTION_FULL_NAME(path_trace)( - KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride); - -void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg, - uchar4 *rgba, - float *buffer, - float sample_scale, - int x, - int y, - int offset, - int stride); - -void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg, - uchar4 *rgba, - float *buffer, - float sample_scale, - int x, - int y, - int offset, - int stride); - -void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, - uint4 *input, - float4 *output, - int type, - int filter, - int i, - int offset, - int sample); - -void KERNEL_FUNCTION_FULL_NAME(bake)( - KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride); - -/* Split kernels */ - -void KERNEL_FUNCTION_FULL_NAME(data_init)(KernelGlobals *kg, - ccl_constant KernelData *data, - ccl_global void *split_data_buffer, - int num_elements, - ccl_global char *ray_state, - int start_sample, - int end_sample, - int sx, - int sy, - int sw, - int sh, - int offset, - int stride, - ccl_global int *Queue_index, - int queuesize, - ccl_global char *use_queues_flag, - ccl_global unsigned int *work_pool_wgs, - unsigned int num_samples, - ccl_global float *buffer); - -#define DECLARE_SPLIT_KERNEL_FUNCTION(name) \ - void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals * kg, KernelData * data); - -DECLARE_SPLIT_KERNEL_FUNCTION(path_init) -DECLARE_SPLIT_KERNEL_FUNCTION(scene_intersect) -DECLARE_SPLIT_KERNEL_FUNCTION(lamp_emission) -DECLARE_SPLIT_KERNEL_FUNCTION(do_volume) -DECLARE_SPLIT_KERNEL_FUNCTION(queue_enqueue) -DECLARE_SPLIT_KERNEL_FUNCTION(indirect_background) -DECLARE_SPLIT_KERNEL_FUNCTION(shader_setup) -DECLARE_SPLIT_KERNEL_FUNCTION(shader_sort) -DECLARE_SPLIT_KERNEL_FUNCTION(shader_eval) -DECLARE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao) -DECLARE_SPLIT_KERNEL_FUNCTION(subsurface_scatter) -DECLARE_SPLIT_KERNEL_FUNCTION(direct_lighting) 
-DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao) -DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl) -DECLARE_SPLIT_KERNEL_FUNCTION(enqueue_inactive) -DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup) -DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) -DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update) -DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_stopping) -DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x) -DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y) -DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples) - -#undef KERNEL_ARCH diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h deleted file mode 100644 index 51d6c23f72f..00000000000 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h +++ /dev/null @@ -1,232 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* Templated common implementation part of all CPU kernels. - * - * The idea is that particular .cpp files sets needed optimization flags and - * simply includes this file without worry of copying actual implementation over. 
- */ - -// clang-format off -#include "kernel/kernel_compat_cpu.h" - -#ifndef KERNEL_STUB -# ifndef __SPLIT_KERNEL__ -# include "kernel/kernel_math.h" -# include "kernel/kernel_types.h" - -# include "kernel/split/kernel_split_data.h" -# include "kernel/kernel_globals.h" - -# include "kernel/kernel_color.h" -# include "kernel/kernels/cpu/kernel_cpu_image.h" -# include "kernel/kernel_film.h" -# include "kernel/kernel_path.h" -# include "kernel/kernel_path_branched.h" -# include "kernel/kernel_bake.h" -# else -# include "kernel/split/kernel_split_common.h" - -# include "kernel/split/kernel_data_init.h" -# include "kernel/split/kernel_path_init.h" -# include "kernel/split/kernel_scene_intersect.h" -# include "kernel/split/kernel_lamp_emission.h" -# include "kernel/split/kernel_do_volume.h" -# include "kernel/split/kernel_queue_enqueue.h" -# include "kernel/split/kernel_indirect_background.h" -# include "kernel/split/kernel_shader_setup.h" -# include "kernel/split/kernel_shader_sort.h" -# include "kernel/split/kernel_shader_eval.h" -# include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h" -# include "kernel/split/kernel_subsurface_scatter.h" -# include "kernel/split/kernel_direct_lighting.h" -# include "kernel/split/kernel_shadow_blocked_ao.h" -# include "kernel/split/kernel_shadow_blocked_dl.h" -# include "kernel/split/kernel_enqueue_inactive.h" -# include "kernel/split/kernel_next_iteration_setup.h" -# include "kernel/split/kernel_indirect_subsurface.h" -# include "kernel/split/kernel_buffer_update.h" -# include "kernel/split/kernel_adaptive_stopping.h" -# include "kernel/split/kernel_adaptive_filter_x.h" -# include "kernel/split/kernel_adaptive_filter_y.h" -# include "kernel/split/kernel_adaptive_adjust_samples.h" -# endif /* __SPLIT_KERNEL__ */ -#else -# define STUB_ASSERT(arch, name) \ - assert(!(#name " kernel stub for architecture " #arch " was called!")) - -# ifdef __SPLIT_KERNEL__ -# include "kernel/split/kernel_data_init.h" -# endif /* 
__SPLIT_KERNEL__ */ -#endif /* KERNEL_STUB */ -// clang-format on - -CCL_NAMESPACE_BEGIN - -#ifndef __SPLIT_KERNEL__ - -/* Path Tracing */ - -void KERNEL_FUNCTION_FULL_NAME(path_trace)( - KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride) -{ -# ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, path_trace); -# else -# ifdef __BRANCHED_PATH__ - if (kernel_data.integrator.branched) { - kernel_branched_path_trace(kg, buffer, sample, x, y, offset, stride); - } - else -# endif - { - kernel_path_trace(kg, buffer, sample, x, y, offset, stride); - } -# endif /* KERNEL_STUB */ -} - -/* Film */ - -void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg, - uchar4 *rgba, - float *buffer, - float sample_scale, - int x, - int y, - int offset, - int stride) -{ -# ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, convert_to_byte); -# else - kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride); -# endif /* KERNEL_STUB */ -} - -void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg, - uchar4 *rgba, - float *buffer, - float sample_scale, - int x, - int y, - int offset, - int stride) -{ -# ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, convert_to_half_float); -# else - kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride); -# endif /* KERNEL_STUB */ -} - -/* Bake */ - -void KERNEL_FUNCTION_FULL_NAME(bake)( - KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride) -{ -# ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, bake); -# else -# ifdef __BAKING__ - kernel_bake_evaluate(kg, buffer, sample, x, y, offset, stride); -# endif -# endif /* KERNEL_STUB */ -} - -/* Shader Evaluate */ - -void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, - uint4 *input, - float4 *output, - int type, - int filter, - int i, - int offset, - int sample) -{ -# ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, shader); -# else - if (type == SHADER_EVAL_DISPLACE) { - 
kernel_displace_evaluate(kg, input, output, i); - } - else { - kernel_background_evaluate(kg, input, output, i); - } -# endif /* KERNEL_STUB */ -} - -#else /* __SPLIT_KERNEL__ */ - -/* Split Kernel Path Tracing */ - -# ifdef KERNEL_STUB -# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ - void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals * kg, KernelData * /*data*/) \ - { \ - STUB_ASSERT(KERNEL_ARCH, name); \ - } - -# define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \ - void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals * kg, KernelData * /*data*/) \ - { \ - STUB_ASSERT(KERNEL_ARCH, name); \ - } -# else -# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ - void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals * kg, KernelData * /*data*/) \ - { \ - kernel_##name(kg); \ - } - -# define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \ - void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals * kg, KernelData * /*data*/) \ - { \ - ccl_local type locals; \ - kernel_##name(kg, &locals); \ - } -# endif /* KERNEL_STUB */ - -DEFINE_SPLIT_KERNEL_FUNCTION(path_init) -DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect) -DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission) -DEFINE_SPLIT_KERNEL_FUNCTION(do_volume) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals) -DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals) -DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, - BackgroundAOLocals) -DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint) -DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao) -DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint) -DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) 
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint) -DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_stopping) -DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x) -DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y) -DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples) -#endif /* __SPLIT_KERNEL__ */ - -#undef KERNEL_STUB -#undef STUB_ASSERT -#undef KERNEL_ARCH - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split.cpp deleted file mode 100644 index 989f5e5aaa8..00000000000 --- a/intern/cycles/kernel/kernels/cpu/kernel_split.cpp +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* CPU kernel entry points */ - -/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this - * one with SSE2 intrinsics. - */ -#if defined(__x86_64__) || defined(_M_X64) -# define __KERNEL_SSE2__ -#endif - -#define __SPLIT_KERNEL__ - -/* When building kernel for native machine detect kernel features from the flags - * set by compiler. 
- */ -#ifdef WITH_KERNEL_NATIVE -# ifdef __SSE2__ -# ifndef __KERNEL_SSE2__ -# define __KERNEL_SSE2__ -# endif -# endif -# ifdef __SSE3__ -# define __KERNEL_SSE3__ -# endif -# ifdef __SSSE3__ -# define __KERNEL_SSSE3__ -# endif -# ifdef __SSE4_1__ -# define __KERNEL_SSE41__ -# endif -# ifdef __AVX__ -# define __KERNEL_AVX__ -# endif -# ifdef __AVX2__ -# define __KERNEL_SSE__ -# define __KERNEL_AVX2__ -# endif -#endif - -/* quiet unused define warnings */ -#if defined(__KERNEL_SSE2__) -/* do nothing */ -#endif - -#include "kernel/kernel.h" -#define KERNEL_ARCH cpu -#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp deleted file mode 100644 index 40e485d27c0..00000000000 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* Optimized CPU kernel entry points. This file is compiled with AVX - * optimization flags and nearly all functions inlined, while kernel.cpp - * is compiled without for other CPU's. */ - -#define __SPLIT_KERNEL__ - -#include "util/util_optimization.h" - -#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX -# define KERNEL_STUB -#else -/* SSE optimization disabled for now on 32 bit, see bug T36316. 
*/ -# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE__ -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -# define __KERNEL_AVX__ -# endif -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ - -#include "kernel/kernel.h" -#define KERNEL_ARCH cpu_avx -#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp deleted file mode 100644 index 8c44238470e..00000000000 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright 2011-2014 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* Optimized CPU kernel entry points. This file is compiled with AVX2 - * optimization flags and nearly all functions inlined, while kernel.cpp - * is compiled without for other CPU's. */ - -#define __SPLIT_KERNEL__ - -#include "util/util_optimization.h" - -#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 -# define KERNEL_STUB -#else -/* SSE optimization disabled for now on 32 bit, see bug T36316. 
*/ -# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE__ -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -# define __KERNEL_AVX__ -# define __KERNEL_AVX2__ -# endif -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ - -#include "kernel/kernel.h" -#define KERNEL_ARCH cpu_avx2 -#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp deleted file mode 100644 index 7a3f218d5fc..00000000000 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* Optimized CPU kernel entry points. This file is compiled with SSE2 - * optimization flags and nearly all functions inlined, while kernel.cpp - * is compiled without for other CPU's. */ - -#define __SPLIT_KERNEL__ - -#include "util/util_optimization.h" - -#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 -# define KERNEL_STUB -#else -/* SSE optimization disabled for now on 32 bit, see bug T36316. 
*/ -# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# endif -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ - -#include "kernel/kernel.h" -#define KERNEL_ARCH cpu_sse2 -#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp deleted file mode 100644 index 1cab59e0ea0..00000000000 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3 - * optimization flags and nearly all functions inlined, while kernel.cpp - * is compiled without for other CPU's. */ - -#define __SPLIT_KERNEL__ - -#include "util/util_optimization.h" - -#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 -# define KERNEL_STUB -#else -/* SSE optimization disabled for now on 32 bit, see bug T36316. 
*/ -# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# endif -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ - -#include "kernel/kernel.h" -#define KERNEL_ARCH cpu_sse3 -#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp deleted file mode 100644 index 637126d9d4c..00000000000 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3 - * optimization flags and nearly all functions inlined, while kernel.cpp - * is compiled without for other CPU's. */ - -#define __SPLIT_KERNEL__ - -#include "util/util_optimization.h" - -#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 -# define KERNEL_STUB -#else -/* SSE optimization disabled for now on 32 bit, see bug T36316. 
*/ -# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -# endif -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ - -#include "kernel/kernel.h" -#define KERNEL_ARCH cpu_sse41 -#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu deleted file mode 100644 index 6c9642d1f03..00000000000 --- a/intern/cycles/kernel/kernels/cuda/filter.cu +++ /dev/null @@ -1,413 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* CUDA kernel entry points */ - -#ifdef __CUDA_ARCH__ - -#include "kernel_config.h" - -#include "kernel/kernel_compat_cuda.h" - -#include "kernel/filter/filter_kernel.h" - -/* kernels */ - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_copy_input(float *buffer, - CCL_FILTER_TILE_INFO, - int4 prefilter_rect, - int buffer_pass_stride) -{ - int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; - int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; - if(x < prefilter_rect.z && y < prefilter_rect.w) { - int xtile = (x < tile_info->x[1]) ? 0 : ((x < tile_info->x[2]) ? 1 : 2); - int ytile = (y < tile_info->y[1]) ? 0 : ((y < tile_info->y[2]) ? 
1 : 2); - int itile = ytile * 3 + xtile; - float *const in = ((float *)ccl_get_tile_buffer(itile)) + - (tile_info->offsets[itile] + y * tile_info->strides[itile] + x) * buffer_pass_stride; - buffer += ((y - prefilter_rect.y) * (prefilter_rect.z - prefilter_rect.x) + (x - prefilter_rect.x)) * buffer_pass_stride; - for (int i = 0; i < buffer_pass_stride; ++i) - buffer[i] = in[i]; - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_convert_to_rgb(float *rgb, float *buf, int sw, int sh, int stride, int pass_stride, int3 pass_offset, int num_inputs, int num_samples) -{ - int x = blockDim.x*blockIdx.x + threadIdx.x; - int y = blockDim.y*blockIdx.y + threadIdx.y; - if(x < sw && y < sh) { - if (num_inputs > 0) { - float *in = buf + x * pass_stride + (y * stride + pass_offset.x) / sizeof(float); - float *out = rgb + (x + y * sw) * 3; - out[0] = clamp(in[0] / num_samples, 0.0f, 10000.0f); - out[1] = clamp(in[1] / num_samples, 0.0f, 10000.0f); - out[2] = clamp(in[2] / num_samples, 0.0f, 10000.0f); - } - if (num_inputs > 1) { - float *in = buf + x * pass_stride + (y * stride + pass_offset.y) / sizeof(float); - float *out = rgb + (x + y * sw) * 3 + (sw * sh) * 3; - out[0] = in[0] / num_samples; - out[1] = in[1] / num_samples; - out[2] = in[2] / num_samples; - } - if (num_inputs > 2) { - float *in = buf + x * pass_stride + (y * stride + pass_offset.z) / sizeof(float); - float *out = rgb + (x + y * sw) * 3 + (sw * sh * 2) * 3; - out[0] = in[0] / num_samples; - out[1] = in[1] / num_samples; - out[2] = in[2] / num_samples; - } - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_convert_from_rgb(float *rgb, float *buf, int ix, int iy, int iw, int ih, int sx, int sy, int sw, int sh, int offset, int stride, int pass_stride, int num_samples) -{ - int x = blockDim.x*blockIdx.x + threadIdx.x; - int y = blockDim.y*blockIdx.y + 
threadIdx.y; - if(x < sw && y < sh) { - float *in = rgb + ((ix + x) + (iy + y) * iw) * 3; - float *out = buf + (offset + (sx + x) + (sy + y) * stride) * pass_stride; - out[0] = in[0] * num_samples; - out[1] = in[1] * num_samples; - out[2] = in[2] * num_samples; - } -} - - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_divide_shadow(int sample, - CCL_FILTER_TILE_INFO, - float *unfilteredA, - float *unfilteredB, - float *sampleVariance, - float *sampleVarianceV, - float *bufferVariance, - int4 prefilter_rect, - int buffer_pass_stride, - int buffer_denoising_offset) -{ - int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; - int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; - if(x < prefilter_rect.z && y < prefilter_rect.w) { - kernel_filter_divide_shadow(sample, - tile_info, - x, y, - unfilteredA, - unfilteredB, - sampleVariance, - sampleVarianceV, - bufferVariance, - prefilter_rect, - buffer_pass_stride, - buffer_denoising_offset); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_get_feature(int sample, - CCL_FILTER_TILE_INFO, - int m_offset, - int v_offset, - float *mean, - float *variance, - float scale, - int4 prefilter_rect, - int buffer_pass_stride, - int buffer_denoising_offset) -{ - int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; - int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; - if(x < prefilter_rect.z && y < prefilter_rect.w) { - kernel_filter_get_feature(sample, - tile_info, - m_offset, v_offset, - x, y, - mean, variance, - scale, - prefilter_rect, - buffer_pass_stride, - buffer_denoising_offset); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_write_feature(int sample, - int4 buffer_params, - int4 filter_area, - float *from, - float *buffer, - int out_offset, - int4 
prefilter_rect) -{ - int x = blockDim.x*blockIdx.x + threadIdx.x; - int y = blockDim.y*blockIdx.y + threadIdx.y; - if(x < filter_area.z && y < filter_area.w) { - kernel_filter_write_feature(sample, - x + filter_area.x, - y + filter_area.y, - buffer_params, - from, - buffer, - out_offset, - prefilter_rect); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_detect_outliers(float *image, - float *variance, - float *depth, - float *output, - int4 prefilter_rect, - int pass_stride) -{ - int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; - int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; - if(x < prefilter_rect.z && y < prefilter_rect.w) { - kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float *b, int4 prefilter_rect, int r) -{ - int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; - int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; - if(x < prefilter_rect.z && y < prefilter_rect.w) { - kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_construct_transform(float const* __restrict__ buffer, - CCL_FILTER_TILE_INFO, - float *transform, int *rank, - int4 filter_area, int4 rect, - int radius, float pca_threshold, - int pass_stride, int frame_stride, - bool use_time) -{ - int x = blockDim.x*blockIdx.x + threadIdx.x; - int y = blockDim.y*blockIdx.y + threadIdx.y; - if(x < filter_area.z && y < filter_area.w) { - int *l_rank = rank + y*filter_area.z + x; - float *l_transform = transform + y*filter_area.z + x; - kernel_filter_construct_transform(buffer, - tile_info, 
- x + filter_area.x, y + filter_area.y, - rect, - pass_stride, frame_stride, - use_time, - l_transform, l_rank, - radius, pca_threshold, - filter_area.z*filter_area.w, - threadIdx.y*blockDim.x + threadIdx.x); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image, - const float *ccl_restrict variance_image, - const float *ccl_restrict scale_image, - float *difference_image, - int w, - int h, - int stride, - int pass_stride, - int r, - int channel_offset, - int frame_offset, - float a, - float k_2) -{ - int4 co, rect; - int ofs; - if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { - kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w, - weight_image, - variance_image, - scale_image, - difference_image + ofs, - rect, stride, - channel_offset, - frame_offset, - a, k_2); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image, - float *out_image, - int w, - int h, - int stride, - int pass_stride, - int r, - int f) -{ - int4 co, rect; - int ofs; - if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { - kernel_filter_nlm_blur(co.x, co.y, - difference_image + ofs, - out_image + ofs, - rect, stride, f); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image, - float *out_image, - int w, - int h, - int stride, - int pass_stride, - int r, - int f) -{ - int4 co, rect; - int ofs; - if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { - kernel_filter_nlm_calc_weight(co.x, co.y, - difference_image + ofs, - out_image + ofs, - rect, stride, f); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) 
-kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image, - const float *ccl_restrict image, - float *out_image, - float *accum_image, - int w, - int h, - int stride, - int pass_stride, - int channel_offset, - int r, - int f) -{ - int4 co, rect; - int ofs; - if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { - kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w, - difference_image + ofs, - image, - out_image, - accum_image, - rect, - channel_offset, - stride, f); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_normalize(float *out_image, - const float *ccl_restrict accum_image, - int w, - int h, - int stride) -{ - int x = blockDim.x*blockIdx.x + threadIdx.x; - int y = blockDim.y*blockIdx.y + threadIdx.y; - if(x < w && y < h) { - kernel_filter_nlm_normalize(x, y, out_image, accum_image, stride); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_construct_gramian(int t, - const float *ccl_restrict difference_image, - const float *ccl_restrict buffer, - float const* __restrict__ transform, - int *rank, - float *XtWX, - float3 *XtWY, - int4 filter_window, - int w, - int h, - int stride, - int pass_stride, - int r, - int f, - int frame_offset, - bool use_time) -{ - int4 co, rect; - int ofs; - if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) { - kernel_filter_nlm_construct_gramian(co.x, co.y, - co.z, co.w, - t, - difference_image + ofs, - buffer, - transform, rank, - XtWX, XtWY, - rect, filter_window, - stride, f, - pass_stride, - frame_offset, - use_time, - threadIdx.y*blockDim.x + threadIdx.x); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_finalize(float *buffer, - int *rank, - float *XtWX, - float3 *XtWY, - int4 filter_area, - int4 buffer_params, - 
int sample) -{ - int x = blockDim.x*blockIdx.x + threadIdx.x; - int y = blockDim.y*blockIdx.y + threadIdx.y; - if(x < filter_area.z && y < filter_area.w) { - int storage_ofs = y*filter_area.z+x; - rank += storage_ofs; - XtWX += storage_ofs; - XtWY += storage_ofs; - kernel_filter_finalize(x, y, buffer, rank, - filter_area.z*filter_area.w, - XtWX, XtWY, - buffer_params, sample); - } -} - -#endif - diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu deleted file mode 100644 index cf62b6e781e..00000000000 --- a/intern/cycles/kernel/kernels/cuda/kernel.cu +++ /dev/null @@ -1,232 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* CUDA kernel entry points */ - -#ifdef __CUDA_ARCH__ - -#include "kernel/kernel_compat_cuda.h" -#include "kernel_config.h" - -#include "util/util_atomic.h" - -#include "kernel/kernel_math.h" -#include "kernel/kernel_types.h" -#include "kernel/kernel_globals.h" -#include "kernel/kernel_color.h" -#include "kernel/kernels/cuda/kernel_cuda_image.h" -#include "kernel/kernel_film.h" -#include "kernel/kernel_path.h" -#include "kernel/kernel_path_branched.h" -#include "kernel/kernel_bake.h" -#include "kernel/kernel_work_stealing.h" -#include "kernel/kernel_adaptive_sampling.h" - -/* kernels */ -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_path_trace(WorkTile *tile, uint total_work_size) -{ - int work_index = ccl_global_id(0); - bool thread_is_active = work_index < total_work_size; - uint x, y, sample; - KernelGlobals kg; - if(thread_is_active) { - get_work_pixel(tile, work_index, &x, &y, &sample); - - kernel_path_trace(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride); - } - - if(kernel_data.film.cryptomatte_passes) { - __syncthreads(); - if(thread_is_active) { - kernel_cryptomatte_post(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride); - } - } -} - -#ifdef __BRANCHED_PATH__ -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_BRANCHED_MAX_REGISTERS) -kernel_cuda_branched_path_trace(WorkTile *tile, uint total_work_size) -{ - int work_index = ccl_global_id(0); - bool thread_is_active = work_index < total_work_size; - uint x, y, sample; - KernelGlobals kg; - if(thread_is_active) { - get_work_pixel(tile, work_index, &x, &y, &sample); - - kernel_branched_path_trace(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride); - } - - if(kernel_data.film.cryptomatte_passes) { - __syncthreads(); - if(thread_is_active) { - kernel_cryptomatte_post(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride); - } - } -} -#endif - -extern "C" 
__global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_adaptive_stopping(WorkTile *tile, int sample, uint total_work_size) -{ - int work_index = ccl_global_id(0); - bool thread_is_active = work_index < total_work_size; - KernelGlobals kg; - if(thread_is_active && kernel_data.film.pass_adaptive_aux_buffer) { - uint x = tile->x + work_index % tile->w; - uint y = tile->y + work_index / tile->w; - int index = tile->offset + x + y * tile->stride; - ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride; - kernel_do_adaptive_stopping(&kg, buffer, sample); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_adaptive_filter_x(WorkTile *tile, int sample, uint) -{ - KernelGlobals kg; - if(kernel_data.film.pass_adaptive_aux_buffer && sample > kernel_data.integrator.adaptive_min_samples) { - if(ccl_global_id(0) < tile->h) { - int y = tile->y + ccl_global_id(0); - kernel_do_adaptive_filter_x(&kg, y, tile); - } - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_adaptive_filter_y(WorkTile *tile, int sample, uint) -{ - KernelGlobals kg; - if(kernel_data.film.pass_adaptive_aux_buffer && sample > kernel_data.integrator.adaptive_min_samples) { - if(ccl_global_id(0) < tile->w) { - int x = tile->x + ccl_global_id(0); - kernel_do_adaptive_filter_y(&kg, x, tile); - } - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_adaptive_scale_samples(WorkTile *tile, int start_sample, int sample, uint total_work_size) -{ - if(kernel_data.film.pass_adaptive_aux_buffer) { - int work_index = ccl_global_id(0); - bool thread_is_active = work_index < total_work_size; - KernelGlobals kg; - if(thread_is_active) { - uint x = tile->x + work_index % tile->w; - uint y = tile->y + work_index / tile->w; - int index = tile->offset + 
x + y * tile->stride; - ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride; - if(buffer[kernel_data.film.pass_sample_count] < 0.0f) { - buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count]; - float sample_multiplier = sample / buffer[kernel_data.film.pass_sample_count]; - if(sample_multiplier != 1.0f) { - kernel_adaptive_post_adjust(&kg, buffer, sample_multiplier); - } - } - else { - kernel_adaptive_post_adjust(&kg, buffer, sample / (sample - 1.0f)); - } - } - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) -{ - int x = sx + blockDim.x*blockIdx.x + threadIdx.x; - int y = sy + blockDim.y*blockIdx.y + threadIdx.y; - - if(x < sx + sw && y < sy + sh) { - kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) -{ - int x = sx + blockDim.x*blockIdx.x + threadIdx.x; - int y = sy + blockDim.y*blockIdx.y + threadIdx.y; - - if(x < sx + sw && y < sy + sh) { - kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_displace(uint4 *input, - float4 *output, - int type, - int sx, - int sw, - int offset, - int sample) -{ - int x = sx + blockDim.x*blockIdx.x + threadIdx.x; - - if(x < sx + sw) { - KernelGlobals kg; - kernel_displace_evaluate(&kg, input, output, x); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) 
-kernel_cuda_background(uint4 *input, - float4 *output, - int type, - int sx, - int sw, - int offset, - int sample) -{ - int x = sx + blockDim.x*blockIdx.x + threadIdx.x; - - if(x < sx + sw) { - KernelGlobals kg; - kernel_background_evaluate(&kg, input, output, x); - } -} - -#ifdef __BAKING__ -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_bake(WorkTile *tile, uint total_work_size) -{ - int work_index = ccl_global_id(0); - - if(work_index < total_work_size) { - uint x, y, sample; - get_work_pixel(tile, work_index, &x, &y, &sample); - - KernelGlobals kg; - kernel_bake_evaluate(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride); - } -} -#endif - -#endif - diff --git a/intern/cycles/kernel/kernels/cuda/kernel_config.h b/intern/cycles/kernel/kernels/cuda/kernel_config.h deleted file mode 100644 index 2e47ce2de6c..00000000000 --- a/intern/cycles/kernel/kernels/cuda/kernel_config.h +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* device data taken from CUDA occupancy calculator */ - -/* 3.0 and 3.5 */ -#if __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 63 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 63 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* 3.2 */ -#elif __CUDA_ARCH__ == 320 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 63 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 63 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* 3.7 */ -#elif __CUDA_ARCH__ == 370 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 255 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 63 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* 5.x, 6.x */ -#elif __CUDA_ARCH__ <= 699 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 255 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -/* CUDA 9.0 seems to cause slowdowns on high-end Pascal cards unless we increase the number of - * registers */ -# if __CUDACC_VER_MAJOR__ >= 9 && __CUDA_ARCH__ >= 600 -# define CUDA_KERNEL_MAX_REGISTERS 64 -# else -# define CUDA_KERNEL_MAX_REGISTERS 48 -# endif -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* 7.x, 8.x */ -#elif __CUDA_ARCH__ <= 899 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 255 - -/* 
tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 64 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 72 - -/* unknown architecture */ -#else -# error "Unknown or unsupported CUDA architecture, can't determine launch bounds" -#endif - -/* For split kernel using all registers seems fastest for now, but this - * is unlikely to be optimal once we resolve other bottlenecks. */ - -#define CUDA_KERNEL_SPLIT_MAX_REGISTERS CUDA_THREAD_MAX_REGISTERS - -/* Compute number of threads per block and minimum blocks per multiprocessor - * given the maximum number of registers per thread. */ - -#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \ - __launch_bounds__(threads_block_width *threads_block_width, \ - CUDA_MULTIPRESSOR_MAX_REGISTERS / \ - (threads_block_width * threads_block_width * thread_num_registers)) - -/* sanity checks */ - -#if CUDA_THREADS_BLOCK_WIDTH * CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS -# error "Maximum number of threads per block exceeded" -#endif - -#if CUDA_MULTIPRESSOR_MAX_REGISTERS / \ - (CUDA_THREADS_BLOCK_WIDTH * CUDA_THREADS_BLOCK_WIDTH * CUDA_KERNEL_MAX_REGISTERS) > \ - CUDA_MULTIPROCESSOR_MAX_BLOCKS -# error "Maximum number of blocks per multiprocessor exceeded" -#endif - -#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS -# error "Maximum number of registers per thread exceeded" -#endif - -#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS -# error "Maximum number of registers per thread exceeded" -#endif diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu deleted file mode 100644 index 95ad7599cf1..00000000000 --- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright 2011-2016 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* CUDA split kernel entry points */ - -#ifdef __CUDA_ARCH__ - -#define __SPLIT_KERNEL__ - -#include "kernel/kernel_compat_cuda.h" -#include "kernel_config.h" - -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_data_init.h" -#include "kernel/split/kernel_path_init.h" -#include "kernel/split/kernel_scene_intersect.h" -#include "kernel/split/kernel_lamp_emission.h" -#include "kernel/split/kernel_do_volume.h" -#include "kernel/split/kernel_queue_enqueue.h" -#include "kernel/split/kernel_indirect_background.h" -#include "kernel/split/kernel_shader_setup.h" -#include "kernel/split/kernel_shader_sort.h" -#include "kernel/split/kernel_shader_eval.h" -#include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h" -#include "kernel/split/kernel_subsurface_scatter.h" -#include "kernel/split/kernel_direct_lighting.h" -#include "kernel/split/kernel_shadow_blocked_ao.h" -#include "kernel/split/kernel_shadow_blocked_dl.h" -#include "kernel/split/kernel_enqueue_inactive.h" -#include "kernel/split/kernel_next_iteration_setup.h" -#include "kernel/split/kernel_indirect_subsurface.h" -#include "kernel/split/kernel_buffer_update.h" -#include "kernel/split/kernel_adaptive_stopping.h" -#include "kernel/split/kernel_adaptive_filter_x.h" -#include "kernel/split/kernel_adaptive_filter_y.h" -#include "kernel/split/kernel_adaptive_adjust_samples.h" - -#include "kernel/kernel_film.h" - -/* kernels */ -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) 
-kernel_cuda_state_buffer_size(uint num_threads, uint64_t *size) -{ - *size = split_data_buffer_size(NULL, num_threads); -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_path_trace_data_init( - ccl_global void *split_data_buffer, - int num_elements, - ccl_global char *ray_state, - int start_sample, - int end_sample, - int sx, int sy, int sw, int sh, int offset, int stride, - ccl_global int *Queue_index, - int queuesize, - ccl_global char *use_queues_flag, - ccl_global unsigned int *work_pool_wgs, - unsigned int num_samples, - ccl_global float *buffer) -{ - kernel_data_init(NULL, - NULL, - split_data_buffer, - num_elements, - ray_state, - start_sample, - end_sample, - sx, sy, sw, sh, offset, stride, - Queue_index, - queuesize, - use_queues_flag, - work_pool_wgs, - num_samples, - buffer); -} - -#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ - extern "C" __global__ void \ - CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \ - kernel_cuda_##name() \ - { \ - kernel_##name(NULL); \ - } - -#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \ - extern "C" __global__ void \ - CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \ - kernel_cuda_##name() \ - { \ - ccl_local type locals; \ - kernel_##name(NULL, &locals); \ - } - -DEFINE_SPLIT_KERNEL_FUNCTION(path_init) -DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect) -DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission) -DEFINE_SPLIT_KERNEL_FUNCTION(do_volume) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals) -DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals) -DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals) -DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter) 
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint) -DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao) -DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint) -DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint) -DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_stopping) -DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x) -DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y) -DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples) - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) -{ - int x = sx + blockDim.x*blockIdx.x + threadIdx.x; - int y = sy + blockDim.y*blockIdx.y + threadIdx.y; - - if(x < sx + sw && y < sy + sh) - kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride); -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) -{ - int x = sx + blockDim.x*blockIdx.x + threadIdx.x; - int y = sy + blockDim.y*blockIdx.y + threadIdx.y; - - if(x < sx + sw && y < sy + sh) - kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride); -} - -#endif - diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl deleted file mode 100644 index 996bc27f71b..00000000000 --- a/intern/cycles/kernel/kernels/opencl/filter.cl +++ /dev/null @@ -1,321 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* OpenCL kernel entry points */ - -#include "kernel/kernel_compat_opencl.h" - -#include "kernel/filter/filter_kernel.h" - -/* kernels */ - -__kernel void kernel_ocl_filter_divide_shadow(int sample, - CCL_FILTER_TILE_INFO, - ccl_global float *unfilteredA, - ccl_global float *unfilteredB, - ccl_global float *sampleVariance, - ccl_global float *sampleVarianceV, - ccl_global float *bufferVariance, - int4 prefilter_rect, - int buffer_pass_stride, - int buffer_denoising_offset) -{ - int x = prefilter_rect.x + get_global_id(0); - int y = prefilter_rect.y + get_global_id(1); - if(x < prefilter_rect.z && y < prefilter_rect.w) { - kernel_filter_divide_shadow(sample, - CCL_FILTER_TILE_INFO_ARG, - x, y, - unfilteredA, - unfilteredB, - sampleVariance, - sampleVarianceV, - bufferVariance, - prefilter_rect, - buffer_pass_stride, - buffer_denoising_offset); - } -} - -__kernel void kernel_ocl_filter_get_feature(int sample, - CCL_FILTER_TILE_INFO, - int m_offset, - int v_offset, - ccl_global float *mean, - ccl_global float *variance, - float scale, - int4 prefilter_rect, - int buffer_pass_stride, - int buffer_denoising_offset) -{ - int x = prefilter_rect.x + get_global_id(0); - int y = prefilter_rect.y + get_global_id(1); - if(x < prefilter_rect.z && y < prefilter_rect.w) { - kernel_filter_get_feature(sample, - CCL_FILTER_TILE_INFO_ARG, - m_offset, v_offset, - x, y, - mean, variance, - scale, - prefilter_rect, - buffer_pass_stride, - buffer_denoising_offset); - } -} - -__kernel void kernel_ocl_filter_write_feature(int sample, - int4 buffer_params, - int4 
filter_area, - ccl_global float *from, - ccl_global float *buffer, - int out_offset, - int4 prefilter_rect) -{ - int x = get_global_id(0); - int y = get_global_id(1); - if(x < filter_area.z && y < filter_area.w) { - kernel_filter_write_feature(sample, - x + filter_area.x, - y + filter_area.y, - buffer_params, - from, - buffer, - out_offset, - prefilter_rect); - } -} - -__kernel void kernel_ocl_filter_detect_outliers(ccl_global float *image, - ccl_global float *variance, - ccl_global float *depth, - ccl_global float *output, - int4 prefilter_rect, - int pass_stride) -{ - int x = prefilter_rect.x + get_global_id(0); - int y = prefilter_rect.y + get_global_id(1); - if(x < prefilter_rect.z && y < prefilter_rect.w) { - kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride); - } -} - -__kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean, - ccl_global float *variance, - ccl_global float *a, - ccl_global float *b, - int4 prefilter_rect, - int r) -{ - int x = prefilter_rect.x + get_global_id(0); - int y = prefilter_rect.y + get_global_id(1); - if(x < prefilter_rect.z && y < prefilter_rect.w) { - kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r); - } -} - -__kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_restrict buffer, - CCL_FILTER_TILE_INFO, - ccl_global float *transform, - ccl_global int *rank, - int4 filter_area, - int4 rect, - int pass_stride, - int frame_stride, - char use_time, - int radius, - float pca_threshold) -{ - int x = get_global_id(0); - int y = get_global_id(1); - if(x < filter_area.z && y < filter_area.w) { - ccl_global int *l_rank = rank + y*filter_area.z + x; - ccl_global float *l_transform = transform + y*filter_area.z + x; - kernel_filter_construct_transform(buffer, - CCL_FILTER_TILE_INFO_ARG, - x + filter_area.x, y + filter_area.y, - rect, - pass_stride, frame_stride, - use_time, - l_transform, l_rank, - radius, pca_threshold, - 
filter_area.z*filter_area.w, - get_local_id(1)*get_local_size(0) + get_local_id(0)); - } -} - -__kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_restrict weight_image, - const ccl_global float *ccl_restrict variance_image, - const ccl_global float *ccl_restrict scale_image, - ccl_global float *difference_image, - int w, - int h, - int stride, - int pass_stride, - int r, - int channel_offset, - int frame_offset, - float a, - float k_2) -{ - int4 co, rect; - int ofs; - if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { - kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w, - weight_image, - variance_image, - scale_image, - difference_image + ofs, - rect, stride, - channel_offset, - frame_offset, - a, k_2); - } -} - -__kernel void kernel_ocl_filter_nlm_blur(const ccl_global float *ccl_restrict difference_image, - ccl_global float *out_image, - int w, - int h, - int stride, - int pass_stride, - int r, - int f) -{ - int4 co, rect; - int ofs; - if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { - kernel_filter_nlm_blur(co.x, co.y, - difference_image + ofs, - out_image + ofs, - rect, stride, f); - } -} - -__kernel void kernel_ocl_filter_nlm_calc_weight(const ccl_global float *ccl_restrict difference_image, - ccl_global float *out_image, - int w, - int h, - int stride, - int pass_stride, - int r, - int f) -{ - int4 co, rect; - int ofs; - if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { - kernel_filter_nlm_calc_weight(co.x, co.y, - difference_image + ofs, - out_image + ofs, - rect, stride, f); - } -} - -__kernel void kernel_ocl_filter_nlm_update_output(const ccl_global float *ccl_restrict difference_image, - const ccl_global float *ccl_restrict image, - ccl_global float *out_image, - ccl_global float *accum_image, - int w, - int h, - int stride, - int pass_stride, - int channel_offset, - int r, - int f) -{ - int4 co, rect; - int ofs; - if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { - 
kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w, - difference_image + ofs, - image, - out_image, - accum_image, - rect, - channel_offset, - stride, f); - } -} - -__kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *out_image, - const ccl_global float *ccl_restrict accum_image, - int w, - int h, - int stride) -{ - int x = get_global_id(0); - int y = get_global_id(1); - if(x < w && y < h) { - kernel_filter_nlm_normalize(x, y, out_image, accum_image, stride); - } -} - -__kernel void kernel_ocl_filter_nlm_construct_gramian(int t, - const ccl_global float *ccl_restrict difference_image, - const ccl_global float *ccl_restrict buffer, - const ccl_global float *ccl_restrict transform, - ccl_global int *rank, - ccl_global float *XtWX, - ccl_global float3 *XtWY, - int4 filter_window, - int w, - int h, - int stride, - int pass_stride, - int r, - int f, - int frame_offset, - char use_time) -{ - int4 co, rect; - int ofs; - if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) { - kernel_filter_nlm_construct_gramian(co.x, co.y, - co.z, co.w, - t, - difference_image + ofs, - buffer, - transform, rank, - XtWX, XtWY, - rect, filter_window, - stride, f, - pass_stride, - frame_offset, - use_time, - get_local_id(1)*get_local_size(0) + get_local_id(0)); - } -} - -__kernel void kernel_ocl_filter_finalize(ccl_global float *buffer, - ccl_global int *rank, - ccl_global float *XtWX, - ccl_global float3 *XtWY, - int4 filter_area, - int4 buffer_params, - int sample) -{ - int x = get_global_id(0); - int y = get_global_id(1); - if(x < filter_area.z && y < filter_area.w) { - int storage_ofs = y*filter_area.z+x; - rank += storage_ofs; - XtWX += storage_ofs; - XtWY += storage_ofs; - kernel_filter_finalize(x, y, buffer, rank, - filter_area.z*filter_area.w, - XtWX, XtWY, - buffer_params, sample); - } -} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl 
deleted file mode 100644 index ebdb99d4730..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2019 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_adaptive_adjust_samples.h" - -#define KERNEL_NAME adaptive_adjust_samples -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl deleted file mode 100644 index 76d82d4184e..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2019 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_adaptive_filter_x.h" - -#define KERNEL_NAME adaptive_filter_x -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl deleted file mode 100644 index 1e6d15ba0f2..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2019 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_adaptive_filter_y.h" - -#define KERNEL_NAME adaptive_filter_y -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl deleted file mode 100644 index 51de0059667..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2019 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_adaptive_stopping.h" - -#define KERNEL_NAME adaptive_stopping -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME diff --git a/intern/cycles/kernel/kernels/opencl/kernel_background.cl b/intern/cycles/kernel/kernels/opencl/kernel_background.cl deleted file mode 100644 index 0e600676e82..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_background.cl +++ /dev/null @@ -1,35 +0,0 @@ - -#include "kernel/kernel_compat_opencl.h" -#include "kernel/kernel_math.h" -#include "kernel/kernel_types.h" -#include "kernel/kernel_globals.h" -#include "kernel/kernel_color.h" -#include "kernel/kernels/opencl/kernel_opencl_image.h" - -#include "kernel/kernel_path.h" -#include "kernel/kernel_path_branched.h" - -#include "kernel/kernel_bake.h" - -__kernel void kernel_ocl_background( - ccl_constant KernelData *data, - ccl_global uint4 *input, - ccl_global float4 *output, - - KERNEL_BUFFER_PARAMS, - - int type, int sx, int sw, int offset, int sample) -{ - KernelGlobals kglobals, *kg = &kglobals; - - kg->data = data; - - kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); - kernel_set_buffer_info(kg); - - int x = sx + ccl_global_id(0); - - if(x < sx + sw) { - kernel_background_evaluate(kg, input, output, x); - } -} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_bake.cl b/intern/cycles/kernel/kernels/opencl/kernel_bake.cl deleted file mode 100644 index 7b81e387467..00000000000 --- 
a/intern/cycles/kernel/kernels/opencl/kernel_bake.cl +++ /dev/null @@ -1,36 +0,0 @@ -#include "kernel/kernel_compat_opencl.h" -#include "kernel/kernel_math.h" -#include "kernel/kernel_types.h" -#include "kernel/kernel_globals.h" -#include "kernel/kernel_color.h" -#include "kernel/kernels/opencl/kernel_opencl_image.h" - -#include "kernel/kernel_path.h" -#include "kernel/kernel_path_branched.h" - -#include "kernel/kernel_bake.h" - -__kernel void kernel_ocl_bake( - ccl_constant KernelData *data, - ccl_global float *buffer, - - KERNEL_BUFFER_PARAMS, - - int sx, int sy, int sw, int sh, int offset, int stride, int sample) -{ - KernelGlobals kglobals, *kg = &kglobals; - - kg->data = data; - - kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); - kernel_set_buffer_info(kg); - - int x = sx + ccl_global_id(0); - int y = sy + ccl_global_id(1); - - if(x < sx + sw && y < sy + sh) { -#ifndef __NO_BAKING__ - kernel_bake_evaluate(kg, buffer, sample, x, y, offset, stride); -#endif - } -} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_base.cl b/intern/cycles/kernel/kernels/opencl/kernel_base.cl deleted file mode 100644 index 1c2d89e8a92..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_base.cl +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* OpenCL base kernels entry points */ - -#include "kernel/kernel_compat_opencl.h" -#include "kernel/kernel_types.h" -#include "kernel/kernel_globals.h" - -#include "kernel/kernel_film.h" - - -__kernel void kernel_ocl_convert_to_byte( - ccl_constant KernelData *data, - ccl_global uchar4 *rgba, - ccl_global float *buffer, - - KERNEL_BUFFER_PARAMS, - - float sample_scale, - int sx, int sy, int sw, int sh, int offset, int stride) -{ - KernelGlobals kglobals, *kg = &kglobals; - - kg->data = data; - - kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); - kernel_set_buffer_info(kg); - - int x = sx + ccl_global_id(0); - int y = sy + ccl_global_id(1); - - if(x < sx + sw && y < sy + sh) - kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride); -} - -__kernel void kernel_ocl_convert_to_half_float( - ccl_constant KernelData *data, - ccl_global uchar4 *rgba, - ccl_global float *buffer, - - KERNEL_BUFFER_PARAMS, - - float sample_scale, - int sx, int sy, int sw, int sh, int offset, int stride) -{ - KernelGlobals kglobals, *kg = &kglobals; - - kg->data = data; - - kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); - kernel_set_buffer_info(kg); - - int x = sx + ccl_global_id(0); - int y = sy + ccl_global_id(1); - - if(x < sx + sw && y < sy + sh) - kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride); -} - -__kernel void kernel_ocl_zero_buffer(ccl_global float4 *buffer, uint64_t size, uint64_t offset) -{ - size_t i = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0); - - if(i < size / sizeof(float4)) { - buffer[i+offset/sizeof(float4)] = make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - else if(i == size / sizeof(float4)) { - ccl_global uchar *b = (ccl_global uchar*)&buffer[i+offset/sizeof(float4)]; - - for(i = 0; i < size % sizeof(float4); i++) { - *(b++) = 0; - } - } -} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl deleted file 
mode 100644 index 7125348a49f..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_data_init.h" - -__kernel void kernel_ocl_path_trace_data_init( - ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global void *split_data_buffer, - int num_elements, - ccl_global char *ray_state, - KERNEL_BUFFER_PARAMS, - int start_sample, - int end_sample, - int sx, int sy, int sw, int sh, int offset, int stride, - ccl_global int *Queue_index, /* Tracks the number of elements in queues */ - int queuesize, /* size (capacity) of the queue */ - ccl_global char *use_queues_flag, /* flag to decide if scene-intersect kernel should use queues to fetch ray index */ - ccl_global unsigned int *work_pool_wgs, /* Work pool for each work group */ - unsigned int num_samples, /* Total number of samples per pixel */ - ccl_global float *buffer) -{ - kernel_data_init((KernelGlobals*)kg, - data, - split_data_buffer, - num_elements, - ray_state, - KERNEL_BUFFER_ARGS, - start_sample, - end_sample, - sx, sy, sw, sh, offset, stride, - Queue_index, - queuesize, - use_queues_flag, - work_pool_wgs, - num_samples, - buffer); -} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_displace.cl 
b/intern/cycles/kernel/kernels/opencl/kernel_displace.cl deleted file mode 100644 index 76cc36971f5..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_displace.cl +++ /dev/null @@ -1,36 +0,0 @@ - -#include "kernel/kernel_compat_opencl.h" -#include "kernel/kernel_math.h" -#include "kernel/kernel_types.h" -#include "kernel/kernel_globals.h" -#include "kernel/kernel_color.h" -#include "kernel/kernels/opencl/kernel_opencl_image.h" - -#include "kernel/kernel_path.h" -#include "kernel/kernel_path_branched.h" - -#include "kernel/kernel_bake.h" - -__kernel void kernel_ocl_displace( - ccl_constant KernelData *data, - ccl_global uint4 *input, - ccl_global float4 *output, - - KERNEL_BUFFER_PARAMS, - - int type, int sx, int sw, int offset, int sample) -{ - KernelGlobals kglobals, *kg = &kglobals; - - kg->data = data; - - kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); - kernel_set_buffer_info(kg); - - int x = sx + ccl_global_id(0); - - if(x < sx + sw) { - kernel_displace_evaluate(kg, input, output, x); - } -} - diff --git a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl deleted file mode 100644 index 8b1332bf013..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_next_iteration_setup.h" - -#define KERNEL_NAME next_iteration_setup -#define LOCALS_TYPE unsigned int -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME -#undef LOCALS_TYPE - diff --git a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h deleted file mode 100644 index bb6b8a40e8e..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h +++ /dev/null @@ -1,358 +0,0 @@ -/* - * Copyright 2016 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifdef WITH_NANOVDB -/* Data type to replace `double` used in the NanoVDB headers. Cycles don't need doubles, and is - * safer and more portable to never use double datatype on GPU. - * Use a special structure, so that the following is true: - * - No unnoticed implicit cast or mathematical operations used on scalar 64bit type - * (which rules out trick like using `uint64_t` as a drop-in replacement for double). - * - Padding rules are matching exactly `double` - * (which rules out array of `uint8_t`). */ -typedef struct ccl_vdb_double_t { - uint64_t i; -} ccl_vdb_double_t; - -# define double ccl_vdb_double_t -# include "nanovdb/CNanoVDB.h" -# undef double -#endif - -/* For OpenCL we do manual lookup and interpolation. 
*/ - -ccl_device_inline ccl_global TextureInfo *kernel_tex_info(KernelGlobals *kg, uint id) -{ - const uint tex_offset = id -#define KERNEL_TEX(type, name) +1 -#include "kernel/kernel_textures.h" - ; - - return &((ccl_global TextureInfo *)kg->buffers[0])[tex_offset]; -} - -#define tex_fetch(type, info, index) \ - ((ccl_global type *)(kg->buffers[info->cl_buffer] + info->data))[(index)] - -ccl_device_inline int svm_image_texture_wrap_periodic(int x, int width) -{ - x %= width; - if (x < 0) - x += width; - return x; -} - -ccl_device_inline int svm_image_texture_wrap_clamp(int x, int width) -{ - return clamp(x, 0, width - 1); -} - -ccl_device_inline float4 svm_image_texture_read( - KernelGlobals *kg, const ccl_global TextureInfo *info, void *acc, int x, int y, int z) -{ - const int data_offset = x + info->width * y + info->width * info->height * z; - const int texture_type = info->data_type; - - /* Float4 */ - if (texture_type == IMAGE_DATA_TYPE_FLOAT4) { - return tex_fetch(float4, info, data_offset); - } - /* Byte4 */ - else if (texture_type == IMAGE_DATA_TYPE_BYTE4) { - uchar4 r = tex_fetch(uchar4, info, data_offset); - float f = 1.0f / 255.0f; - return make_float4(r.x * f, r.y * f, r.z * f, r.w * f); - } - /* Ushort4 */ - else if (texture_type == IMAGE_DATA_TYPE_USHORT4) { - ushort4 r = tex_fetch(ushort4, info, data_offset); - float f = 1.0f / 65535.f; - return make_float4(r.x * f, r.y * f, r.z * f, r.w * f); - } - /* Float */ - else if (texture_type == IMAGE_DATA_TYPE_FLOAT) { - float f = tex_fetch(float, info, data_offset); - return make_float4(f, f, f, 1.0f); - } - /* UShort */ - else if (texture_type == IMAGE_DATA_TYPE_USHORT) { - ushort r = tex_fetch(ushort, info, data_offset); - float f = r * (1.0f / 65535.0f); - return make_float4(f, f, f, 1.0f); - } -#ifdef WITH_NANOVDB - /* NanoVDB Float */ - else if (texture_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT) { - cnanovdb_coord coord; - coord.mVec[0] = x; - coord.mVec[1] = y; - coord.mVec[2] = z; - float f = 
cnanovdb_readaccessor_getValueF((cnanovdb_readaccessor *)acc, &coord); - return make_float4(f, f, f, 1.0f); - } - /* NanoVDB Float3 */ - else if (texture_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT3) { - cnanovdb_coord coord; - coord.mVec[0] = x; - coord.mVec[1] = y; - coord.mVec[2] = z; - cnanovdb_Vec3F f = cnanovdb_readaccessor_getValueF3((cnanovdb_readaccessor *)acc, &coord); - return make_float4(f.mVec[0], f.mVec[1], f.mVec[2], 1.0f); - } -#endif -#ifdef __KERNEL_CL_KHR_FP16__ - /* Half and Half4 are optional in OpenCL */ - else if (texture_type == IMAGE_DATA_TYPE_HALF) { - float f = tex_fetch(half, info, data_offset); - return make_float4(f, f, f, 1.0f); - } - else if (texture_type == IMAGE_DATA_TYPE_HALF4) { - half4 r = tex_fetch(half4, info, data_offset); - return make_float4(r.x, r.y, r.z, r.w); - } -#endif - /* Byte */ - else { - uchar r = tex_fetch(uchar, info, data_offset); - float f = r * (1.0f / 255.0f); - return make_float4(f, f, f, 1.0f); - } -} - -ccl_device_inline float4 -svm_image_texture_read_2d(KernelGlobals *kg, int id, void *acc, int x, int y) -{ - const ccl_global TextureInfo *info = kernel_tex_info(kg, id); - -#ifdef WITH_NANOVDB - if (info->data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT && - info->data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) { -#endif - /* Wrap */ - if (info->extension == EXTENSION_REPEAT) { - x = svm_image_texture_wrap_periodic(x, info->width); - y = svm_image_texture_wrap_periodic(y, info->height); - } - else { - x = svm_image_texture_wrap_clamp(x, info->width); - y = svm_image_texture_wrap_clamp(y, info->height); - } -#ifdef WITH_NANOVDB - } -#endif - - return svm_image_texture_read(kg, info, acc, x, y, 0); -} - -ccl_device_inline float4 -svm_image_texture_read_3d(KernelGlobals *kg, int id, void *acc, int x, int y, int z) -{ - const ccl_global TextureInfo *info = kernel_tex_info(kg, id); - -#ifdef WITH_NANOVDB - if (info->data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT && - info->data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) { -#endif - 
/* Wrap */ - if (info->extension == EXTENSION_REPEAT) { - x = svm_image_texture_wrap_periodic(x, info->width); - y = svm_image_texture_wrap_periodic(y, info->height); - z = svm_image_texture_wrap_periodic(z, info->depth); - } - else { - x = svm_image_texture_wrap_clamp(x, info->width); - y = svm_image_texture_wrap_clamp(y, info->height); - z = svm_image_texture_wrap_clamp(z, info->depth); - } -#ifdef WITH_NANOVDB - } -#endif - - return svm_image_texture_read(kg, info, acc, x, y, z); -} - -ccl_device_inline float svm_image_texture_frac(float x, int *ix) -{ - int i = float_to_int(x) - ((x < 0.0f) ? 1 : 0); - *ix = i; - return x - (float)i; -} - -#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \ - { \ - u[0] = (((-1.0f / 6.0f) * t + 0.5f) * t - 0.5f) * t + (1.0f / 6.0f); \ - u[1] = ((0.5f * t - 1.0f) * t) * t + (2.0f / 3.0f); \ - u[2] = ((-0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f / 6.0f); \ - u[3] = (1.0f / 6.0f) * t * t * t; \ - } \ - (void)0 - -ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y) -{ - const ccl_global TextureInfo *info = kernel_tex_info(kg, id); - - if (info->extension == EXTENSION_CLIP) { - if (x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - } - - if (info->interpolation == INTERPOLATION_CLOSEST) { - /* Closest interpolation. */ - int ix, iy; - svm_image_texture_frac(x * info->width, &ix); - svm_image_texture_frac(y * info->height, &iy); - - return svm_image_texture_read_2d(kg, id, NULL, ix, iy); - } - else if (info->interpolation == INTERPOLATION_LINEAR) { - /* Bilinear interpolation. 
*/ - int ix, iy; - float tx = svm_image_texture_frac(x * info->width - 0.5f, &ix); - float ty = svm_image_texture_frac(y * info->height - 0.5f, &iy); - - float4 r; - r = (1.0f - ty) * (1.0f - tx) * svm_image_texture_read_2d(kg, id, NULL, ix, iy); - r += (1.0f - ty) * tx * svm_image_texture_read_2d(kg, id, NULL, ix + 1, iy); - r += ty * (1.0f - tx) * svm_image_texture_read_2d(kg, id, NULL, ix, iy + 1); - r += ty * tx * svm_image_texture_read_2d(kg, id, NULL, ix + 1, iy + 1); - return r; - } - else { - /* Bicubic interpolation. */ - int ix, iy; - float tx = svm_image_texture_frac(x * info->width - 0.5f, &ix); - float ty = svm_image_texture_frac(y * info->height - 0.5f, &iy); - - float u[4], v[4]; - SET_CUBIC_SPLINE_WEIGHTS(u, tx); - SET_CUBIC_SPLINE_WEIGHTS(v, ty); - - float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f); - - for (int y = 0; y < 4; y++) { - for (int x = 0; x < 4; x++) { - float weight = u[x] * v[y]; - r += weight * svm_image_texture_read_2d(kg, id, NULL, ix + x - 1, iy + y - 1); - } - } - return r; - } -} - -ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float3 P, int interp) -{ - const ccl_global TextureInfo *info = kernel_tex_info(kg, id); - - if (info->use_transform_3d) { - Transform tfm = info->transform_3d; - P = transform_point(&tfm, P); - } - - float x = P.x; - float y = P.y; - float z = P.z; - - uint interpolation = (interp == INTERPOLATION_NONE) ? 
info->interpolation : interp; - -#ifdef WITH_NANOVDB - cnanovdb_readaccessor acc; - if (info->data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT || - info->data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT3) { - ccl_global cnanovdb_griddata *grid = - (ccl_global cnanovdb_griddata *)(kg->buffers[info->cl_buffer] + info->data); - cnanovdb_readaccessor_init(&acc, cnanovdb_treedata_rootF(cnanovdb_griddata_tree(grid))); - } - else { - if (info->extension == EXTENSION_CLIP) { - if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - } - - x *= info->width; - y *= info->height; - z *= info->depth; - } -# define NANOVDB_ACCESS_POINTER &acc -#else -# define NANOVDB_ACCESS_POINTER NULL -#endif - - if (interpolation == INTERPOLATION_CLOSEST) { - /* Closest interpolation. */ - int ix, iy, iz; - svm_image_texture_frac(x, &ix); - svm_image_texture_frac(y, &iy); - svm_image_texture_frac(z, &iz); - - return svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix, iy, iz); - } - else if (interpolation == INTERPOLATION_LINEAR) { - /* Trilinear interpolation. 
*/ - int ix, iy, iz; - float tx = svm_image_texture_frac(x - 0.5f, &ix); - float ty = svm_image_texture_frac(y - 0.5f, &iy); - float tz = svm_image_texture_frac(z - 0.5f, &iz); - - float4 r; - r = (1.0f - tz) * (1.0f - ty) * (1.0f - tx) * - svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix, iy, iz); - r += (1.0f - tz) * (1.0f - ty) * tx * - svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix + 1, iy, iz); - r += (1.0f - tz) * ty * (1.0f - tx) * - svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix, iy + 1, iz); - r += (1.0f - tz) * ty * tx * - svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix + 1, iy + 1, iz); - - r += tz * (1.0f - ty) * (1.0f - tx) * - svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix, iy, iz + 1); - r += tz * (1.0f - ty) * tx * - svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix + 1, iy, iz + 1); - r += tz * ty * (1.0f - tx) * - svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix, iy + 1, iz + 1); - r += tz * ty * tx * - svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix + 1, iy + 1, iz + 1); - return r; - } - else { - /* Tricubic interpolation. 
*/ - int ix, iy, iz; - float tx = svm_image_texture_frac(x - 0.5f, &ix); - float ty = svm_image_texture_frac(y - 0.5f, &iy); - float tz = svm_image_texture_frac(z - 0.5f, &iz); - - float u[4], v[4], w[4]; - SET_CUBIC_SPLINE_WEIGHTS(u, tx); - SET_CUBIC_SPLINE_WEIGHTS(v, ty); - SET_CUBIC_SPLINE_WEIGHTS(w, tz); - - float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f); - - for (int z = 0; z < 4; z++) { - for (int y = 0; y < 4; y++) { - for (int x = 0; x < 4; x++) { - float weight = u[x] * v[y] * w[z]; - r += weight * svm_image_texture_read_3d( - kg, id, NANOVDB_ACCESS_POINTER, ix + x - 1, iy + y - 1, iz + z - 1); - } - } - } - return r; - } -#undef NANOVDB_ACCESS_POINTER -} - -#undef SET_CUBIC_SPLINE_WEIGHTS diff --git a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl deleted file mode 100644 index 68ee6f1d536..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_queue_enqueue.h" - -#define KERNEL_NAME queue_enqueue -#define LOCALS_TYPE QueueEnqueueLocals -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME -#undef LOCALS_TYPE - diff --git a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl deleted file mode 100644 index 10d09377ba9..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_scene_intersect.h" - -#define KERNEL_NAME scene_intersect -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME - diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl deleted file mode 100644 index 40eaa561863..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_shader_eval.h" - -#define KERNEL_NAME shader_eval -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME - diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl deleted file mode 100644 index 8c36100f762..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_shader_setup.h" - -#define KERNEL_NAME shader_setup -#define LOCALS_TYPE unsigned int -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME -#undef LOCALS_TYPE - diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl deleted file mode 100644 index bcacaa4a054..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_shader_sort.h" - -__attribute__((reqd_work_group_size(64, 1, 1))) -#define KERNEL_NAME shader_sort -#define LOCALS_TYPE ShaderSortLocals -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME -#undef LOCALS_TYPE - diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl deleted file mode 100644 index 8de250a375c..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_shadow_blocked_ao.h" - -#define KERNEL_NAME shadow_blocked_ao -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME - diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl deleted file mode 100644 index 29da77022ed..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_shadow_blocked_dl.h" - -#define KERNEL_NAME shadow_blocked_dl -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME - diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl b/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl deleted file mode 100644 index c3b7b09460a..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "kernel/kernel_compat_opencl.h" // PRECOMPILED -#include "kernel/split/kernel_split_common.h" // PRECOMPILED - -#include "kernel/kernels/opencl/kernel_data_init.cl" -#include "kernel/kernels/opencl/kernel_path_init.cl" -#include "kernel/kernels/opencl/kernel_state_buffer_size.cl" -#include "kernel/kernels/opencl/kernel_scene_intersect.cl" -#include "kernel/kernels/opencl/kernel_queue_enqueue.cl" -#include "kernel/kernels/opencl/kernel_shader_setup.cl" -#include "kernel/kernels/opencl/kernel_shader_sort.cl" -#include "kernel/kernels/opencl/kernel_enqueue_inactive.cl" -#include "kernel/kernels/opencl/kernel_next_iteration_setup.cl" -#include "kernel/kernels/opencl/kernel_indirect_subsurface.cl" -#include "kernel/kernels/opencl/kernel_buffer_update.cl" -#include "kernel/kernels/opencl/kernel_adaptive_stopping.cl" -#include "kernel/kernels/opencl/kernel_adaptive_filter_x.cl" -#include "kernel/kernels/opencl/kernel_adaptive_filter_y.cl" -#include "kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl" diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split_function.h b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h deleted file mode 100644 index e123b4cd6ec..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_split_function.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#define KERNEL_NAME_JOIN(a, b) a##_##b -#define KERNEL_NAME_EVAL(a, b) KERNEL_NAME_JOIN(a, b) - -__kernel void KERNEL_NAME_EVAL(kernel_ocl_path_trace, - KERNEL_NAME)(ccl_global char *kg_global, - ccl_constant KernelData *data, - - ccl_global void *split_data_buffer, - ccl_global char *ray_state, - - KERNEL_BUFFER_PARAMS, - - ccl_global int *queue_index, - ccl_global char *use_queues_flag, - ccl_global unsigned int *work_pools, - ccl_global float *buffer) -{ -#ifdef LOCALS_TYPE - ccl_local LOCALS_TYPE locals; -#endif - - KernelGlobals *kg = (KernelGlobals *)kg_global; - - if (ccl_local_id(0) + ccl_local_id(1) == 0) { - kg->data = data; - - kernel_split_params.queue_index = queue_index; - kernel_split_params.use_queues_flag = use_queues_flag; - kernel_split_params.work_pools = work_pools; - kernel_split_params.tile.buffer = buffer; - - split_data_init(kg, - &kernel_split_state, - ccl_global_size(0) * ccl_global_size(1), - split_data_buffer, - ray_state); - } - - kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); - - KERNEL_NAME_EVAL(kernel, KERNEL_NAME) - (kg -#ifdef LOCALS_TYPE - , - &locals -#endif - ); -} - -#undef KERNEL_NAME_JOIN -#undef KERNEL_NAME_EVAL diff --git a/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl deleted file mode 100644 index 2b3be38df84..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache 
License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "kernel/kernel_compat_opencl.h" -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_subsurface_scatter.h" - -#define KERNEL_NAME subsurface_scatter -#include "kernel/kernels/opencl/kernel_split_function.h" -#undef KERNEL_NAME - diff --git a/intern/cycles/kernel/osl/background.cpp b/intern/cycles/kernel/osl/background.cpp index 3f9de5ab33d..8e497986dcc 100644 --- a/intern/cycles/kernel/osl/background.cpp +++ b/intern/cycles/kernel/osl/background.cpp @@ -37,7 +37,7 @@ #include "kernel/osl/osl_closures.h" // clang-format off -#include "kernel/kernel_compat_cpu.h" +#include "kernel/device/cpu/compat.h" #include "kernel/closure/alloc.h" #include "kernel/closure/emissive.h" // clang-format on diff --git a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp index 76a2e41abfa..a2f9d3f759a 100644 --- a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp +++ b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp @@ -34,7 +34,7 @@ #include <OSL/genclosure.h> -#include "kernel/kernel_compat_cpu.h" +#include "kernel/device/cpu/compat.h" #include "kernel/osl/osl_closures.h" // clang-format off diff --git a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp index b78dc8a3a67..812c3b6e71b 100644 --- a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp +++ b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp @@ -34,7 +34,7 @@ #include <OSL/genclosure.h> 
-#include "kernel/kernel_compat_cpu.h" +#include "kernel/device/cpu/compat.h" #include "kernel/osl/osl_closures.h" // clang-format off diff --git a/intern/cycles/kernel/osl/emissive.cpp b/intern/cycles/kernel/osl/emissive.cpp index d656723bac2..80dfbee879e 100644 --- a/intern/cycles/kernel/osl/emissive.cpp +++ b/intern/cycles/kernel/osl/emissive.cpp @@ -37,7 +37,7 @@ #include "kernel/osl/osl_closures.h" // clang-format off -#include "kernel/kernel_compat_cpu.h" +#include "kernel/device/cpu/compat.h" #include "kernel/kernel_types.h" #include "kernel/closure/alloc.h" #include "kernel/closure/emissive.h" diff --git a/intern/cycles/kernel/osl/osl_bssrdf.cpp b/intern/cycles/kernel/osl/osl_bssrdf.cpp index c5ca8616fbd..5d968ed85e0 100644 --- a/intern/cycles/kernel/osl/osl_bssrdf.cpp +++ b/intern/cycles/kernel/osl/osl_bssrdf.cpp @@ -32,7 +32,7 @@ #include <OSL/genclosure.h> -#include "kernel/kernel_compat_cpu.h" +#include "kernel/device/cpu/compat.h" #include "kernel/osl/osl_closures.h" // clang-format off @@ -50,45 +50,30 @@ CCL_NAMESPACE_BEGIN using namespace OSL; -static ustring u_cubic("cubic"); -static ustring u_gaussian("gaussian"); -static ustring u_burley("burley"); -static ustring u_principled("principled"); +static ustring u_random_walk_fixed_radius("random_walk_fixed_radius"); static ustring u_random_walk("random_walk"); -static ustring u_principled_random_walk("principled_random_walk"); class CBSSRDFClosure : public CClosurePrimitive { public: Bssrdf params; + float ior; ustring method; CBSSRDFClosure() { - params.texture_blur = 0.0f; - params.sharpness = 0.0f; - params.roughness = 0.0f; + params.roughness = FLT_MAX; + params.anisotropy = 1.0f; + ior = 1.4f; } void setup(ShaderData *sd, int path_flag, float3 weight) { - if (method == u_cubic) { - alloc(sd, path_flag, weight, CLOSURE_BSSRDF_CUBIC_ID); - } - else if (method == u_gaussian) { - alloc(sd, path_flag, weight, CLOSURE_BSSRDF_GAUSSIAN_ID); - } - else if (method == u_burley) { - alloc(sd, path_flag, 
weight, CLOSURE_BSSRDF_BURLEY_ID); - } - else if (method == u_principled) { - alloc(sd, path_flag, weight, CLOSURE_BSSRDF_PRINCIPLED_ID); + if (method == u_random_walk_fixed_radius) { + alloc(sd, path_flag, weight, CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID); } else if (method == u_random_walk) { alloc(sd, path_flag, weight, CLOSURE_BSSRDF_RANDOM_WALK_ID); } - else if (method == u_principled_random_walk) { - alloc(sd, path_flag, weight, CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID); - } } void alloc(ShaderData *sd, int path_flag, float3 weight, ClosureType type) @@ -106,11 +91,10 @@ class CBSSRDFClosure : public CClosurePrimitive { /* create one closure per color channel */ bssrdf->radius = params.radius; bssrdf->albedo = params.albedo; - bssrdf->texture_blur = params.texture_blur; - bssrdf->sharpness = params.sharpness; bssrdf->N = params.N; bssrdf->roughness = params.roughness; - sd->flag |= bssrdf_setup(sd, bssrdf, (ClosureType)type); + bssrdf->anisotropy = clamp(params.anisotropy, 0.0f, 0.9f); + sd->flag |= bssrdf_setup(sd, bssrdf, (ClosureType)type, clamp(ior, 1.01f, 3.8f)); } } }; @@ -122,9 +106,9 @@ ClosureParam *closure_bssrdf_params() CLOSURE_FLOAT3_PARAM(CBSSRDFClosure, params.N), CLOSURE_FLOAT3_PARAM(CBSSRDFClosure, params.radius), CLOSURE_FLOAT3_PARAM(CBSSRDFClosure, params.albedo), - CLOSURE_FLOAT_KEYPARAM(CBSSRDFClosure, params.texture_blur, "texture_blur"), - CLOSURE_FLOAT_KEYPARAM(CBSSRDFClosure, params.sharpness, "sharpness"), CLOSURE_FLOAT_KEYPARAM(CBSSRDFClosure, params.roughness, "roughness"), + CLOSURE_FLOAT_KEYPARAM(CBSSRDFClosure, ior, "ior"), + CLOSURE_FLOAT_KEYPARAM(CBSSRDFClosure, params.anisotropy, "anisotropy"), CLOSURE_STRING_KEYPARAM(CBSSRDFClosure, label, "label"), CLOSURE_FINISH_PARAM(CBSSRDFClosure)}; return params; diff --git a/intern/cycles/kernel/osl/osl_closures.cpp b/intern/cycles/kernel/osl/osl_closures.cpp index 7ee467a46dd..e814fcca246 100644 --- a/intern/cycles/kernel/osl/osl_closures.cpp +++ 
b/intern/cycles/kernel/osl/osl_closures.cpp @@ -40,10 +40,10 @@ #include "util/util_param.h" // clang-format off +#include "kernel/device/cpu/compat.h" +#include "kernel/device/cpu/globals.h" + #include "kernel/kernel_types.h" -#include "kernel/kernel_compat_cpu.h" -#include "kernel/split/kernel_split_data_types.h" -#include "kernel/kernel_globals.h" #include "kernel/kernel_montecarlo.h" #include "kernel/kernel_random.h" @@ -500,7 +500,7 @@ bool CBSDFClosure::skip(const ShaderData *sd, int path_flag, int scattering) { /* caustic options */ if ((scattering & LABEL_GLOSSY) && (path_flag & PATH_RAY_DIFFUSE)) { - KernelGlobals *kg = sd->osl_globals; + const KernelGlobals *kg = sd->osl_globals; if ((!kernel_data.integrator.caustics_reflective && (scattering & LABEL_REFLECT)) || (!kernel_data.integrator.caustics_refractive && (scattering & LABEL_TRANSMIT))) { diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp index 2b7c21d0bc4..396f42080e4 100644 --- a/intern/cycles/kernel/osl/osl_services.cpp +++ b/intern/cycles/kernel/osl/osl_services.cpp @@ -40,22 +40,22 @@ #include "util/util_string.h" // clang-format off -#include "kernel/kernel_compat_cpu.h" -#include "kernel/split/kernel_split_data_types.h" -#include "kernel/kernel_globals.h" -#include "kernel/kernel_color.h" -#include "kernel/kernel_random.h" -#include "kernel/kernel_write_passes.h" -#include "kernel/kernel_projection.h" +#include "kernel/device/cpu/compat.h" +#include "kernel/device/cpu/globals.h" +#include "kernel/device/cpu/image.h" + #include "kernel/kernel_differential.h" -#include "kernel/kernel_montecarlo.h" -#include "kernel/kernel_camera.h" -#include "kernel/kernels/cpu/kernel_cpu_image.h" + +#include "kernel/integrator/integrator_state.h" +#include "kernel/integrator/integrator_state_flow.h" + #include "kernel/geom/geom.h" #include "kernel/bvh/bvh.h" +#include "kernel/kernel_color.h" +#include "kernel/kernel_camera.h" +#include "kernel/kernel_path_state.h" 
#include "kernel/kernel_projection.h" -#include "kernel/kernel_accumulate.h" #include "kernel/kernel_shader.h" // clang-format on @@ -147,7 +147,7 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, * a concept of shader space, so we just use object space for both. */ if (xform) { const ShaderData *sd = (const ShaderData *)xform; - KernelGlobals *kg = sd->osl_globals; + const KernelGlobals *kg = sd->osl_globals; int object = sd->object; if (object != OBJECT_NONE) { @@ -155,18 +155,19 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, Transform tfm; if (time == sd->time) - tfm = sd->ob_tfm; + tfm = object_get_transform(kg, sd); else tfm = object_fetch_transform_motion_test(kg, object, time, NULL); #else - Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM); + const Transform tfm = object_get_transform(kg, sd); #endif copy_matrix(result, tfm); return true; } else if (sd->type == PRIMITIVE_LAMP) { - copy_matrix(result, sd->ob_tfm); + const Transform tfm = lamp_fetch_transform(kg, sd->lamp, false); + copy_matrix(result, tfm); return true; } @@ -184,7 +185,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, * a concept of shader space, so we just use object space for both. 
*/ if (xform) { const ShaderData *sd = (const ShaderData *)xform; - KernelGlobals *kg = sd->osl_globals; + const KernelGlobals *kg = sd->osl_globals; int object = sd->object; if (object != OBJECT_NONE) { @@ -192,18 +193,19 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, Transform itfm; if (time == sd->time) - itfm = sd->ob_itfm; + itfm = object_get_inverse_transform(kg, sd); else object_fetch_transform_motion_test(kg, object, time, &itfm); #else - Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); + const Transform itfm = object_get_inverse_transform(kg, sd); #endif copy_matrix(result, itfm); return true; } else if (sd->type == PRIMITIVE_LAMP) { - copy_matrix(result, sd->ob_itfm); + const Transform itfm = lamp_fetch_transform(kg, sd->lamp, true); + copy_matrix(result, itfm); return true; } @@ -218,7 +220,7 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, float time) { ShaderData *sd = (ShaderData *)(sg->renderstate); - KernelGlobals *kg = sd->osl_globals; + const KernelGlobals *kg = sd->osl_globals; if (from == u_ndc) { copy_matrix(result, kernel_data.cam.ndctoworld); @@ -250,7 +252,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, float time) { ShaderData *sd = (ShaderData *)(sg->renderstate); - KernelGlobals *kg = sd->osl_globals; + const KernelGlobals *kg = sd->osl_globals; if (to == u_ndc) { copy_matrix(result, kernel_data.cam.worldtondc); @@ -284,21 +286,18 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, * a concept of shader space, so we just use object space for both. 
*/ if (xform) { const ShaderData *sd = (const ShaderData *)xform; + const KernelGlobals *kg = sd->osl_globals; int object = sd->object; if (object != OBJECT_NONE) { -#ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_tfm; -#else - KernelGlobals *kg = sd->osl_globals; - Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM); -#endif + const Transform tfm = object_get_transform(kg, sd); copy_matrix(result, tfm); return true; } else if (sd->type == PRIMITIVE_LAMP) { - copy_matrix(result, sd->ob_tfm); + const Transform tfm = lamp_fetch_transform(kg, sd->lamp, false); + copy_matrix(result, tfm); return true; } @@ -315,21 +314,18 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, * a concept of shader space, so we just use object space for both. */ if (xform) { const ShaderData *sd = (const ShaderData *)xform; + const KernelGlobals *kg = sd->osl_globals; int object = sd->object; if (object != OBJECT_NONE) { -#ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_itfm; -#else - KernelGlobals *kg = sd->osl_globals; - Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); -#endif + const Transform tfm = object_get_inverse_transform(kg, sd); copy_matrix(result, tfm); return true; } else if (sd->type == PRIMITIVE_LAMP) { - copy_matrix(result, sd->ob_itfm); + const Transform itfm = lamp_fetch_transform(kg, sd->lamp, true); + copy_matrix(result, itfm); return true; } @@ -341,7 +337,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring from) { ShaderData *sd = (ShaderData *)(sg->renderstate); - KernelGlobals *kg = sd->osl_globals; + const KernelGlobals *kg = sd->osl_globals; if (from == u_ndc) { copy_matrix(result, kernel_data.cam.ndctoworld); @@ -368,7 +364,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, ustring to) { ShaderData *sd = (ShaderData *)(sg->renderstate); - KernelGlobals *kg = 
sd->osl_globals; + const KernelGlobals *kg = sd->osl_globals; if (to == u_ndc) { copy_matrix(result, kernel_data.cam.worldtondc); @@ -747,7 +743,7 @@ static bool set_attribute_matrix(const Transform &tfm, TypeDesc type, void *val) return false; } -static bool get_primitive_attribute(KernelGlobals *kg, +static bool get_primitive_attribute(const KernelGlobals *kg, const ShaderData *sd, const OSLGlobals::Attribute &attr, const TypeDesc &type, @@ -808,7 +804,7 @@ static bool get_primitive_attribute(KernelGlobals *kg, } } -static bool get_mesh_attribute(KernelGlobals *kg, +static bool get_mesh_attribute(const KernelGlobals *kg, const ShaderData *sd, const OSLGlobals::Attribute &attr, const TypeDesc &type, @@ -857,8 +853,12 @@ static bool get_object_attribute(const OSLGlobals::Attribute &attr, } } -bool OSLRenderServices::get_object_standard_attribute( - KernelGlobals *kg, ShaderData *sd, ustring name, TypeDesc type, bool derivatives, void *val) +bool OSLRenderServices::get_object_standard_attribute(const KernelGlobals *kg, + ShaderData *sd, + ustring name, + TypeDesc type, + bool derivatives, + void *val) { /* todo: turn this into hash table? 
*/ @@ -988,8 +988,12 @@ bool OSLRenderServices::get_object_standard_attribute( return false; } -bool OSLRenderServices::get_background_attribute( - KernelGlobals *kg, ShaderData *sd, ustring name, TypeDesc type, bool derivatives, void *val) +bool OSLRenderServices::get_background_attribute(const KernelGlobals *kg, + ShaderData *sd, + ustring name, + TypeDesc type, + bool derivatives, + void *val) { if (name == u_path_ray_length) { /* Ray Length */ @@ -998,38 +1002,32 @@ bool OSLRenderServices::get_background_attribute( } else if (name == u_path_ray_depth) { /* Ray Depth */ - PathState *state = sd->osl_path_state; - int f = state->bounce; + const IntegratorStateCPU *state = sd->osl_path_state; + int f = state->path.bounce; return set_attribute_int(f, type, derivatives, val); } else if (name == u_path_diffuse_depth) { /* Diffuse Ray Depth */ - PathState *state = sd->osl_path_state; - int f = state->diffuse_bounce; + const IntegratorStateCPU *state = sd->osl_path_state; + int f = state->path.diffuse_bounce; return set_attribute_int(f, type, derivatives, val); } else if (name == u_path_glossy_depth) { /* Glossy Ray Depth */ - PathState *state = sd->osl_path_state; - int f = state->glossy_bounce; + const IntegratorStateCPU *state = sd->osl_path_state; + int f = state->path.glossy_bounce; return set_attribute_int(f, type, derivatives, val); } else if (name == u_path_transmission_depth) { /* Transmission Ray Depth */ - PathState *state = sd->osl_path_state; - int f = state->transmission_bounce; + const IntegratorStateCPU *state = sd->osl_path_state; + int f = state->path.transmission_bounce; return set_attribute_int(f, type, derivatives, val); } else if (name == u_path_transparent_depth) { /* Transparent Ray Depth */ - PathState *state = sd->osl_path_state; - int f = state->transparent_bounce; - return set_attribute_int(f, type, derivatives, val); - } - else if (name == u_path_transmission_depth) { - /* Transmission Ray Depth */ - PathState *state = sd->osl_path_state; - 
int f = state->transmission_bounce; + const IntegratorStateCPU *state = sd->osl_path_state; + int f = state->path.transparent_bounce; return set_attribute_int(f, type, derivatives, val); } else if (name == u_ndc) { @@ -1043,8 +1041,10 @@ bool OSLRenderServices::get_background_attribute( ndc[0] = camera_world_to_ndc(kg, sd, sd->ray_P); if (derivatives) { - ndc[1] = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dx) - ndc[0]; - ndc[2] = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dy) - ndc[0]; + ndc[1] = camera_world_to_ndc(kg, sd, sd->ray_P + make_float3(sd->ray_dP, 0.0f, 0.0f)) - + ndc[0]; + ndc[2] = camera_world_to_ndc(kg, sd, sd->ray_P + make_float3(0.0f, sd->ray_dP, 0.0f)) - + ndc[0]; } } else { @@ -1079,7 +1079,7 @@ bool OSLRenderServices::get_attribute(OSL::ShaderGlobals *sg, bool OSLRenderServices::get_attribute( ShaderData *sd, bool derivatives, ustring object_name, TypeDesc type, ustring name, void *val) { - KernelGlobals *kg = sd->osl_globals; + const KernelGlobals *kg = sd->osl_globals; int prim_type = 0; int object; @@ -1208,17 +1208,17 @@ bool OSLRenderServices::texture(ustring filename, OSLTextureHandle *handle = (OSLTextureHandle *)texture_handle; OSLTextureHandle::Type texture_type = (handle) ? handle->type : OSLTextureHandle::OIIO; ShaderData *sd = (ShaderData *)(sg->renderstate); - KernelGlobals *kernel_globals = sd->osl_globals; + const KernelGlobals *kernel_globals = sd->osl_globals; bool status = false; switch (texture_type) { case OSLTextureHandle::BEVEL: { /* Bevel shader hack. 
*/ if (nchannels >= 3) { - PathState *state = sd->osl_path_state; + const IntegratorStateCPU *state = sd->osl_path_state; int num_samples = (int)s; float radius = t; - float3 N = svm_bevel(kernel_globals, sd, state, radius, num_samples); + float3 N = svm_bevel(kernel_globals, state, sd, radius, num_samples); result[0] = N.x; result[1] = N.y; result[2] = N.z; @@ -1228,7 +1228,7 @@ bool OSLRenderServices::texture(ustring filename, } case OSLTextureHandle::AO: { /* AO shader hack. */ - PathState *state = sd->osl_path_state; + const IntegratorStateCPU *state = sd->osl_path_state; int num_samples = (int)s; float radius = t; float3 N = make_float3(dsdx, dtdx, dsdy); @@ -1242,7 +1242,7 @@ bool OSLRenderServices::texture(ustring filename, if ((int)options.tblur) { flags |= NODE_AO_GLOBAL_RADIUS; } - result[0] = svm_ao(kernel_globals, sd, N, state, radius, num_samples, flags); + result[0] = svm_ao(kernel_globals, state, sd, N, radius, num_samples, flags); status = true; break; } @@ -1355,7 +1355,7 @@ bool OSLRenderServices::texture3d(ustring filename, case OSLTextureHandle::SVM: { /* Packed texture. 
*/ ShaderData *sd = (ShaderData *)(sg->renderstate); - KernelGlobals *kernel_globals = sd->osl_globals; + const KernelGlobals *kernel_globals = sd->osl_globals; int slot = handle->svm_slot; float3 P_float3 = make_float3(P.x, P.y, P.z); float4 rgba = kernel_tex_image_interp_3d(kernel_globals, slot, P_float3, INTERPOLATION_NONE); @@ -1377,7 +1377,7 @@ bool OSLRenderServices::texture3d(ustring filename, if (handle && handle->oiio_handle) { if (texture_thread_info == NULL) { ShaderData *sd = (ShaderData *)(sg->renderstate); - KernelGlobals *kernel_globals = sd->osl_globals; + const KernelGlobals *kernel_globals = sd->osl_globals; OSLThreadData *tdata = kernel_globals->osl_tdata; texture_thread_info = tdata->oiio_thread_info; } @@ -1462,7 +1462,7 @@ bool OSLRenderServices::environment(ustring filename, if (handle && handle->oiio_handle) { if (thread_info == NULL) { ShaderData *sd = (ShaderData *)(sg->renderstate); - KernelGlobals *kernel_globals = sd->osl_globals; + const KernelGlobals *kernel_globals = sd->osl_globals; OSLThreadData *tdata = kernel_globals->osl_tdata; thread_info = tdata->oiio_thread_info; } @@ -1600,10 +1600,14 @@ bool OSLRenderServices::trace(TraceOpt &options, } /* ray differentials */ - ray.dP.dx = TO_FLOAT3(dPdx); - ray.dP.dy = TO_FLOAT3(dPdy); - ray.dD.dx = TO_FLOAT3(dRdx); - ray.dD.dy = TO_FLOAT3(dRdy); + differential3 dP; + dP.dx = TO_FLOAT3(dPdx); + dP.dy = TO_FLOAT3(dPdy); + ray.dP = differential_make_compact(dP); + differential3 dD; + dD.dx = TO_FLOAT3(dRdx); + dD.dy = TO_FLOAT3(dRdy); + ray.dD = differential_make_compact(dD); /* allocate trace data */ OSLTraceData *tracedata = (OSLTraceData *)sg->tracedata; @@ -1613,7 +1617,7 @@ bool OSLRenderServices::trace(TraceOpt &options, tracedata->hit = false; tracedata->sd.osl_globals = sd->osl_globals; - KernelGlobals *kg = sd->osl_globals; + const KernelGlobals *kg = sd->osl_globals; /* Can't raytrace from shaders like displacement, before BVH exists. 
*/ if (kernel_data.bvh.bvh_layout == BVH_LAYOUT_NONE) { @@ -1646,11 +1650,11 @@ bool OSLRenderServices::getmessage(OSL::ShaderGlobals *sg, } else { ShaderData *sd = &tracedata->sd; - KernelGlobals *kg = sd->osl_globals; + const KernelGlobals *kg = sd->osl_globals; if (!tracedata->setup) { /* lazy shader data setup */ - shader_setup_from_ray(kg, sd, &tracedata->isect, &tracedata->ray); + shader_setup_from_ray(kg, sd, &tracedata->ray, &tracedata->isect); tracedata->setup = true; } diff --git a/intern/cycles/kernel/osl/osl_services.h b/intern/cycles/kernel/osl/osl_services.h index 891b9172dd4..58accb46e7d 100644 --- a/intern/cycles/kernel/osl/osl_services.h +++ b/intern/cycles/kernel/osl/osl_services.h @@ -250,10 +250,18 @@ class OSLRenderServices : public OSL::RendererServices { void *data) override; #endif - static bool get_background_attribute( - KernelGlobals *kg, ShaderData *sd, ustring name, TypeDesc type, bool derivatives, void *val); - static bool get_object_standard_attribute( - KernelGlobals *kg, ShaderData *sd, ustring name, TypeDesc type, bool derivatives, void *val); + static bool get_background_attribute(const KernelGlobals *kg, + ShaderData *sd, + ustring name, + TypeDesc type, + bool derivatives, + void *val); + static bool get_object_standard_attribute(const KernelGlobals *kg, + ShaderData *sd, + ustring name, + TypeDesc type, + bool derivatives, + void *val); static ustring u_distance; static ustring u_index; diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp index 389c854c495..880ef635c76 100644 --- a/intern/cycles/kernel/osl/osl_shader.cpp +++ b/intern/cycles/kernel/osl/osl_shader.cpp @@ -17,14 +17,16 @@ #include <OSL/oslexec.h> // clang-format off -#include "kernel/kernel_compat_cpu.h" +#include "kernel/device/cpu/compat.h" +#include "kernel/device/cpu/globals.h" + #include "kernel/kernel_montecarlo.h" #include "kernel/kernel_types.h" -#include "kernel/split/kernel_split_data_types.h" -#include 
"kernel/kernel_globals.h" #include "kernel/geom/geom_object.h" +#include "kernel/integrator/integrator_state.h" + #include "kernel/osl/osl_closures.h" #include "kernel/osl/osl_globals.h" #include "kernel/osl/osl_services.h" @@ -39,9 +41,7 @@ CCL_NAMESPACE_BEGIN /* Threads */ -void OSLShader::thread_init(KernelGlobals *kg, - KernelGlobals *kernel_globals, - OSLGlobals *osl_globals) +void OSLShader::thread_init(KernelGlobals *kg, OSLGlobals *osl_globals) { /* no osl used? */ if (!osl_globals->use) { @@ -87,8 +87,11 @@ void OSLShader::thread_free(KernelGlobals *kg) /* Globals */ -static void shaderdata_to_shaderglobals( - KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, OSLThreadData *tdata) +static void shaderdata_to_shaderglobals(const KernelGlobals *kg, + ShaderData *sd, + const IntegratorStateCPU *state, + int path_flag, + OSLThreadData *tdata) { OSL::ShaderGlobals *globals = &tdata->globals; @@ -171,7 +174,10 @@ static void flatten_surface_closure_tree(ShaderData *sd, } } -void OSLShader::eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag) +void OSLShader::eval_surface(const KernelGlobals *kg, + const IntegratorStateCPU *state, + ShaderData *sd, + int path_flag) { /* setup shader globals from shader data */ OSLThreadData *tdata = kg->osl_tdata; @@ -276,7 +282,10 @@ static void flatten_background_closure_tree(ShaderData *sd, } } -void OSLShader::eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag) +void OSLShader::eval_background(const KernelGlobals *kg, + const IntegratorStateCPU *state, + ShaderData *sd, + int path_flag) { /* setup shader globals from shader data */ OSLThreadData *tdata = kg->osl_tdata; @@ -331,7 +340,10 @@ static void flatten_volume_closure_tree(ShaderData *sd, } } -void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag) +void OSLShader::eval_volume(const KernelGlobals *kg, + const IntegratorStateCPU *state, + ShaderData *sd, + 
int path_flag) { /* setup shader globals from shader data */ OSLThreadData *tdata = kg->osl_tdata; @@ -354,7 +366,9 @@ void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, /* Displacement */ -void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd, PathState *state) +void OSLShader::eval_displacement(const KernelGlobals *kg, + const IntegratorStateCPU *state, + ShaderData *sd) { /* setup shader globals from shader data */ OSLThreadData *tdata = kg->osl_tdata; @@ -377,7 +391,7 @@ void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd, PathState * /* Attributes */ -int OSLShader::find_attribute(KernelGlobals *kg, +int OSLShader::find_attribute(const KernelGlobals *kg, const ShaderData *sd, uint id, AttributeDescriptor *desc) diff --git a/intern/cycles/kernel/osl/osl_shader.h b/intern/cycles/kernel/osl/osl_shader.h index a4fa24d0a90..f1f17b141eb 100644 --- a/intern/cycles/kernel/osl/osl_shader.h +++ b/intern/cycles/kernel/osl/osl_shader.h @@ -37,6 +37,7 @@ class Scene; struct ShaderClosure; struct ShaderData; +struct IntegratorStateCPU; struct differential3; struct KernelGlobals; @@ -49,19 +50,28 @@ class OSLShader { static void register_closures(OSLShadingSystem *ss); /* per thread data */ - static void thread_init(KernelGlobals *kg, - KernelGlobals *kernel_globals, - OSLGlobals *osl_globals); + static void thread_init(KernelGlobals *kg, OSLGlobals *osl_globals); static void thread_free(KernelGlobals *kg); /* eval */ - static void eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag); - static void eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag); - static void eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag); - static void eval_displacement(KernelGlobals *kg, ShaderData *sd, PathState *state); + static void eval_surface(const KernelGlobals *kg, + const IntegratorStateCPU *state, + ShaderData *sd, + int path_flag); + static 
void eval_background(const KernelGlobals *kg, + const IntegratorStateCPU *state, + ShaderData *sd, + int path_flag); + static void eval_volume(const KernelGlobals *kg, + const IntegratorStateCPU *state, + ShaderData *sd, + int path_flag); + static void eval_displacement(const KernelGlobals *kg, + const IntegratorStateCPU *state, + ShaderData *sd); /* attributes */ - static int find_attribute(KernelGlobals *kg, + static int find_attribute(const KernelGlobals *kg, const ShaderData *sd, uint id, AttributeDescriptor *desc); diff --git a/intern/cycles/kernel/shaders/node_principled_bsdf.osl b/intern/cycles/kernel/shaders/node_principled_bsdf.osl index 23949f406c7..55afb892d36 100644 --- a/intern/cycles/kernel/shaders/node_principled_bsdf.osl +++ b/intern/cycles/kernel/shaders/node_principled_bsdf.osl @@ -18,11 +18,13 @@ #include "stdcycles.h" shader node_principled_bsdf(string distribution = "Multiscatter GGX", - string subsurface_method = "burley", + string subsurface_method = "random_walk", color BaseColor = color(0.8, 0.8, 0.8), float Subsurface = 0.0, vector SubsurfaceRadius = vector(1.0, 1.0, 1.0), color SubsurfaceColor = color(0.7, 0.1, 0.1), + float SubsurfaceIOR = 1.4, + float SubsurfaceAnisotropy = 0.0, float Metallic = 0.0, float Specular = 0.5, float SpecularTint = 0.0, @@ -59,22 +61,17 @@ shader node_principled_bsdf(string distribution = "Multiscatter GGX", if (diffuse_weight > 1e-5) { if (Subsurface > 1e-5) { color mixed_ss_base_color = SubsurfaceColor * Subsurface + BaseColor * (1.0 - Subsurface); - if (subsurface_method == "burley") { - BSDF = mixed_ss_base_color * bssrdf("principled", - Normal, - Subsurface * SubsurfaceRadius, - SubsurfaceColor, - "roughness", - Roughness); - } - else { - BSDF = mixed_ss_base_color * bssrdf("principled_random_walk", - Normal, - Subsurface * SubsurfaceRadius, - mixed_ss_base_color, - "roughness", - Roughness); - } + + BSDF = mixed_ss_base_color * bssrdf(subsurface_method, + Normal, + Subsurface * SubsurfaceRadius, + 
mixed_ss_base_color, + "roughness", + Roughness, + "ior", + SubsurfaceIOR, + "anisotropy", + SubsurfaceAnisotropy); } else { BSDF = BaseColor * principled_diffuse(Normal, Roughness); diff --git a/intern/cycles/kernel/shaders/node_subsurface_scattering.osl b/intern/cycles/kernel/shaders/node_subsurface_scattering.osl index b1e854150ab..f55e38c54ff 100644 --- a/intern/cycles/kernel/shaders/node_subsurface_scattering.osl +++ b/intern/cycles/kernel/shaders/node_subsurface_scattering.osl @@ -19,27 +19,12 @@ shader node_subsurface_scattering(color Color = 0.8, float Scale = 1.0, vector Radius = vector(0.1, 0.1, 0.1), - float TextureBlur = 0.0, - float Sharpness = 0.0, - string falloff = "cubic", + float IOR = 1.4, + float Anisotropy = 0.0, + string method = "random_walk", normal Normal = N, output closure color BSSRDF = 0) { - if (falloff == "gaussian") - BSSRDF = Color * - bssrdf("gaussian", Normal, Scale * Radius, Color, "texture_blur", TextureBlur); - else if (falloff == "cubic") - BSSRDF = Color * bssrdf("cubic", - Normal, - Scale * Radius, - Color, - "texture_blur", - TextureBlur, - "sharpness", - Sharpness); - else if (falloff == "burley") - BSSRDF = Color * bssrdf("burley", Normal, Scale * Radius, Color, "texture_blur", TextureBlur); - else - BSSRDF = Color * - bssrdf("random_walk", Normal, Scale * Radius, Color, "texture_blur", TextureBlur); + BSSRDF = Color * + bssrdf(method, Normal, Scale * Radius, Color, "ior", IOR, "anisotropy", Anisotropy); } diff --git a/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h b/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h deleted file mode 100644 index 437a5c9581b..00000000000 --- a/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright 2019 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -ccl_device void kernel_adaptive_adjust_samples(KernelGlobals *kg) -{ - int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - if (pixel_index < kernel_split_params.tile.w * kernel_split_params.tile.h) { - int x = kernel_split_params.tile.x + pixel_index % kernel_split_params.tile.w; - int y = kernel_split_params.tile.y + pixel_index / kernel_split_params.tile.w; - int buffer_offset = (kernel_split_params.tile.offset + x + - y * kernel_split_params.tile.stride) * - kernel_data.film.pass_stride; - ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset; - int sample = kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples; - if (buffer[kernel_data.film.pass_sample_count] < 0.0f) { - buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count]; - float sample_multiplier = sample / buffer[kernel_data.film.pass_sample_count]; - if (sample_multiplier != 1.0f) { - kernel_adaptive_post_adjust(kg, buffer, sample_multiplier); - } - } - else { - kernel_adaptive_post_adjust(kg, buffer, sample / (sample - 1.0f)); - } - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_adaptive_filter_x.h b/intern/cycles/kernel/split/kernel_adaptive_filter_x.h deleted file mode 100644 index 93f41f7ced4..00000000000 --- a/intern/cycles/kernel/split/kernel_adaptive_filter_x.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright 2019 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use 
this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -ccl_device void kernel_adaptive_filter_x(KernelGlobals *kg) -{ - int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - if (pixel_index < kernel_split_params.tile.h && - kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples >= - kernel_data.integrator.adaptive_min_samples) { - int y = kernel_split_params.tile.y + pixel_index; - kernel_do_adaptive_filter_x(kg, y, &kernel_split_params.tile); - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_adaptive_filter_y.h b/intern/cycles/kernel/split/kernel_adaptive_filter_y.h deleted file mode 100644 index eca53d079ec..00000000000 --- a/intern/cycles/kernel/split/kernel_adaptive_filter_y.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright 2019 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -CCL_NAMESPACE_BEGIN - -ccl_device void kernel_adaptive_filter_y(KernelGlobals *kg) -{ - int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - if (pixel_index < kernel_split_params.tile.w && - kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples >= - kernel_data.integrator.adaptive_min_samples) { - int x = kernel_split_params.tile.x + pixel_index; - kernel_do_adaptive_filter_y(kg, x, &kernel_split_params.tile); - } -} -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_adaptive_stopping.h b/intern/cycles/kernel/split/kernel_adaptive_stopping.h deleted file mode 100644 index c8eb1ebd705..00000000000 --- a/intern/cycles/kernel/split/kernel_adaptive_stopping.h +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright 2019 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -CCL_NAMESPACE_BEGIN - -ccl_device void kernel_adaptive_stopping(KernelGlobals *kg) -{ - int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - if (pixel_index < kernel_split_params.tile.w * kernel_split_params.tile.h && - kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples >= - kernel_data.integrator.adaptive_min_samples) { - int x = kernel_split_params.tile.x + pixel_index % kernel_split_params.tile.w; - int y = kernel_split_params.tile.y + pixel_index / kernel_split_params.tile.w; - int buffer_offset = (kernel_split_params.tile.offset + x + - y * kernel_split_params.tile.stride) * - kernel_data.film.pass_stride; - ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset; - kernel_do_adaptive_stopping(kg, - buffer, - kernel_split_params.tile.start_sample + - kernel_split_params.tile.num_samples - 1); - } -} -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_branched.h b/intern/cycles/kernel/split/kernel_branched.h deleted file mode 100644 index 45f5037d321..00000000000 --- a/intern/cycles/kernel/split/kernel_branched.h +++ /dev/null @@ -1,231 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -CCL_NAMESPACE_BEGIN - -#ifdef __BRANCHED_PATH__ - -/* sets up the various state needed to do an indirect loop */ -ccl_device_inline void kernel_split_branched_path_indirect_loop_init(KernelGlobals *kg, - int ray_index) -{ - SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; - - /* save a copy of the state to restore later */ -# define BRANCHED_STORE(name) branched_state->name = kernel_split_state.name[ray_index]; - - BRANCHED_STORE(path_state); - BRANCHED_STORE(throughput); - BRANCHED_STORE(ray); - BRANCHED_STORE(isect); - BRANCHED_STORE(ray_state); - - *kernel_split_sd(branched_state_sd, ray_index) = *kernel_split_sd(sd, ray_index); - for (int i = 0; i < kernel_split_sd(branched_state_sd, ray_index)->num_closure; i++) { - kernel_split_sd(branched_state_sd, ray_index)->closure[i] = - kernel_split_sd(sd, ray_index)->closure[i]; - } - -# undef BRANCHED_STORE - - /* Set loop counters to initial position. */ - branched_state->next_closure = 0; - branched_state->next_sample = 0; -} - -/* ends an indirect loop and restores the previous state */ -ccl_device_inline void kernel_split_branched_path_indirect_loop_end(KernelGlobals *kg, - int ray_index) -{ - SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; - - /* restore state */ -# define BRANCHED_RESTORE(name) kernel_split_state.name[ray_index] = branched_state->name; - - BRANCHED_RESTORE(path_state); - BRANCHED_RESTORE(throughput); - BRANCHED_RESTORE(ray); - BRANCHED_RESTORE(isect); - BRANCHED_RESTORE(ray_state); - - *kernel_split_sd(sd, ray_index) = *kernel_split_sd(branched_state_sd, ray_index); - for (int i = 0; i < kernel_split_sd(branched_state_sd, ray_index)->num_closure; i++) { - kernel_split_sd(sd, ray_index)->closure[i] = - kernel_split_sd(branched_state_sd, ray_index)->closure[i]; - } - -# undef BRANCHED_RESTORE - - /* leave indirect loop */ - REMOVE_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT); -} - 
-ccl_device_inline bool kernel_split_branched_indirect_start_shared(KernelGlobals *kg, - int ray_index) -{ - ccl_global char *ray_state = kernel_split_state.ray_state; - - int inactive_ray = dequeue_ray_index(QUEUE_INACTIVE_RAYS, - kernel_split_state.queue_data, - kernel_split_params.queue_size, - kernel_split_params.queue_index); - - if (!IS_STATE(ray_state, inactive_ray, RAY_INACTIVE)) { - return false; - } - -# define SPLIT_DATA_ENTRY(type, name, num) \ - if (num) { \ - kernel_split_state.name[inactive_ray] = kernel_split_state.name[ray_index]; \ - } - SPLIT_DATA_ENTRIES_BRANCHED_SHARED -# undef SPLIT_DATA_ENTRY - - *kernel_split_sd(sd, inactive_ray) = *kernel_split_sd(sd, ray_index); - for (int i = 0; i < kernel_split_sd(sd, ray_index)->num_closure; i++) { - kernel_split_sd(sd, inactive_ray)->closure[i] = kernel_split_sd(sd, ray_index)->closure[i]; - } - - kernel_split_state.branched_state[inactive_ray].shared_sample_count = 0; - kernel_split_state.branched_state[inactive_ray].original_ray = ray_index; - kernel_split_state.branched_state[inactive_ray].waiting_on_shared_samples = false; - - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - PathRadiance *inactive_L = &kernel_split_state.path_radiance[inactive_ray]; - - path_radiance_init(kg, inactive_L); - path_radiance_copy_indirect(inactive_L, L); - - ray_state[inactive_ray] = RAY_REGENERATED; - ADD_RAY_FLAG(ray_state, inactive_ray, RAY_BRANCHED_INDIRECT_SHARED); - ADD_RAY_FLAG(ray_state, inactive_ray, IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)); - - atomic_fetch_and_inc_uint32( - (ccl_global uint *)&kernel_split_state.branched_state[ray_index].shared_sample_count); - - return true; -} - -/* bounce off surface and integrate indirect light */ -ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter( - KernelGlobals *kg, - int ray_index, - float num_samples_adjust, - ShaderData *saved_sd, - bool reset_path_state, - bool wait_for_shared) -{ - SplitBranchedState 
*branched_state = &kernel_split_state.branched_state[ray_index]; - - ShaderData *sd = saved_sd; - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - float3 throughput = branched_state->throughput; - ccl_global PathState *ps = &kernel_split_state.path_state[ray_index]; - - float sum_sample_weight = 0.0f; -# ifdef __DENOISING_FEATURES__ - if (ps->denoising_feature_weight > 0.0f) { - for (int i = 0; i < sd->num_closure; i++) { - const ShaderClosure *sc = &sd->closure[i]; - - /* transparency is not handled here, but in outer loop */ - if (!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) { - continue; - } - - sum_sample_weight += sc->sample_weight; - } - } - else { - sum_sample_weight = 1.0f; - } -# endif /* __DENOISING_FEATURES__ */ - - for (int i = branched_state->next_closure; i < sd->num_closure; i++) { - const ShaderClosure *sc = &sd->closure[i]; - - if (!CLOSURE_IS_BSDF(sc->type)) - continue; - /* transparency is not handled here, but in outer loop */ - if (sc->type == CLOSURE_BSDF_TRANSPARENT_ID) - continue; - - int num_samples; - - if (CLOSURE_IS_BSDF_DIFFUSE(sc->type)) - num_samples = kernel_data.integrator.diffuse_samples; - else if (CLOSURE_IS_BSDF_BSSRDF(sc->type)) - num_samples = 1; - else if (CLOSURE_IS_BSDF_GLOSSY(sc->type)) - num_samples = kernel_data.integrator.glossy_samples; - else - num_samples = kernel_data.integrator.transmission_samples; - - num_samples = ceil_to_int(num_samples_adjust * num_samples); - - float num_samples_inv = num_samples_adjust / num_samples; - - for (int j = branched_state->next_sample; j < num_samples; j++) { - if (reset_path_state) { - *ps = branched_state->path_state; - } - - ps->rng_hash = cmj_hash(branched_state->path_state.rng_hash, i); - - ccl_global float3 *tp = &kernel_split_state.throughput[ray_index]; - *tp = throughput; - - ccl_global Ray *bsdf_ray = &kernel_split_state.ray[ray_index]; - - if (!kernel_branched_path_surface_bounce( - kg, sd, sc, j, num_samples, tp, ps, &L->state, 
bsdf_ray, sum_sample_weight)) { - continue; - } - - ps->rng_hash = branched_state->path_state.rng_hash; - - /* update state for next iteration */ - branched_state->next_closure = i; - branched_state->next_sample = j + 1; - - /* start the indirect path */ - *tp *= num_samples_inv; - - if (kernel_split_branched_indirect_start_shared(kg, ray_index)) { - continue; - } - - return true; - } - - branched_state->next_sample = 0; - } - - branched_state->next_closure = sd->num_closure; - - if (wait_for_shared) { - branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0); - if (branched_state->waiting_on_shared_samples) { - return true; - } - } - - return false; -} - -#endif /* __BRANCHED_PATH__ */ - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h deleted file mode 100644 index b96feca582f..00000000000 --- a/intern/cycles/kernel/split/kernel_buffer_update.h +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -/* This kernel takes care of rays that hit the background (sceneintersect - * kernel), and for the rays of state RAY_UPDATE_BUFFER it updates the ray's - * accumulated radiance in the output buffer. This kernel also takes care of - * rays that have been determined to-be-regenerated. 
- * - * We will empty QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue in this kernel. - * - * Typically all rays that are in state RAY_HIT_BACKGROUND, RAY_UPDATE_BUFFER - * will be eventually set to RAY_TO_REGENERATE state in this kernel. - * Finally all rays of ray_state RAY_TO_REGENERATE will be regenerated and put - * in queue QUEUE_ACTIVE_AND_REGENERATED_RAYS. - * - * State of queues when this kernel is called: - * At entry, - * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays. - * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with - * RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND, RAY_TO_REGENERATE rays. - * At exit, - * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and - * RAY_REGENERATED rays. - * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty. - */ -ccl_device void kernel_buffer_update(KernelGlobals *kg, - ccl_local_param unsigned int *local_queue_atomics) -{ - if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { - *local_queue_atomics = 0; - } - ccl_barrier(CCL_LOCAL_MEM_FENCE); - - int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - if (ray_index == 0) { - /* We will empty this queue in this kernel. 
*/ - kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0; - } - char enqueue_flag = 0; - ray_index = get_ray_index(kg, - ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - kernel_split_state.queue_data, - kernel_split_params.queue_size, - 1); - - if (ray_index != QUEUE_EMPTY_SLOT) { - ccl_global char *ray_state = kernel_split_state.ray_state; - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; - ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; - bool ray_was_updated = false; - - if (IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { - ray_was_updated = true; - uint sample = state->sample; - uint buffer_offset = kernel_split_state.buffer_offset[ray_index]; - ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset; - - /* accumulate result in output buffer */ - kernel_write_result(kg, buffer, sample, L); - - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); - } - - if (kernel_data.film.cryptomatte_passes) { - /* Make sure no thread is writing to the buffers. 
*/ - ccl_barrier(CCL_LOCAL_MEM_FENCE); - if (ray_was_updated && state->sample - 1 == kernel_data.integrator.aa_samples) { - uint buffer_offset = kernel_split_state.buffer_offset[ray_index]; - ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset; - ccl_global float *cryptomatte_buffer = buffer + kernel_data.film.pass_cryptomatte; - kernel_sort_id_slots(cryptomatte_buffer, 2 * kernel_data.film.cryptomatte_depth); - } - } - - if (IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { - /* We have completed current work; So get next work */ - ccl_global uint *work_pools = kernel_split_params.work_pools; - uint total_work_size = kernel_split_params.total_work_size; - uint work_index; - - if (!get_next_work(kg, work_pools, total_work_size, ray_index, &work_index)) { - /* If work is invalid, this means no more work is available and the thread may exit */ - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE); - } - - if (IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { - ccl_global WorkTile *tile = &kernel_split_params.tile; - uint x, y, sample; - get_work_pixel(tile, work_index, &x, &y, &sample); - - /* Store buffer offset for writing to passes. */ - uint buffer_offset = (tile->offset + x + y * tile->stride) * kernel_data.film.pass_stride; - kernel_split_state.buffer_offset[ray_index] = buffer_offset; - - /* Initialize random numbers and ray. */ - uint rng_hash; - kernel_path_trace_setup(kg, sample, x, y, &rng_hash, ray); - - if (ray->t != 0.0f) { - /* Initialize throughput, path radiance, Ray, PathState; - * These rays proceed with path-iteration. 
- */ - *throughput = make_float3(1.0f, 1.0f, 1.0f); - path_radiance_init(kg, L); - path_state_init(kg, - AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]), - state, - rng_hash, - sample, - ray); -#ifdef __SUBSURFACE__ - kernel_path_subsurface_init_indirect(&kernel_split_state.ss_rays[ray_index]); -#endif - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); - enqueue_flag = 1; - } - else { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); - } - } - } - } - - /* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS; - * These rays will be made active during next SceneIntersectkernel. - */ - enqueue_ray_index_local(ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - enqueue_flag, - kernel_split_params.queue_size, - local_queue_atomics, - kernel_split_state.queue_data, - kernel_split_params.queue_index); -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h deleted file mode 100644 index 2f83a10316d..00000000000 --- a/intern/cycles/kernel/split/kernel_data_init.h +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -/* This kernel Initializes structures needed in path-iteration kernels. 
- * - * Note on Queues: - * All slots in queues are initialized to queue empty slot; - * The number of elements in the queues is initialized to 0; - */ - -#ifndef __KERNEL_CPU__ -ccl_device void kernel_data_init( -#else -void KERNEL_FUNCTION_FULL_NAME(data_init)( -#endif - KernelGlobals *kg, - ccl_constant KernelData *data, - ccl_global void *split_data_buffer, - int num_elements, - ccl_global char *ray_state, - -#ifdef __KERNEL_OPENCL__ - KERNEL_BUFFER_PARAMS, -#endif - - int start_sample, - int end_sample, - int sx, - int sy, - int sw, - int sh, - int offset, - int stride, - ccl_global int *Queue_index, /* Tracks the number of elements in queues */ - int queuesize, /* size (capacity) of the queue */ - ccl_global char *use_queues_flag, /* flag to decide if scene-intersect kernel should use queues - to fetch ray index */ - ccl_global unsigned int *work_pools, /* Work pool for each work group */ - unsigned int num_samples, - ccl_global float *buffer) -{ -#ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, data_init); -#else - -# ifdef __KERNEL_OPENCL__ - kg->data = data; -# endif - - kernel_split_params.tile.x = sx; - kernel_split_params.tile.y = sy; - kernel_split_params.tile.w = sw; - kernel_split_params.tile.h = sh; - - kernel_split_params.tile.start_sample = start_sample; - kernel_split_params.tile.num_samples = num_samples; - - kernel_split_params.tile.offset = offset; - kernel_split_params.tile.stride = stride; - - kernel_split_params.tile.buffer = buffer; - - kernel_split_params.total_work_size = sw * sh * num_samples; - - kernel_split_params.work_pools = work_pools; - - kernel_split_params.queue_index = Queue_index; - kernel_split_params.queue_size = queuesize; - kernel_split_params.use_queues_flag = use_queues_flag; - - split_data_init(kg, &kernel_split_state, num_elements, split_data_buffer, ray_state); - -# ifdef __KERNEL_OPENCL__ - kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); - kernel_set_buffer_info(kg); -# endif - - int thread_index = ccl_global_id(1) 
* ccl_global_size(0) + ccl_global_id(0); - - /* Initialize queue data and queue index. */ - if (thread_index < queuesize) { - for (int i = 0; i < NUM_QUEUES; i++) { - kernel_split_state.queue_data[i * queuesize + thread_index] = QUEUE_EMPTY_SLOT; - } - } - - if (thread_index == 0) { - for (int i = 0; i < NUM_QUEUES; i++) { - Queue_index[i] = 0; - } - - /* The scene-intersect kernel should not use the queues very first time. - * since the queue would be empty. - */ - *use_queues_flag = 0; - } -#endif /* KERENL_STUB */ -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h deleted file mode 100644 index 3be2b35812f..00000000000 --- a/intern/cycles/kernel/split/kernel_direct_lighting.h +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -/* This kernel takes care of direct lighting logic. - * However, the "shadow ray cast" part of direct lighting is handled - * in the next kernel. - * - * This kernels determines the rays for which a shadow_blocked() function - * associated with direct lighting should be executed. 
Those rays for which - * a shadow_blocked() function for direct-lighting must be executed, are - * marked with flag RAY_SHADOW_RAY_CAST_DL and enqueued into the queue - * QUEUE_SHADOW_RAY_CAST_DL_RAYS - * - * Note on Queues: - * This kernel only reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue - * and processes only the rays of state RAY_ACTIVE; If a ray needs to execute - * the corresponding shadow_blocked part, after direct lighting, the ray is - * marked with RAY_SHADOW_RAY_CAST_DL flag. - * - * State of queues when this kernel is called: - * - State of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same before and after this - * kernel call. - * - QUEUE_SHADOW_RAY_CAST_DL_RAYS queue will be filled with rays for which a - * shadow_blocked function must be executed, after this kernel call - * Before this kernel call the QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty. - */ -ccl_device void kernel_direct_lighting(KernelGlobals *kg, - ccl_local_param unsigned int *local_queue_atomics) -{ - if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { - *local_queue_atomics = 0; - } - ccl_barrier(CCL_LOCAL_MEM_FENCE); - - char enqueue_flag = 0; - int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - ray_index = get_ray_index(kg, - ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - kernel_split_state.queue_data, - kernel_split_params.queue_size, - 0); - - if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) { - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - ShaderData *sd = kernel_split_sd(sd, ray_index); - - /* direct lighting */ -#ifdef __EMISSION__ - bool flag = (kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL)); - -# ifdef __BRANCHED_PATH__ - if (flag && kernel_data.integrator.branched) { - flag = false; - enqueue_flag = 1; - } -# endif /* __BRANCHED_PATH__ */ - -# ifdef __SHADOW_TRICKS__ - if (flag && state->flag & 
PATH_RAY_SHADOW_CATCHER) { - flag = false; - enqueue_flag = 1; - } -# endif /* __SHADOW_TRICKS__ */ - - if (flag) { - /* Sample illumination from lights to find path contribution. */ - float light_u, light_v; - path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v); - float terminate = path_state_rng_light_termination(kg, state); - - LightSample ls; - if (light_sample(kg, -1, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { - Ray light_ray; - light_ray.time = sd->time; - - BsdfEval L_light; - bool is_lamp; - if (direct_emission(kg, - sd, - AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]), - &ls, - state, - &light_ray, - &L_light, - &is_lamp, - terminate)) { - /* Write intermediate data to global memory to access from - * the next kernel. - */ - kernel_split_state.light_ray[ray_index] = light_ray; - kernel_split_state.bsdf_eval[ray_index] = L_light; - kernel_split_state.is_lamp[ray_index] = is_lamp; - /* Mark ray state for next shadow kernel. */ - enqueue_flag = 1; - } - } - } -#endif /* __EMISSION__ */ - } - -#ifdef __EMISSION__ - /* Enqueue RAY_SHADOW_RAY_CAST_DL rays. 
*/ - enqueue_ray_index_local(ray_index, - QUEUE_SHADOW_RAY_CAST_DL_RAYS, - enqueue_flag, - kernel_split_params.queue_size, - local_queue_atomics, - kernel_split_state.queue_data, - kernel_split_params.queue_index); -#endif - -#ifdef __BRANCHED_PATH__ - /* Enqueue RAY_LIGHT_INDIRECT_NEXT_ITER rays - * this is the last kernel before next_iteration_setup that uses local atomics so we do this here - */ - ccl_barrier(CCL_LOCAL_MEM_FENCE); - if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { - *local_queue_atomics = 0; - } - ccl_barrier(CCL_LOCAL_MEM_FENCE); - - ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - enqueue_ray_index_local( - ray_index, - QUEUE_LIGHT_INDIRECT_ITER, - IS_STATE(kernel_split_state.ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER), - kernel_split_params.queue_size, - local_queue_atomics, - kernel_split_state.queue_data, - kernel_split_params.queue_index); - -#endif /* __BRANCHED_PATH__ */ -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_do_volume.h b/intern/cycles/kernel/split/kernel_do_volume.h deleted file mode 100644 index 1775e870f07..00000000000 --- a/intern/cycles/kernel/split/kernel_do_volume.h +++ /dev/null @@ -1,227 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -CCL_NAMESPACE_BEGIN - -#if defined(__BRANCHED_PATH__) && defined(__VOLUME__) - -ccl_device_inline void kernel_split_branched_path_volume_indirect_light_init(KernelGlobals *kg, - int ray_index) -{ - kernel_split_branched_path_indirect_loop_init(kg, ray_index); - - ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT); -} - -ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(KernelGlobals *kg, - int ray_index) -{ - SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; - - ShaderData *sd = kernel_split_sd(sd, ray_index); - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]); - - /* GPU: no decoupled ray marching, scatter probabilistically. */ - int num_samples = kernel_data.integrator.volume_samples; - float num_samples_inv = 1.0f / num_samples; - - Ray volume_ray = branched_state->ray; - volume_ray.t = (!IS_STATE(&branched_state->ray_state, 0, RAY_HIT_BACKGROUND)) ? 
- branched_state->isect.t : - FLT_MAX; - - float step_size = volume_stack_step_size(kg, branched_state->path_state.volume_stack); - - for (int j = branched_state->next_sample; j < num_samples; j++) { - ccl_global PathState *ps = &kernel_split_state.path_state[ray_index]; - *ps = branched_state->path_state; - - ccl_global Ray *pray = &kernel_split_state.ray[ray_index]; - *pray = branched_state->ray; - - ccl_global float3 *tp = &kernel_split_state.throughput[ray_index]; - *tp = branched_state->throughput * num_samples_inv; - - /* branch RNG state */ - path_state_branch(ps, j, num_samples); - - /* integrate along volume segment with distance sampling */ - VolumeIntegrateResult result = kernel_volume_integrate( - kg, ps, sd, &volume_ray, L, tp, step_size); - -# ifdef __VOLUME_SCATTER__ - if (result == VOLUME_PATH_SCATTERED) { - /* direct lighting */ - kernel_path_volume_connect_light(kg, sd, emission_sd, *tp, &branched_state->path_state, L); - - /* indirect light bounce */ - if (!kernel_path_volume_bounce(kg, sd, tp, ps, &L->state, pray)) { - continue; - } - - /* start the indirect path */ - branched_state->next_closure = 0; - branched_state->next_sample = j + 1; - - /* Attempting to share too many samples is slow for volumes as it causes us to - * loop here more and have many calls to kernel_volume_integrate which evaluates - * shaders. The many expensive shader evaluations cause the work load to become - * unbalanced and many threads to become idle in this kernel. Limiting the - * number of shared samples here helps quite a lot. 
- */ - if (branched_state->shared_sample_count < 2) { - if (kernel_split_branched_indirect_start_shared(kg, ray_index)) { - continue; - } - } - - return true; - } -# endif - } - - branched_state->next_sample = num_samples; - - branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0); - if (branched_state->waiting_on_shared_samples) { - return true; - } - - kernel_split_branched_path_indirect_loop_end(kg, ray_index); - - /* todo: avoid this calculation using decoupled ray marching */ - float3 throughput = kernel_split_state.throughput[ray_index]; - kernel_volume_shadow( - kg, emission_sd, &kernel_split_state.path_state[ray_index], &volume_ray, &throughput); - kernel_split_state.throughput[ray_index] = throughput; - - return false; -} - -#endif /* __BRANCHED_PATH__ && __VOLUME__ */ - -ccl_device void kernel_do_volume(KernelGlobals *kg) -{ -#ifdef __VOLUME__ - /* We will empty this queue in this kernel. */ - if (ccl_global_id(0) == 0 && ccl_global_id(1) == 0) { - kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; -# ifdef __BRANCHED_PATH__ - kernel_split_params.queue_index[QUEUE_VOLUME_INDIRECT_ITER] = 0; -# endif /* __BRANCHED_PATH__ */ - } - - int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - - if (*kernel_split_params.use_queues_flag) { - ray_index = get_ray_index(kg, - ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - kernel_split_state.queue_data, - kernel_split_params.queue_size, - 1); - } - - ccl_global char *ray_state = kernel_split_state.ray_state; - - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - - if (IS_STATE(ray_state, ray_index, RAY_ACTIVE) || - IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { - ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; - ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; - ccl_global Intersection *isect = 
&kernel_split_state.isect[ray_index]; - ShaderData *sd = kernel_split_sd(sd, ray_index); - ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]); - - bool hit = !IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND); - - /* Sanitize volume stack. */ - if (!hit) { - kernel_volume_clean_stack(kg, state->volume_stack); - } - /* volume attenuation, emission, scatter */ - if (state->volume_stack[0].shader != SHADER_NONE) { - Ray volume_ray = *ray; - volume_ray.t = (hit) ? isect->t : FLT_MAX; - -# ifdef __BRANCHED_PATH__ - if (!kernel_data.integrator.branched || - IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { -# endif /* __BRANCHED_PATH__ */ - float step_size = volume_stack_step_size(kg, state->volume_stack); - - { - /* integrate along volume segment with distance sampling */ - VolumeIntegrateResult result = kernel_volume_integrate( - kg, state, sd, &volume_ray, L, throughput, step_size); - -# ifdef __VOLUME_SCATTER__ - if (result == VOLUME_PATH_SCATTERED) { - /* direct lighting */ - kernel_path_volume_connect_light(kg, sd, emission_sd, *throughput, state, L); - - /* indirect light bounce */ - if (kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray)) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); - } - else { - kernel_split_path_end(kg, ray_index); - } - } -# endif /* __VOLUME_SCATTER__ */ - } - -# ifdef __BRANCHED_PATH__ - } - else { - kernel_split_branched_path_volume_indirect_light_init(kg, ray_index); - - if (kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); - } - } -# endif /* __BRANCHED_PATH__ */ - } - } - -# ifdef __BRANCHED_PATH__ - /* iter loop */ - ray_index = get_ray_index(kg, - ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0), - QUEUE_VOLUME_INDIRECT_ITER, - kernel_split_state.queue_data, - kernel_split_params.queue_size, - 1); - - if (IS_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER)) { - /* 
for render passes, sum and reset indirect light pass variables - * for the next samples */ - path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]); - path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]); - - if (kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); - } - } -# endif /* __BRANCHED_PATH__ */ - -#endif /* __VOLUME__ */ -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_enqueue_inactive.h b/intern/cycles/kernel/split/kernel_enqueue_inactive.h deleted file mode 100644 index 745313f89f1..00000000000 --- a/intern/cycles/kernel/split/kernel_enqueue_inactive.h +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -ccl_device void kernel_enqueue_inactive(KernelGlobals *kg, - ccl_local_param unsigned int *local_queue_atomics) -{ -#ifdef __BRANCHED_PATH__ - /* Enqueue RAY_INACTIVE rays into QUEUE_INACTIVE_RAYS queue. 
*/ - if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { - *local_queue_atomics = 0; - } - ccl_barrier(CCL_LOCAL_MEM_FENCE); - - int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - - char enqueue_flag = 0; - if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_INACTIVE)) { - enqueue_flag = 1; - } - - enqueue_ray_index_local(ray_index, - QUEUE_INACTIVE_RAYS, - enqueue_flag, - kernel_split_params.queue_size, - local_queue_atomics, - kernel_split_state.queue_data, - kernel_split_params.queue_index); -#endif /* __BRANCHED_PATH__ */ -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h deleted file mode 100644 index 61722840b0b..00000000000 --- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -/* This kernel takes care of the logic to process "material of type holdout", - * indirect primitive emission, bsdf blurring, probabilistic path termination - * and AO. - * - * This kernels determines the rays for which a shadow_blocked() function - * associated with AO should be executed. 
Those rays for which a - * shadow_blocked() function for AO must be executed are marked with flag - * RAY_SHADOW_RAY_CAST_ao and enqueued into the queue - * QUEUE_SHADOW_RAY_CAST_AO_RAYS - * - * Ray state of rays that are terminated in this kernel are changed to RAY_UPDATE_BUFFER - * - * Note on Queues: - * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS - * and processes only the rays of state RAY_ACTIVE. - * There are different points in this kernel where a ray may terminate and - * reach RAY_UPDATE_BUFFER state. These rays are enqueued into - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will still be present - * in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has - * been changed to RAY_UPDATE_BUFFER, there is no problem. - * - * State of queues when this kernel is called: - * At entry, - * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and - * RAY_REGENERATED rays - * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with - * RAY_TO_REGENERATE rays. - * - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be empty. - * At exit, - * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, - * RAY_REGENERATED and RAY_UPDATE_BUFFER rays. - * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with - * RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays. 
- * - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with - * flag RAY_SHADOW_RAY_CAST_AO - */ - -ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( - KernelGlobals *kg, ccl_local_param BackgroundAOLocals *locals) -{ - if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { - locals->queue_atomics_bg = 0; - locals->queue_atomics_ao = 0; - } - ccl_barrier(CCL_LOCAL_MEM_FENCE); - -#ifdef __AO__ - char enqueue_flag = 0; -#endif - int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - ray_index = get_ray_index(kg, - ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - kernel_split_state.queue_data, - kernel_split_params.queue_size, - 0); - - if (ray_index != QUEUE_EMPTY_SLOT) { - ccl_global PathState *state = 0x0; - float3 throughput; - - ccl_global char *ray_state = kernel_split_state.ray_state; - ShaderData *sd = kernel_split_sd(sd, ray_index); - - if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - uint buffer_offset = kernel_split_state.buffer_offset[ray_index]; - ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset; - - ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; - ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]); - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - - throughput = kernel_split_state.throughput[ray_index]; - state = &kernel_split_state.path_state[ray_index]; - - if (!kernel_path_shader_apply(kg, sd, state, ray, throughput, emission_sd, L, buffer)) { - kernel_split_path_end(kg, ray_index); - } - } - - if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - /* Path termination. this is a strange place to put the termination, it's - * mainly due to the mixed in MIS that we use. gives too many unneeded - * shader evaluations, only need emission if we are going to terminate. 
- */ - float probability = path_state_continuation_probability(kg, state, throughput); - - if (probability == 0.0f) { - kernel_split_path_end(kg, ray_index); - } - else if (probability < 1.0f) { - float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE); - if (terminate >= probability) { - kernel_split_path_end(kg, ray_index); - } - else { - kernel_split_state.throughput[ray_index] = throughput / probability; - } - } - -#ifdef __DENOISING_FEATURES__ - if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - kernel_update_denoising_features(kg, sd, state, L); - } -#endif - } - -#ifdef __AO__ - if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - /* ambient occlusion */ - if (kernel_data.integrator.use_ambient_occlusion) { - enqueue_flag = 1; - } - } -#endif /* __AO__ */ - } - -#ifdef __AO__ - /* Enqueue to-shadow-ray-cast rays. */ - enqueue_ray_index_local(ray_index, - QUEUE_SHADOW_RAY_CAST_AO_RAYS, - enqueue_flag, - kernel_split_params.queue_size, - &locals->queue_atomics_ao, - kernel_split_state.queue_data, - kernel_split_params.queue_index); -#endif -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_indirect_background.h b/intern/cycles/kernel/split/kernel_indirect_background.h deleted file mode 100644 index 6d500650cc0..00000000000 --- a/intern/cycles/kernel/split/kernel_indirect_background.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -ccl_device void kernel_indirect_background(KernelGlobals *kg) -{ - ccl_global char *ray_state = kernel_split_state.ray_state; - - int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - int ray_index; - - if (kernel_data.integrator.ao_bounces != INT_MAX) { - ray_index = get_ray_index(kg, - thread_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - kernel_split_state.queue_data, - kernel_split_params.queue_size, - 0); - - if (ray_index != QUEUE_EMPTY_SLOT) { - if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - if (path_state_ao_bounce(kg, state)) { - kernel_split_path_end(kg, ray_index); - } - } - } - } - - ray_index = get_ray_index(kg, - thread_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - kernel_split_state.queue_data, - kernel_split_params.queue_size, - 0); - - if (ray_index == QUEUE_EMPTY_SLOT) { - return; - } - - if (IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; - float3 throughput = kernel_split_state.throughput[ray_index]; - ShaderData *sd = kernel_split_sd(sd, ray_index); - uint buffer_offset = kernel_split_state.buffer_offset[ray_index]; - ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset; - - kernel_path_background(kg, state, ray, throughput, sd, buffer, L); - kernel_split_path_end(kg, ray_index); - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_indirect_subsurface.h b/intern/cycles/kernel/split/kernel_indirect_subsurface.h deleted file mode 100644 index 3f48f8d6f56..00000000000 --- a/intern/cycles/kernel/split/kernel_indirect_subsurface.h +++ /dev/null @@ -1,67 +0,0 @@ 
-/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -ccl_device void kernel_indirect_subsurface(KernelGlobals *kg) -{ - int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - if (thread_index == 0) { - /* We will empty both queues in this kernel. */ - kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; - kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0; - } - - int ray_index; - get_ray_index(kg, - thread_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - kernel_split_state.queue_data, - kernel_split_params.queue_size, - 1); - ray_index = get_ray_index(kg, - thread_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - kernel_split_state.queue_data, - kernel_split_params.queue_size, - 1); - -#ifdef __SUBSURFACE__ - if (ray_index == QUEUE_EMPTY_SLOT) { - return; - } - - ccl_global char *ray_state = kernel_split_state.ray_state; - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; - ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; - - if (IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { - ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index]; - - /* Trace indirect subsurface rays by restarting the loop. 
this uses less - * stack memory than invoking kernel_path_indirect. - */ - if (ss_indirect->num_rays) { - kernel_path_subsurface_setup_indirect(kg, ss_indirect, state, ray, L, throughput); - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); - } - } -#endif /* __SUBSURFACE__ */ -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_lamp_emission.h b/intern/cycles/kernel/split/kernel_lamp_emission.h deleted file mode 100644 index 7ecb099208d..00000000000 --- a/intern/cycles/kernel/split/kernel_lamp_emission.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -/* This kernel operates on QUEUE_ACTIVE_AND_REGENERATED_RAYS. - * It processes rays of state RAY_ACTIVE and RAY_HIT_BACKGROUND. - * We will empty QUEUE_ACTIVE_AND_REGENERATED_RAYS queue in this kernel. - */ -ccl_device void kernel_lamp_emission(KernelGlobals *kg) -{ -#ifndef __VOLUME__ - /* We will empty this queue in this kernel. */ - if (ccl_global_id(0) == 0 && ccl_global_id(1) == 0) { - kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; - } -#endif - /* Fetch use_queues_flag. 
*/ - char local_use_queues_flag = *kernel_split_params.use_queues_flag; - ccl_barrier(CCL_LOCAL_MEM_FENCE); - - int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - if (local_use_queues_flag) { - ray_index = get_ray_index(kg, - ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - kernel_split_state.queue_data, - kernel_split_params.queue_size, -#ifndef __VOLUME__ - 1 -#else - 0 -#endif - ); - if (ray_index == QUEUE_EMPTY_SLOT) { - return; - } - } - - if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) || - IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND)) { - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - - float3 throughput = kernel_split_state.throughput[ray_index]; - Ray ray = kernel_split_state.ray[ray_index]; - ccl_global Intersection *isect = &kernel_split_state.isect[ray_index]; - ShaderData *sd = kernel_split_sd(sd, ray_index); - - kernel_path_lamp_emission(kg, state, &ray, throughput, isect, sd, L); - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h deleted file mode 100644 index 320f6a414bf..00000000000 --- a/intern/cycles/kernel/split/kernel_next_iteration_setup.h +++ /dev/null @@ -1,258 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -CCL_NAMESPACE_BEGIN - -/*This kernel takes care of setting up ray for the next iteration of - * path-iteration and accumulating radiance corresponding to AO and - * direct-lighting - * - * Ray state of rays that are terminated in this kernel are changed - * to RAY_UPDATE_BUFFER. - * - * Note on queues: - * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS - * and processes only the rays of state RAY_ACTIVE. - * There are different points in this kernel where a ray may terminate and - * reach RAY_UPDATE_BUFF state. These rays are enqueued into - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will still be present - * in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has - * been changed to RAY_UPDATE_BUFF, there is no problem. - * - * State of queues when this kernel is called: - * At entry, - * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, - * RAY_REGENERATED, RAY_UPDATE_BUFFER rays. - * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with - * RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays. - * At exit, - * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, - * RAY_REGENERATED and more RAY_UPDATE_BUFFER rays. - * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with - * RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays. 
- */ - -#ifdef __BRANCHED_PATH__ -ccl_device_inline void kernel_split_branched_indirect_light_init(KernelGlobals *kg, int ray_index) -{ - kernel_split_branched_path_indirect_loop_init(kg, ray_index); - - ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT); -} - -ccl_device void kernel_split_branched_transparent_bounce(KernelGlobals *kg, int ray_index) -{ - ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; - ShaderData *sd = kernel_split_sd(sd, ray_index); - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; - -# ifdef __VOLUME__ - if (!(sd->flag & SD_HAS_ONLY_VOLUME)) { -# endif - /* continue in case of transparency */ - *throughput *= shader_bsdf_transparency(kg, sd); - - if (is_zero(*throughput)) { - kernel_split_path_end(kg, ray_index); - return; - } - - /* Update Path State */ - path_state_next(kg, state, LABEL_TRANSPARENT); -# ifdef __VOLUME__ - } - else { - if (!path_state_volume_next(kg, state)) { - kernel_split_path_end(kg, ray_index); - return; - } - } -# endif - - ray->P = ray_offset(sd->P, -sd->Ng); - ray->t -= sd->ray_length; /* clipping works through transparent */ - -# ifdef __RAY_DIFFERENTIALS__ - ray->dP = sd->dP; - ray->dD.dx = -sd->dI.dx; - ray->dD.dy = -sd->dI.dy; -# endif /* __RAY_DIFFERENTIALS__ */ - -# ifdef __VOLUME__ - /* enter/exit volume */ - kernel_volume_stack_enter_exit(kg, sd, state->volume_stack); -# endif /* __VOLUME__ */ -} -#endif /* __BRANCHED_PATH__ */ - -ccl_device void kernel_next_iteration_setup(KernelGlobals *kg, - ccl_local_param unsigned int *local_queue_atomics) -{ - if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { - *local_queue_atomics = 0; - } - ccl_barrier(CCL_LOCAL_MEM_FENCE); - - if (ccl_global_id(0) == 0 && ccl_global_id(1) == 0) { - /* If we are here, then it means that scene-intersect kernel - * has already been executed at least once. 
From the next time, - * scene-intersect kernel may operate on queues to fetch ray index - */ - *kernel_split_params.use_queues_flag = 1; - - /* Mark queue indices of QUEUE_SHADOW_RAY_CAST_AO_RAYS and - * QUEUE_SHADOW_RAY_CAST_DL_RAYS queues that were made empty during the - * previous kernel. - */ - kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0; - kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0; - } - - int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - ray_index = get_ray_index(kg, - ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - kernel_split_state.queue_data, - kernel_split_params.queue_size, - 0); - - ccl_global char *ray_state = kernel_split_state.ray_state; - -#ifdef __VOLUME__ - /* Reactivate only volume rays here, most surface work was skipped. */ - if (IS_STATE(ray_state, ray_index, RAY_HAS_ONLY_VOLUME)) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_ACTIVE); - } -#endif - - bool active = IS_STATE(ray_state, ray_index, RAY_ACTIVE); - if (active) { - ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; - ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; - ShaderData *sd = kernel_split_sd(sd, ray_index); - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - -#ifdef __BRANCHED_PATH__ - if (!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { -#endif - /* Compute direct lighting and next bounce. 
*/ - if (!kernel_path_surface_bounce(kg, sd, throughput, state, &L->state, ray)) { - kernel_split_path_end(kg, ray_index); - } -#ifdef __BRANCHED_PATH__ - } - else if (sd->flag & SD_HAS_ONLY_VOLUME) { - kernel_split_branched_transparent_bounce(kg, ray_index); - } - else { - kernel_split_branched_indirect_light_init(kg, ray_index); - - if (kernel_split_branched_path_surface_indirect_light_iter( - kg, ray_index, 1.0f, kernel_split_sd(branched_state_sd, ray_index), true, true)) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); - } - else { - kernel_split_branched_path_indirect_loop_end(kg, ray_index); - kernel_split_branched_transparent_bounce(kg, ray_index); - } - } -#endif /* __BRANCHED_PATH__ */ - } - - /* Enqueue RAY_UPDATE_BUFFER rays. */ - enqueue_ray_index_local(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER) && active, - kernel_split_params.queue_size, - local_queue_atomics, - kernel_split_state.queue_data, - kernel_split_params.queue_index); - -#ifdef __BRANCHED_PATH__ - /* iter loop */ - if (ccl_global_id(0) == 0 && ccl_global_id(1) == 0) { - kernel_split_params.queue_index[QUEUE_LIGHT_INDIRECT_ITER] = 0; - } - - ray_index = get_ray_index(kg, - ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0), - QUEUE_LIGHT_INDIRECT_ITER, - kernel_split_state.queue_data, - kernel_split_params.queue_size, - 1); - - if (IS_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER)) { - /* for render passes, sum and reset indirect light pass variables - * for the next samples */ - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - - path_radiance_sum_indirect(L); - path_radiance_reset_indirect(L); - - if (kernel_split_branched_path_surface_indirect_light_iter( - kg, ray_index, 1.0f, kernel_split_sd(branched_state_sd, ray_index), true, true)) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); - } - else { - kernel_split_branched_path_indirect_loop_end(kg, ray_index); - 
kernel_split_branched_transparent_bounce(kg, ray_index); - } - } - -# ifdef __VOLUME__ - /* Enqueue RAY_VOLUME_INDIRECT_NEXT_ITER rays */ - ccl_barrier(CCL_LOCAL_MEM_FENCE); - if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { - *local_queue_atomics = 0; - } - ccl_barrier(CCL_LOCAL_MEM_FENCE); - - ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - enqueue_ray_index_local( - ray_index, - QUEUE_VOLUME_INDIRECT_ITER, - IS_STATE(kernel_split_state.ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER), - kernel_split_params.queue_size, - local_queue_atomics, - kernel_split_state.queue_data, - kernel_split_params.queue_index); - -# endif /* __VOLUME__ */ - -# ifdef __SUBSURFACE__ - /* Enqueue RAY_SUBSURFACE_INDIRECT_NEXT_ITER rays */ - ccl_barrier(CCL_LOCAL_MEM_FENCE); - if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { - *local_queue_atomics = 0; - } - ccl_barrier(CCL_LOCAL_MEM_FENCE); - - ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - enqueue_ray_index_local( - ray_index, - QUEUE_SUBSURFACE_INDIRECT_ITER, - IS_STATE(kernel_split_state.ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER), - kernel_split_params.queue_size, - local_queue_atomics, - kernel_split_state.queue_data, - kernel_split_params.queue_index); -# endif /* __SUBSURFACE__ */ -#endif /* __BRANCHED_PATH__ */ -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_path_init.h b/intern/cycles/kernel/split/kernel_path_init.h deleted file mode 100644 index c686f46a0cd..00000000000 --- a/intern/cycles/kernel/split/kernel_path_init.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -/* This kernel initializes structures needed in path-iteration kernels. - * This is the first kernel in ray-tracing logic. - * - * Ray state of rays outside the tile-boundary will be marked RAY_INACTIVE - */ -ccl_device void kernel_path_init(KernelGlobals *kg) -{ - int ray_index = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0); - - /* This is the first assignment to ray_state; - * So we don't use ASSIGN_RAY_STATE macro. - */ - kernel_split_state.ray_state[ray_index] = RAY_ACTIVE; - - /* Get work. */ - ccl_global uint *work_pools = kernel_split_params.work_pools; - uint total_work_size = kernel_split_params.total_work_size; - uint work_index; - - if (!get_next_work(kg, work_pools, total_work_size, ray_index, &work_index)) { - /* No more work, mark ray as inactive */ - kernel_split_state.ray_state[ray_index] = RAY_INACTIVE; - - return; - } - - ccl_global WorkTile *tile = &kernel_split_params.tile; - uint x, y, sample; - get_work_pixel(tile, work_index, &x, &y, &sample); - - /* Store buffer offset for writing to passes. */ - uint buffer_offset = (tile->offset + x + y * tile->stride) * kernel_data.film.pass_stride; - kernel_split_state.buffer_offset[ray_index] = buffer_offset; - - /* Initialize random numbers and ray. */ - uint rng_hash; - kernel_path_trace_setup(kg, sample, x, y, &rng_hash, &kernel_split_state.ray[ray_index]); - - if (kernel_split_state.ray[ray_index].t != 0.0f) { - /* Initialize throughput, path radiance, Ray, PathState; - * These rays proceed with path-iteration. 
- */ - kernel_split_state.throughput[ray_index] = make_float3(1.0f, 1.0f, 1.0f); - path_radiance_init(kg, &kernel_split_state.path_radiance[ray_index]); - path_state_init(kg, - AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]), - &kernel_split_state.path_state[ray_index], - rng_hash, - sample, - &kernel_split_state.ray[ray_index]); -#ifdef __SUBSURFACE__ - kernel_path_subsurface_init_indirect(&kernel_split_state.ss_rays[ray_index]); -#endif - } - else { - ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE); - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_queue_enqueue.h b/intern/cycles/kernel/split/kernel_queue_enqueue.h deleted file mode 100644 index 2db87f7a671..00000000000 --- a/intern/cycles/kernel/split/kernel_queue_enqueue.h +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright 2011-2016 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -/* This kernel enqueues rays of different ray state into their - * appropriate queues: - * - * 1. Rays that have been determined to hit the background from the - * "kernel_scene_intersect" kernel are enqueued in - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS; - * 2. Rays that have been determined to be actively participating in pat - * -iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS. 
- * - * State of queue during other times this kernel is called: - * At entry, - * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty. - * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE - * and RAY_UPDATE_BUFFER rays. - * At exit, - * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays. - * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with - * RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays. - */ -ccl_device void kernel_queue_enqueue(KernelGlobals *kg, ccl_local_param QueueEnqueueLocals *locals) -{ - /* We have only 2 cases (Hit/Not-Hit) */ - int lidx = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0); - int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - - if (lidx == 0) { - locals->queue_atomics[0] = 0; - locals->queue_atomics[1] = 0; - } - ccl_barrier(CCL_LOCAL_MEM_FENCE); - - int queue_number = -1; - - if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND) || - IS_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER) || - IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) { - queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS; - } - else if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) || - IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HAS_ONLY_VOLUME) || - IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) { - queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS; - } - - unsigned int my_lqidx; - if (queue_number != -1) { - my_lqidx = get_local_queue_index(queue_number, locals->queue_atomics); - } - ccl_barrier(CCL_LOCAL_MEM_FENCE); - - if (lidx == 0) { - locals->queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = get_global_per_queue_offset( - QUEUE_ACTIVE_AND_REGENERATED_RAYS, locals->queue_atomics, kernel_split_params.queue_index); - locals->queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = get_global_per_queue_offset( - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - 
locals->queue_atomics, - kernel_split_params.queue_index); - } - ccl_barrier(CCL_LOCAL_MEM_FENCE); - - unsigned int my_gqidx; - if (queue_number != -1) { - my_gqidx = get_global_queue_index( - queue_number, kernel_split_params.queue_size, my_lqidx, locals->queue_atomics); - kernel_split_state.queue_data[my_gqidx] = ray_index; - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_scene_intersect.h b/intern/cycles/kernel/split/kernel_scene_intersect.h deleted file mode 100644 index 9ac95aafd2f..00000000000 --- a/intern/cycles/kernel/split/kernel_scene_intersect.h +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -/* This kernel takes care of scene_intersect function. - * - * This kernel changes the ray_state of RAY_REGENERATED rays to RAY_ACTIVE. - * This kernel processes rays of ray state RAY_ACTIVE - * This kernel determines the rays that have hit the background and changes - * their ray state to RAY_HIT_BACKGROUND. 
- */ -ccl_device void kernel_scene_intersect(KernelGlobals *kg) -{ - /* Fetch use_queues_flag */ - char local_use_queues_flag = *kernel_split_params.use_queues_flag; - ccl_barrier(CCL_LOCAL_MEM_FENCE); - - int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - if (local_use_queues_flag) { - ray_index = get_ray_index(kg, - ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - kernel_split_state.queue_data, - kernel_split_params.queue_size, - 0); - - if (ray_index == QUEUE_EMPTY_SLOT) { - return; - } - } - - /* All regenerated rays become active here */ - if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) { -#ifdef __BRANCHED_PATH__ - if (kernel_split_state.branched_state[ray_index].waiting_on_shared_samples) { - kernel_split_path_end(kg, ray_index); - } - else -#endif /* __BRANCHED_PATH__ */ - { - ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE); - } - } - - if (!IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) { - return; - } - - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - Ray ray = kernel_split_state.ray[ray_index]; - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - - Intersection isect; - const int last_object = state->bounce > 0 ? - intersection_get_object(kg, &kernel_split_state.isect[ray_index]) : - OBJECT_NONE; - bool hit = kernel_path_scene_intersect(kg, state, &ray, &isect, L, last_object); - kernel_split_state.isect[ray_index] = isect; - - if (!hit) { - /* Change the state of rays that hit the background; - * These rays undergo special processing in the - * background_bufferUpdate kernel. 
- */ - ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND); - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h deleted file mode 100644 index c760a2b2049..00000000000 --- a/intern/cycles/kernel/split/kernel_shader_eval.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -/* This kernel evaluates ShaderData structure from the values computed - * by the previous kernels. 
- */ -ccl_device void kernel_shader_eval(KernelGlobals *kg) -{ - - int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - /* Sorting on cuda split is not implemented */ -#ifdef __KERNEL_CUDA__ - int queue_index = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS]; -#else - int queue_index = kernel_split_params.queue_index[QUEUE_SHADER_SORTED_RAYS]; -#endif - if (ray_index >= queue_index) { - return; - } - ray_index = get_ray_index(kg, - ray_index, -#ifdef __KERNEL_CUDA__ - QUEUE_ACTIVE_AND_REGENERATED_RAYS, -#else - QUEUE_SHADER_SORTED_RAYS, -#endif - kernel_split_state.queue_data, - kernel_split_params.queue_size, - 0); - - if (ray_index == QUEUE_EMPTY_SLOT) { - return; - } - - ccl_global char *ray_state = kernel_split_state.ray_state; - if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - uint buffer_offset = kernel_split_state.buffer_offset[ray_index]; - ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset; - - shader_eval_surface(kg, kernel_split_sd(sd, ray_index), state, buffer, state->flag); -#ifdef __BRANCHED_PATH__ - if (kernel_data.integrator.branched) { - shader_merge_closures(kernel_split_sd(sd, ray_index)); - } - else -#endif - { - shader_prepare_closures(kernel_split_sd(sd, ray_index), state); - } - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_shader_setup.h b/intern/cycles/kernel/split/kernel_shader_setup.h deleted file mode 100644 index 551836d1653..00000000000 --- a/intern/cycles/kernel/split/kernel_shader_setup.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -/* This kernel sets up the ShaderData structure from the values computed - * by the previous kernels. - * - * It also identifies the rays of state RAY_TO_REGENERATE and enqueues them - * in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. - */ -ccl_device void kernel_shader_setup(KernelGlobals *kg, - ccl_local_param unsigned int *local_queue_atomics) -{ - /* Enqueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */ - if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { - *local_queue_atomics = 0; - } - ccl_barrier(CCL_LOCAL_MEM_FENCE); - - int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - int queue_index = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS]; - if (ray_index < queue_index) { - ray_index = get_ray_index(kg, - ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - kernel_split_state.queue_data, - kernel_split_params.queue_size, - 0); - } - else { - ray_index = QUEUE_EMPTY_SLOT; - } - - char enqueue_flag = (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : - 0; - enqueue_ray_index_local(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - enqueue_flag, - kernel_split_params.queue_size, - local_queue_atomics, - kernel_split_state.queue_data, - kernel_split_params.queue_index); - - /* Continue on with shader evaluation. 
*/ - if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) { - Intersection isect = kernel_split_state.isect[ray_index]; - Ray ray = kernel_split_state.ray[ray_index]; - ShaderData *sd = kernel_split_sd(sd, ray_index); - - shader_setup_from_ray(kg, sd, &isect, &ray); - -#ifdef __VOLUME__ - if (sd->flag & SD_HAS_ONLY_VOLUME) { - ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_HAS_ONLY_VOLUME); - } -#endif - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_shader_sort.h b/intern/cycles/kernel/split/kernel_shader_sort.h deleted file mode 100644 index 95d33a42014..00000000000 --- a/intern/cycles/kernel/split/kernel_shader_sort.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -CCL_NAMESPACE_BEGIN - -ccl_device void kernel_shader_sort(KernelGlobals *kg, ccl_local_param ShaderSortLocals *locals) -{ -#ifndef __KERNEL_CUDA__ - int tid = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - uint qsize = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS]; - if (tid == 0) { - kernel_split_params.queue_index[QUEUE_SHADER_SORTED_RAYS] = qsize; - } - - uint offset = (tid / SHADER_SORT_LOCAL_SIZE) * SHADER_SORT_BLOCK_SIZE; - if (offset >= qsize) { - return; - } - - int lid = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0); - uint input = QUEUE_ACTIVE_AND_REGENERATED_RAYS * (kernel_split_params.queue_size); - uint output = QUEUE_SHADER_SORTED_RAYS * (kernel_split_params.queue_size); - ccl_local uint *local_value = &locals->local_value[0]; - ccl_local ushort *local_index = &locals->local_index[0]; - - /* copy to local memory */ - for (uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) { - uint idx = offset + i + lid; - uint add = input + idx; - uint value = (~0); - if (idx < qsize) { - int ray_index = kernel_split_state.queue_data[add]; - bool valid = (ray_index != QUEUE_EMPTY_SLOT) && - IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE); - if (valid) { - value = kernel_split_sd(sd, ray_index)->shader & SHADER_MASK; - } - } - local_value[i + lid] = value; - local_index[i + lid] = i + lid; - } - ccl_barrier(CCL_LOCAL_MEM_FENCE); - - /* skip sorting for cpu split kernel */ -# ifdef __KERNEL_OPENCL__ - - /* bitonic sort */ - for (uint length = 1; length < SHADER_SORT_BLOCK_SIZE; length <<= 1) { - for (uint inc = length; inc > 0; inc >>= 1) { - for (uint ii = 0; ii < SHADER_SORT_BLOCK_SIZE; ii += SHADER_SORT_LOCAL_SIZE) { - uint i = lid + ii; - bool direction = ((i & (length << 1)) != 0); - uint j = i ^ inc; - ushort ioff = local_index[i]; - ushort joff = local_index[j]; - uint iKey = local_value[ioff]; - uint jKey = local_value[joff]; - bool smaller = (jKey < iKey) || (jKey == iKey && j < 
i); - bool swap = smaller ^ (j < i) ^ direction; - ccl_barrier(CCL_LOCAL_MEM_FENCE); - local_index[i] = (swap) ? joff : ioff; - local_index[j] = (swap) ? ioff : joff; - ccl_barrier(CCL_LOCAL_MEM_FENCE); - } - } - } -# endif /* __KERNEL_OPENCL__ */ - - /* copy to destination */ - for (uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) { - uint idx = offset + i + lid; - uint lidx = local_index[i + lid]; - uint outi = output + idx; - uint ini = input + offset + lidx; - uint value = local_value[lidx]; - if (idx < qsize) { - kernel_split_state.queue_data[outi] = (value == (~0)) ? QUEUE_EMPTY_SLOT : - kernel_split_state.queue_data[ini]; - } - } -#endif /* __KERNEL_CUDA__ */ -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h deleted file mode 100644 index 5d772fc597b..00000000000 --- a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -/* Shadow ray cast for AO. 
*/ -ccl_device void kernel_shadow_blocked_ao(KernelGlobals *kg) -{ - unsigned int ao_queue_length = kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS]; - ccl_barrier(CCL_LOCAL_MEM_FENCE); - - int ray_index = QUEUE_EMPTY_SLOT; - int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - if (thread_index < ao_queue_length) { - ray_index = get_ray_index(kg, - thread_index, - QUEUE_SHADOW_RAY_CAST_AO_RAYS, - kernel_split_state.queue_data, - kernel_split_params.queue_size, - 1); - } - - if (ray_index == QUEUE_EMPTY_SLOT) { - return; - } - - ShaderData *sd = kernel_split_sd(sd, ray_index); - ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]); - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - float3 throughput = kernel_split_state.throughput[ray_index]; - -#ifdef __BRANCHED_PATH__ - if (!kernel_data.integrator.branched || - IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { -#endif - kernel_path_ao(kg, sd, emission_sd, L, state, throughput, shader_bsdf_alpha(kg, sd)); -#ifdef __BRANCHED_PATH__ - } - else { - kernel_branched_path_ao(kg, sd, emission_sd, L, state, throughput); - } -#endif -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h deleted file mode 100644 index 5e46d300bca..00000000000 --- a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -/* Shadow ray cast for direct visible light. */ -ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg) -{ - unsigned int dl_queue_length = kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS]; - ccl_barrier(CCL_LOCAL_MEM_FENCE); - - int ray_index = QUEUE_EMPTY_SLOT; - int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - if (thread_index < dl_queue_length) { - ray_index = get_ray_index(kg, - thread_index, - QUEUE_SHADOW_RAY_CAST_DL_RAYS, - kernel_split_state.queue_data, - kernel_split_params.queue_size, - 1); - } - -#ifdef __BRANCHED_PATH__ - /* TODO(mai): move this somewhere else? */ - if (thread_index == 0) { - /* Clear QUEUE_INACTIVE_RAYS before next kernel. 
*/ - kernel_split_params.queue_index[QUEUE_INACTIVE_RAYS] = 0; - } -#endif /* __BRANCHED_PATH__ */ - - if (ray_index == QUEUE_EMPTY_SLOT) - return; - - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - Ray ray = kernel_split_state.light_ray[ray_index]; - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - ShaderData *sd = kernel_split_sd(sd, ray_index); - float3 throughput = kernel_split_state.throughput[ray_index]; - - BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index]; - ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]); - bool is_lamp = kernel_split_state.is_lamp[ray_index]; - -#if defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__) - bool use_branched = false; - int all = 0; - - if (state->flag & PATH_RAY_SHADOW_CATCHER) { - use_branched = true; - all = 1; - } -# if defined(__BRANCHED_PATH__) - else if (kernel_data.integrator.branched) { - use_branched = true; - - if (IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { - all = (kernel_data.integrator.sample_all_lights_indirect); - } - else { - all = (kernel_data.integrator.sample_all_lights_direct); - } - } -# endif /* __BRANCHED_PATH__ */ - - if (use_branched) { - kernel_branched_path_surface_connect_light( - kg, sd, emission_sd, state, throughput, 1.0f, L, all); - } - else -#endif /* defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__)*/ - { - /* trace shadow ray */ - float3 shadow; - - if (!shadow_blocked(kg, sd, emission_sd, state, &ray, &shadow)) { - /* accumulate */ - path_radiance_accum_light(kg, L, state, throughput, &L_light, shadow, 1.0f, is_lamp); - } - else { - path_radiance_accum_total_light(L, state, throughput, &L_light); - } - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h deleted file mode 100644 index 5114f2b03e5..00000000000 --- a/intern/cycles/kernel/split/kernel_split_common.h +++ 
/dev/null @@ -1,106 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __KERNEL_SPLIT_H__ -#define __KERNEL_SPLIT_H__ - -// clang-format off -#include "kernel/kernel_math.h" -#include "kernel/kernel_types.h" - -#include "kernel/split/kernel_split_data.h" - -#include "kernel/kernel_globals.h" -#include "kernel/kernel_color.h" - -#ifdef __OSL__ -# include "kernel/osl/osl_shader.h" -#endif - -#ifdef __KERNEL_OPENCL__ -# include "kernel/kernels/opencl/kernel_opencl_image.h" -#endif -#ifdef __KERNEL_CUDA__ -# include "kernel/kernels/cuda/kernel_cuda_image.h" -#endif -#ifdef __KERNEL_CPU__ -# include "kernel/kernels/cpu/kernel_cpu_image.h" -#endif - -#include "util/util_atomic.h" - -#include "kernel/kernel_path.h" -#ifdef __BRANCHED_PATH__ -# include "kernel/kernel_path_branched.h" -#endif - -#include "kernel/kernel_queues.h" -#include "kernel/kernel_work_stealing.h" - -#ifdef __BRANCHED_PATH__ -# include "kernel/split/kernel_branched.h" -#endif -// clang-format on - -CCL_NAMESPACE_BEGIN - -ccl_device_inline void kernel_split_path_end(KernelGlobals *kg, int ray_index) -{ - ccl_global char *ray_state = kernel_split_state.ray_state; - -#ifdef __BRANCHED_PATH__ -# ifdef __SUBSURFACE__ - ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index]; - - if (ss_indirect->num_rays) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - } - else -# endif /* 
__SUBSURFACE__ */ - if (IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT_SHARED)) { - int orig_ray = kernel_split_state.branched_state[ray_index].original_ray; - - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - PathRadiance *orig_ray_L = &kernel_split_state.path_radiance[orig_ray]; - - path_radiance_sum_indirect(L); - path_radiance_accum_sample(orig_ray_L, L); - - atomic_fetch_and_dec_uint32( - (ccl_global uint *)&kernel_split_state.branched_state[orig_ray].shared_sample_count); - - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE); - } - else if (IS_FLAG(ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT)) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER); - } - else if (IS_FLAG(ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT)) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER); - } - else if (IS_FLAG(ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT)) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER); - } - else { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - } -#else - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); -#endif -} - -CCL_NAMESPACE_END - -#endif /* __KERNEL_SPLIT_H__ */ diff --git a/intern/cycles/kernel/split/kernel_split_data.h b/intern/cycles/kernel/split/kernel_split_data.h deleted file mode 100644 index decc537b39b..00000000000 --- a/intern/cycles/kernel/split/kernel_split_data.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright 2011-2016 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __KERNEL_SPLIT_DATA_H__ -#define __KERNEL_SPLIT_DATA_H__ - -#include "kernel/split/kernel_split_data_types.h" - -#include "kernel/kernel_globals.h" - -CCL_NAMESPACE_BEGIN - -ccl_device_inline uint64_t split_data_buffer_size(KernelGlobals *kg, size_t num_elements) -{ - (void)kg; /* Unused on CPU. */ - - uint64_t size = 0; -#define SPLIT_DATA_ENTRY(type, name, num) +align_up(num_elements *num * sizeof(type), 16) - size = size SPLIT_DATA_ENTRIES; -#undef SPLIT_DATA_ENTRY - - uint64_t closure_size = sizeof(ShaderClosure) * (kernel_data.integrator.max_closures - 1); - -#ifdef __BRANCHED_PATH__ - size += align_up(num_elements * (sizeof(ShaderData) + closure_size), 16); -#endif - - size += align_up(num_elements * (sizeof(ShaderData) + closure_size), 16); - - return size; -} - -ccl_device_inline void split_data_init(KernelGlobals *kg, - ccl_global SplitData *split_data, - size_t num_elements, - ccl_global void *data, - ccl_global char *ray_state) -{ - (void)kg; /* Unused on CPU. 
*/ - - ccl_global char *p = (ccl_global char *)data; - -#define SPLIT_DATA_ENTRY(type, name, num) \ - split_data->name = (type *)p; \ - p += align_up(num_elements * num * sizeof(type), 16); - SPLIT_DATA_ENTRIES; -#undef SPLIT_DATA_ENTRY - - uint64_t closure_size = sizeof(ShaderClosure) * (kernel_data.integrator.max_closures - 1); - -#ifdef __BRANCHED_PATH__ - split_data->_branched_state_sd = (ShaderData *)p; - p += align_up(num_elements * (sizeof(ShaderData) + closure_size), 16); -#endif - - split_data->_sd = (ShaderData *)p; - p += align_up(num_elements * (sizeof(ShaderData) + closure_size), 16); - - split_data->ray_state = ray_state; -} - -CCL_NAMESPACE_END - -#endif /* __KERNEL_SPLIT_DATA_H__ */ diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h deleted file mode 100644 index 06bdce9947d..00000000000 --- a/intern/cycles/kernel/split/kernel_split_data_types.h +++ /dev/null @@ -1,180 +0,0 @@ -/* - * Copyright 2011-2016 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __KERNEL_SPLIT_DATA_TYPES_H__ -#define __KERNEL_SPLIT_DATA_TYPES_H__ - -CCL_NAMESPACE_BEGIN - -/* parameters used by the split kernels, we use a single struct to avoid passing these to each - * kernel */ - -typedef struct SplitParams { - WorkTile tile; - uint total_work_size; - - ccl_global unsigned int *work_pools; - - ccl_global int *queue_index; - int queue_size; - ccl_global char *use_queues_flag; - - /* Place for storing sd->flag. AMD GPU OpenCL compiler workaround */ - int dummy_sd_flag; -} SplitParams; - -/* Global memory variables [porting]; These memory is used for - * co-operation between different kernels; Data written by one - * kernel will be available to another kernel via this global - * memory. - */ - -/* SPLIT_DATA_ENTRY(type, name, num) */ - -#ifdef __BRANCHED_PATH__ - -typedef ccl_global struct SplitBranchedState { - /* various state that must be kept and restored after an indirect loop */ - PathState path_state; - float3 throughput; - Ray ray; - - Intersection isect; - - char ray_state; - - /* indirect loop state */ - int next_closure; - int next_sample; - -# ifdef __SUBSURFACE__ - int ss_next_closure; - int ss_next_sample; - int next_hit; - int num_hits; - - uint lcg_state; - LocalIntersection ss_isect; -# endif /* __SUBSURFACE__ */ - - int shared_sample_count; /* number of branched samples shared with other threads */ - int original_ray; /* index of original ray when sharing branched samples */ - bool waiting_on_shared_samples; -} SplitBranchedState; - -# define SPLIT_DATA_BRANCHED_ENTRIES \ - SPLIT_DATA_ENTRY(SplitBranchedState, branched_state, 1) \ - SPLIT_DATA_ENTRY(ShaderData, _branched_state_sd, 0) -#else -# define SPLIT_DATA_BRANCHED_ENTRIES -#endif /* __BRANCHED_PATH__ */ - -#ifdef __SUBSURFACE__ -# define SPLIT_DATA_SUBSURFACE_ENTRIES \ - SPLIT_DATA_ENTRY(ccl_global SubsurfaceIndirectRays, ss_rays, 1) -#else -# define SPLIT_DATA_SUBSURFACE_ENTRIES -#endif /* __SUBSURFACE__ */ - -#ifdef __VOLUME__ -# define 
SPLIT_DATA_VOLUME_ENTRIES SPLIT_DATA_ENTRY(ccl_global PathState, state_shadow, 1) -#else -# define SPLIT_DATA_VOLUME_ENTRIES -#endif /* __VOLUME__ */ - -#define SPLIT_DATA_ENTRIES \ - SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \ - SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \ - SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \ - SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \ - SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \ - SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \ - SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \ - SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \ - SPLIT_DATA_ENTRY( \ - ccl_global int, queue_data, (NUM_QUEUES * 2)) /* TODO(mai): this is too large? */ \ - SPLIT_DATA_ENTRY(ccl_global uint, buffer_offset, 1) \ - SPLIT_DATA_ENTRY(ShaderDataTinyStorage, sd_DL_shadow, 1) \ - SPLIT_DATA_SUBSURFACE_ENTRIES \ - SPLIT_DATA_VOLUME_ENTRIES \ - SPLIT_DATA_BRANCHED_ENTRIES \ - SPLIT_DATA_ENTRY(ShaderData, _sd, 0) - -/* Entries to be copied to inactive rays when sharing branched samples - * (TODO: which are actually needed?) 
*/ -#define SPLIT_DATA_ENTRIES_BRANCHED_SHARED \ - SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \ - SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \ - SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \ - SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \ - SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \ - SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \ - SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \ - SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \ - SPLIT_DATA_ENTRY(ShaderDataTinyStorage, sd_DL_shadow, 1) \ - SPLIT_DATA_SUBSURFACE_ENTRIES \ - SPLIT_DATA_VOLUME_ENTRIES \ - SPLIT_DATA_BRANCHED_ENTRIES \ - SPLIT_DATA_ENTRY(ShaderData, _sd, 0) - -/* struct that holds pointers to data in the shared state buffer */ -typedef struct SplitData { -#define SPLIT_DATA_ENTRY(type, name, num) type *name; - SPLIT_DATA_ENTRIES -#undef SPLIT_DATA_ENTRY - - /* this is actually in a separate buffer from the rest of the split state data (so it can be read - * back from the host easily) but is still used the same as the other data so we have it here in - * this struct as well - */ - ccl_global char *ray_state; -} SplitData; - -#ifndef __KERNEL_CUDA__ -# define kernel_split_state (kg->split_data) -# define kernel_split_params (kg->split_param_data) -#else -__device__ SplitData __split_data; -# define kernel_split_state (__split_data) -__device__ SplitParams __split_param_data; -# define kernel_split_params (__split_param_data) -#endif /* __KERNEL_CUDA__ */ - -#define kernel_split_sd(sd, ray_index) \ - ((ShaderData *)(((ccl_global char *)kernel_split_state._##sd) + \ - (sizeof(ShaderData) + \ - sizeof(ShaderClosure) * (kernel_data.integrator.max_closures - 1)) * \ - (ray_index))) - -/* Local storage for queue_enqueue kernel. */ -typedef struct QueueEnqueueLocals { - uint queue_atomics[2]; -} QueueEnqueueLocals; - -/* Local storage for holdout_emission_blurring_pathtermination_ao kernel. 
*/ -typedef struct BackgroundAOLocals { - uint queue_atomics_bg; - uint queue_atomics_ao; -} BackgroundAOLocals; - -typedef struct ShaderSortLocals { - uint local_value[SHADER_SORT_BLOCK_SIZE]; - ushort local_index[SHADER_SORT_BLOCK_SIZE]; -} ShaderSortLocals; - -CCL_NAMESPACE_END - -#endif /* __KERNEL_SPLIT_DATA_TYPES_H__ */ diff --git a/intern/cycles/kernel/split/kernel_subsurface_scatter.h b/intern/cycles/kernel/split/kernel_subsurface_scatter.h deleted file mode 100644 index ba06ae3bc53..00000000000 --- a/intern/cycles/kernel/split/kernel_subsurface_scatter.h +++ /dev/null @@ -1,264 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -CCL_NAMESPACE_BEGIN - -#if defined(__BRANCHED_PATH__) && defined(__SUBSURFACE__) - -ccl_device_inline void kernel_split_branched_path_subsurface_indirect_light_init(KernelGlobals *kg, - int ray_index) -{ - kernel_split_branched_path_indirect_loop_init(kg, ray_index); - - SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; - - branched_state->ss_next_closure = 0; - branched_state->ss_next_sample = 0; - - branched_state->num_hits = 0; - branched_state->next_hit = 0; - - ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT); -} - -ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_iter( - KernelGlobals *kg, int ray_index) -{ - SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; - - ShaderData *sd = kernel_split_sd(branched_state_sd, ray_index); - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]); - - for (int i = branched_state->ss_next_closure; i < sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; - - if (!CLOSURE_IS_BSSRDF(sc->type)) - continue; - - /* Closure memory will be overwritten, so read required variables now. 
*/ - Bssrdf *bssrdf = (Bssrdf *)sc; - ClosureType bssrdf_type = sc->type; - float bssrdf_roughness = bssrdf->roughness; - - /* set up random number generator */ - if (branched_state->ss_next_sample == 0 && branched_state->next_hit == 0 && - branched_state->next_closure == 0 && branched_state->next_sample == 0) { - branched_state->lcg_state = lcg_state_init_addrspace(&branched_state->path_state, - 0x68bc21eb); - } - int num_samples = kernel_data.integrator.subsurface_samples * 3; - float num_samples_inv = 1.0f / num_samples; - uint bssrdf_rng_hash = cmj_hash(branched_state->path_state.rng_hash, i); - - /* do subsurface scatter step with copy of shader data, this will - * replace the BSSRDF with a diffuse BSDF closure */ - for (int j = branched_state->ss_next_sample; j < num_samples; j++) { - ccl_global PathState *hit_state = &kernel_split_state.path_state[ray_index]; - *hit_state = branched_state->path_state; - hit_state->rng_hash = bssrdf_rng_hash; - path_state_branch(hit_state, j, num_samples); - - ccl_global LocalIntersection *ss_isect = &branched_state->ss_isect; - float bssrdf_u, bssrdf_v; - path_branched_rng_2D( - kg, bssrdf_rng_hash, hit_state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); - - /* intersection is expensive so avoid doing multiple times for the same input */ - if (branched_state->next_hit == 0 && branched_state->next_closure == 0 && - branched_state->next_sample == 0) { - uint lcg_state = branched_state->lcg_state; - LocalIntersection ss_isect_private; - - branched_state->num_hits = subsurface_scatter_multi_intersect( - kg, &ss_isect_private, sd, hit_state, sc, &lcg_state, bssrdf_u, bssrdf_v, true); - - branched_state->lcg_state = lcg_state; - *ss_isect = ss_isect_private; - } - - hit_state->rng_offset += PRNG_BOUNCE_NUM; - -# ifdef __VOLUME__ - Ray volume_ray = branched_state->ray; - bool need_update_volume_stack = kernel_data.integrator.use_volumes && - sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME; -# endif /* __VOLUME__ */ - - /* 
compute lighting with the BSDF closure */ - for (int hit = branched_state->next_hit; hit < branched_state->num_hits; hit++) { - ShaderData *bssrdf_sd = kernel_split_sd(sd, ray_index); - *bssrdf_sd = *sd; /* note: copy happens each iteration of inner loop, this is - * important as the indirect path will write into bssrdf_sd */ - - LocalIntersection ss_isect_private = *ss_isect; - subsurface_scatter_multi_setup( - kg, &ss_isect_private, hit, bssrdf_sd, hit_state, bssrdf_type, bssrdf_roughness); - *ss_isect = ss_isect_private; - -# ifdef __VOLUME__ - if (need_update_volume_stack) { - /* Setup ray from previous surface point to the new one. */ - float3 P = ray_offset(bssrdf_sd->P, -bssrdf_sd->Ng); - volume_ray.D = normalize_len(P - volume_ray.P, &volume_ray.t); - - for (int k = 0; k < VOLUME_STACK_SIZE; k++) { - hit_state->volume_stack[k] = branched_state->path_state.volume_stack[k]; - } - - kernel_volume_stack_update_for_subsurface( - kg, emission_sd, &volume_ray, hit_state->volume_stack); - } -# endif /* __VOLUME__ */ - -# ifdef __EMISSION__ - if (branched_state->next_closure == 0 && branched_state->next_sample == 0) { - /* direct light */ - if (kernel_data.integrator.use_direct_light) { - int all = (kernel_data.integrator.sample_all_lights_direct) || - (hit_state->flag & PATH_RAY_SHADOW_CATCHER); - kernel_branched_path_surface_connect_light(kg, - bssrdf_sd, - emission_sd, - hit_state, - branched_state->throughput, - num_samples_inv, - L, - all); - } - } -# endif /* __EMISSION__ */ - - /* indirect light */ - if (kernel_split_branched_path_surface_indirect_light_iter( - kg, ray_index, num_samples_inv, bssrdf_sd, false, false)) { - branched_state->ss_next_closure = i; - branched_state->ss_next_sample = j; - branched_state->next_hit = hit; - - return true; - } - - branched_state->next_closure = 0; - } - - branched_state->next_hit = 0; - } - - branched_state->ss_next_sample = 0; - } - - branched_state->ss_next_closure = sd->num_closure; - - 
branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0); - if (branched_state->waiting_on_shared_samples) { - return true; - } - - kernel_split_branched_path_indirect_loop_end(kg, ray_index); - - return false; -} - -#endif /* __BRANCHED_PATH__ && __SUBSURFACE__ */ - -ccl_device void kernel_subsurface_scatter(KernelGlobals *kg) -{ - int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - if (thread_index == 0) { - /* We will empty both queues in this kernel. */ - kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; - kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0; - } - - int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - ray_index = get_ray_index(kg, - ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - kernel_split_state.queue_data, - kernel_split_params.queue_size, - 1); - get_ray_index(kg, - thread_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - kernel_split_state.queue_data, - kernel_split_params.queue_size, - 1); - -#ifdef __SUBSURFACE__ - ccl_global char *ray_state = kernel_split_state.ray_state; - - if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; - ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; - ccl_global SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index]; - ShaderData *sd = kernel_split_sd(sd, ray_index); - ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]); - - if (sd->flag & SD_BSSRDF) { - -# ifdef __BRANCHED_PATH__ - if (!kernel_data.integrator.branched || - IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { -# endif - if (kernel_path_subsurface_scatter( - kg, sd, emission_sd, L, state, ray, throughput, ss_indirect)) { - 
kernel_split_path_end(kg, ray_index); - } -# ifdef __BRANCHED_PATH__ - } - else { - kernel_split_branched_path_subsurface_indirect_light_init(kg, ray_index); - - if (kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); - } - } -# endif - } - } - -# ifdef __BRANCHED_PATH__ - if (ccl_global_id(0) == 0 && ccl_global_id(1) == 0) { - kernel_split_params.queue_index[QUEUE_SUBSURFACE_INDIRECT_ITER] = 0; - } - - /* iter loop */ - ray_index = get_ray_index(kg, - ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0), - QUEUE_SUBSURFACE_INDIRECT_ITER, - kernel_split_state.queue_data, - kernel_split_params.queue_size, - 1); - - if (IS_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER)) { - /* for render passes, sum and reset indirect light pass variables - * for the next samples */ - path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]); - path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]); - - if (kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); - } - } -# endif /* __BRANCHED_PATH__ */ - -#endif /* __SUBSURFACE__ */ -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h index 000da1fa615..4aee1ef11b3 100644 --- a/intern/cycles/kernel/svm/svm.h +++ b/intern/cycles/kernel/svm/svm.h @@ -48,16 +48,18 @@ ccl_device_inline float3 stack_load_float3(float *stack, uint a) { kernel_assert(a + 2 < SVM_STACK_SIZE); - return make_float3(stack[a + 0], stack[a + 1], stack[a + 2]); + float *stack_a = stack + a; + return make_float3(stack_a[0], stack_a[1], stack_a[2]); } ccl_device_inline void stack_store_float3(float *stack, uint a, float3 f) { kernel_assert(a + 2 < SVM_STACK_SIZE); - stack[a + 0] = f.x; - stack[a + 1] = f.y; - stack[a + 2] = f.z; + float *stack_a = stack + a; + stack_a[0] = f.x; + stack_a[1] = f.y; + 
stack_a[2] = f.z; } ccl_device_inline float stack_load_float(float *stack, uint a) @@ -105,14 +107,14 @@ ccl_device_inline bool stack_valid(uint a) /* Reading Nodes */ -ccl_device_inline uint4 read_node(KernelGlobals *kg, int *offset) +ccl_device_inline uint4 read_node(const KernelGlobals *kg, int *offset) { uint4 node = kernel_tex_fetch(__svm_nodes, *offset); (*offset)++; return node; } -ccl_device_inline float4 read_node_float(KernelGlobals *kg, int *offset) +ccl_device_inline float4 read_node_float(const KernelGlobals *kg, int *offset) { uint4 node = kernel_tex_fetch(__svm_nodes, *offset); float4 f = make_float4(__uint_as_float(node.x), @@ -123,7 +125,7 @@ ccl_device_inline float4 read_node_float(KernelGlobals *kg, int *offset) return f; } -ccl_device_inline float4 fetch_node_float(KernelGlobals *kg, int offset) +ccl_device_inline float4 fetch_node_float(const KernelGlobals *kg, int offset) { uint4 node = kernel_tex_fetch(__svm_nodes, offset); return make_float4(__uint_as_float(node.x), @@ -217,26 +219,11 @@ CCL_NAMESPACE_END CCL_NAMESPACE_BEGIN /* Main Interpreter Loop */ -#if defined(__KERNEL_OPTIX__) && defined(__SHADER_RAYTRACE__) -ccl_device_inline void svm_eval_nodes(KernelGlobals *kg, - ShaderData *sd, - ccl_addr_space PathState *state, - ccl_global float *buffer, - ShaderType type, - int path_flag) -{ - optixDirectCall<void>(0, kg, sd, state, buffer, type, path_flag); -} -extern "C" __device__ void __direct_callable__svm_eval_nodes( -#else -ccl_device_noinline void svm_eval_nodes( -#endif - KernelGlobals *kg, - ShaderData *sd, - ccl_addr_space PathState *state, - ccl_global float *buffer, - ShaderType type, - int path_flag) +template<uint node_feature_mask, ShaderType type> +ccl_device void svm_eval_nodes(INTEGRATOR_STATE_CONST_ARGS, + ShaderData *sd, + ccl_global float *render_buffer, + int path_flag) { float stack[SVM_STACK_SIZE]; int offset = sd->shader & SHADER_MASK; @@ -247,7 +234,6 @@ ccl_device_noinline void svm_eval_nodes( switch (node.x) { case 
NODE_END: return; -#if NODES_GROUP(NODE_GROUP_LEVEL_0) case NODE_SHADER_JUMP: { if (type == SHADER_TYPE_SURFACE) offset = node.y; @@ -260,13 +246,18 @@ ccl_device_noinline void svm_eval_nodes( break; } case NODE_CLOSURE_BSDF: - svm_node_closure_bsdf(kg, sd, stack, node, type, path_flag, &offset); + offset = svm_node_closure_bsdf<node_feature_mask, type>( + kg, sd, stack, node, path_flag, offset); break; case NODE_CLOSURE_EMISSION: - svm_node_closure_emission(sd, stack, node); + if (KERNEL_NODES_FEATURE(EMISSION)) { + svm_node_closure_emission(sd, stack, node); + } break; case NODE_CLOSURE_BACKGROUND: - svm_node_closure_background(sd, stack, node); + if (KERNEL_NODES_FEATURE(EMISSION)) { + svm_node_closure_background(sd, stack, node); + } break; case NODE_CLOSURE_SET_WEIGHT: svm_node_closure_set_weight(sd, node.y, node.z, node.w); @@ -275,7 +266,9 @@ ccl_device_noinline void svm_eval_nodes( svm_node_closure_weight(sd, stack, node.y); break; case NODE_EMISSION_WEIGHT: - svm_node_emission_weight(kg, sd, stack, node); + if (KERNEL_NODES_FEATURE(EMISSION)) { + svm_node_emission_weight(kg, sd, stack, node); + } break; case NODE_MIX_CLOSURE: svm_node_mix_closure(sd, stack, node); @@ -295,86 +288,108 @@ ccl_device_noinline void svm_eval_nodes( svm_node_convert(kg, sd, stack, node.y, node.z, node.w); break; case NODE_TEX_COORD: - svm_node_tex_coord(kg, sd, path_flag, stack, node, &offset); + offset = svm_node_tex_coord(kg, sd, path_flag, stack, node, offset); break; case NODE_VALUE_F: svm_node_value_f(kg, sd, stack, node.y, node.z); break; case NODE_VALUE_V: - svm_node_value_v(kg, sd, stack, node.y, &offset); + offset = svm_node_value_v(kg, sd, stack, node.y, offset); break; case NODE_ATTR: - svm_node_attr(kg, sd, stack, node); + svm_node_attr<node_feature_mask>(kg, sd, stack, node); break; case NODE_VERTEX_COLOR: svm_node_vertex_color(kg, sd, stack, node.y, node.z, node.w); break; -# if NODES_FEATURE(NODE_FEATURE_BUMP) case NODE_GEOMETRY_BUMP_DX: - 
svm_node_geometry_bump_dx(kg, sd, stack, node.y, node.z); + if (KERNEL_NODES_FEATURE(BUMP)) { + svm_node_geometry_bump_dx(kg, sd, stack, node.y, node.z); + } break; case NODE_GEOMETRY_BUMP_DY: - svm_node_geometry_bump_dy(kg, sd, stack, node.y, node.z); + if (KERNEL_NODES_FEATURE(BUMP)) { + svm_node_geometry_bump_dy(kg, sd, stack, node.y, node.z); + } break; case NODE_SET_DISPLACEMENT: - svm_node_set_displacement(kg, sd, stack, node.y); + if (KERNEL_NODES_FEATURE(BUMP)) { + svm_node_set_displacement(kg, sd, stack, node.y); + } break; case NODE_DISPLACEMENT: - svm_node_displacement(kg, sd, stack, node); + if (KERNEL_NODES_FEATURE(BUMP)) { + svm_node_displacement(kg, sd, stack, node); + } break; case NODE_VECTOR_DISPLACEMENT: - svm_node_vector_displacement(kg, sd, stack, node, &offset); + if (KERNEL_NODES_FEATURE(BUMP)) { + offset = svm_node_vector_displacement(kg, sd, stack, node, offset); + } break; -# endif /* NODES_FEATURE(NODE_FEATURE_BUMP) */ case NODE_TEX_IMAGE: - svm_node_tex_image(kg, sd, stack, node, &offset); + offset = svm_node_tex_image(kg, sd, stack, node, offset); break; case NODE_TEX_IMAGE_BOX: svm_node_tex_image_box(kg, sd, stack, node); break; case NODE_TEX_NOISE: - svm_node_tex_noise(kg, sd, stack, node.y, node.z, node.w, &offset); + offset = svm_node_tex_noise(kg, sd, stack, node.y, node.z, node.w, offset); break; -# if NODES_FEATURE(NODE_FEATURE_BUMP) case NODE_SET_BUMP: - svm_node_set_bump(kg, sd, stack, node); + if (KERNEL_NODES_FEATURE(BUMP)) { + svm_node_set_bump(kg, sd, stack, node); + } break; case NODE_ATTR_BUMP_DX: - svm_node_attr_bump_dx(kg, sd, stack, node); + if (KERNEL_NODES_FEATURE(BUMP)) { + svm_node_attr_bump_dx(kg, sd, stack, node); + } break; case NODE_ATTR_BUMP_DY: - svm_node_attr_bump_dy(kg, sd, stack, node); + if (KERNEL_NODES_FEATURE(BUMP)) { + svm_node_attr_bump_dy(kg, sd, stack, node); + } break; case NODE_VERTEX_COLOR_BUMP_DX: - svm_node_vertex_color_bump_dx(kg, sd, stack, node.y, node.z, node.w); + if 
(KERNEL_NODES_FEATURE(BUMP)) { + svm_node_vertex_color_bump_dx(kg, sd, stack, node.y, node.z, node.w); + } break; case NODE_VERTEX_COLOR_BUMP_DY: - svm_node_vertex_color_bump_dy(kg, sd, stack, node.y, node.z, node.w); + if (KERNEL_NODES_FEATURE(BUMP)) { + svm_node_vertex_color_bump_dy(kg, sd, stack, node.y, node.z, node.w); + } break; case NODE_TEX_COORD_BUMP_DX: - svm_node_tex_coord_bump_dx(kg, sd, path_flag, stack, node, &offset); + if (KERNEL_NODES_FEATURE(BUMP)) { + offset = svm_node_tex_coord_bump_dx(kg, sd, path_flag, stack, node, offset); + } break; case NODE_TEX_COORD_BUMP_DY: - svm_node_tex_coord_bump_dy(kg, sd, path_flag, stack, node, &offset); + if (KERNEL_NODES_FEATURE(BUMP)) { + offset = svm_node_tex_coord_bump_dy(kg, sd, path_flag, stack, node, offset); + } break; case NODE_CLOSURE_SET_NORMAL: - svm_node_set_normal(kg, sd, stack, node.y, node.z); + if (KERNEL_NODES_FEATURE(BUMP)) { + svm_node_set_normal(kg, sd, stack, node.y, node.z); + } break; -# if NODES_FEATURE(NODE_FEATURE_BUMP_STATE) case NODE_ENTER_BUMP_EVAL: - svm_node_enter_bump_eval(kg, sd, stack, node.y); + if (KERNEL_NODES_FEATURE(BUMP_STATE)) { + svm_node_enter_bump_eval(kg, sd, stack, node.y); + } break; case NODE_LEAVE_BUMP_EVAL: - svm_node_leave_bump_eval(kg, sd, stack, node.y); + if (KERNEL_NODES_FEATURE(BUMP_STATE)) { + svm_node_leave_bump_eval(kg, sd, stack, node.y); + } break; -# endif /* NODES_FEATURE(NODE_FEATURE_BUMP_STATE) */ -# endif /* NODES_FEATURE(NODE_FEATURE_BUMP) */ case NODE_HSV: - svm_node_hsv(kg, sd, stack, node, &offset); + svm_node_hsv(kg, sd, stack, node); break; -#endif /* NODES_GROUP(NODE_GROUP_LEVEL_0) */ -#if NODES_GROUP(NODE_GROUP_LEVEL_1) case NODE_CLOSURE_HOLDOUT: svm_node_closure_holdout(sd, stack, node); break; @@ -384,22 +399,24 @@ ccl_device_noinline void svm_eval_nodes( case NODE_LAYER_WEIGHT: svm_node_layer_weight(sd, stack, node); break; -# if NODES_FEATURE(NODE_FEATURE_VOLUME) case NODE_CLOSURE_VOLUME: - svm_node_closure_volume(kg, sd, stack, node, 
type); + if (KERNEL_NODES_FEATURE(VOLUME)) { + svm_node_closure_volume<type>(kg, sd, stack, node); + } break; case NODE_PRINCIPLED_VOLUME: - svm_node_principled_volume(kg, sd, stack, node, type, path_flag, &offset); + if (KERNEL_NODES_FEATURE(VOLUME)) { + offset = svm_node_principled_volume<type>(kg, sd, stack, node, path_flag, offset); + } break; -# endif /* NODES_FEATURE(NODE_FEATURE_VOLUME) */ case NODE_MATH: - svm_node_math(kg, sd, stack, node.y, node.z, node.w, &offset); + svm_node_math(kg, sd, stack, node.y, node.z, node.w); break; case NODE_VECTOR_MATH: - svm_node_vector_math(kg, sd, stack, node.y, node.z, node.w, &offset); + offset = svm_node_vector_math(kg, sd, stack, node.y, node.z, node.w, offset); break; case NODE_RGB_RAMP: - svm_node_rgb_ramp(kg, sd, stack, node, &offset); + offset = svm_node_rgb_ramp(kg, sd, stack, node, offset); break; case NODE_GAMMA: svm_node_gamma(sd, stack, node.y, node.z, node.w); @@ -408,7 +425,7 @@ ccl_device_noinline void svm_eval_nodes( svm_node_brightness(sd, stack, node.y, node.z, node.w); break; case NODE_LIGHT_PATH: - svm_node_light_path(sd, state, stack, node.y, node.z, path_flag); + svm_node_light_path(INTEGRATOR_STATE_PASS, sd, stack, node.y, node.z, path_flag); break; case NODE_OBJECT_INFO: svm_node_object_info(kg, sd, stack, node.y, node.z); @@ -416,22 +433,22 @@ ccl_device_noinline void svm_eval_nodes( case NODE_PARTICLE_INFO: svm_node_particle_info(kg, sd, stack, node.y, node.z); break; -# if defined(__HAIR__) && NODES_FEATURE(NODE_FEATURE_HAIR) +#if defined(__HAIR__) case NODE_HAIR_INFO: - svm_node_hair_info(kg, sd, stack, node.y, node.z); + if (KERNEL_NODES_FEATURE(HAIR)) { + svm_node_hair_info(kg, sd, stack, node.y, node.z); + } break; -# endif /* NODES_FEATURE(NODE_FEATURE_HAIR) */ -#endif /* NODES_GROUP(NODE_GROUP_LEVEL_1) */ +#endif -#if NODES_GROUP(NODE_GROUP_LEVEL_2) case NODE_TEXTURE_MAPPING: - svm_node_texture_mapping(kg, sd, stack, node.y, node.z, &offset); + offset = svm_node_texture_mapping(kg, sd, 
stack, node.y, node.z, offset); break; case NODE_MAPPING: - svm_node_mapping(kg, sd, stack, node.y, node.z, node.w, &offset); + svm_node_mapping(kg, sd, stack, node.y, node.z, node.w); break; case NODE_MIN_MAX: - svm_node_min_max(kg, sd, stack, node.y, node.z, &offset); + offset = svm_node_min_max(kg, sd, stack, node.y, node.z, offset); break; case NODE_CAMERA: svm_node_camera(kg, sd, stack, node.y, node.z, node.w); @@ -440,47 +457,46 @@ ccl_device_noinline void svm_eval_nodes( svm_node_tex_environment(kg, sd, stack, node); break; case NODE_TEX_SKY: - svm_node_tex_sky(kg, sd, stack, node, &offset); + offset = svm_node_tex_sky(kg, sd, stack, node, offset); break; case NODE_TEX_GRADIENT: svm_node_tex_gradient(sd, stack, node); break; case NODE_TEX_VORONOI: - svm_node_tex_voronoi(kg, sd, stack, node.y, node.z, node.w, &offset); + offset = svm_node_tex_voronoi<node_feature_mask>( + kg, sd, stack, node.y, node.z, node.w, offset); break; case NODE_TEX_MUSGRAVE: - svm_node_tex_musgrave(kg, sd, stack, node.y, node.z, node.w, &offset); + offset = svm_node_tex_musgrave(kg, sd, stack, node.y, node.z, node.w, offset); break; case NODE_TEX_WAVE: - svm_node_tex_wave(kg, sd, stack, node, &offset); + offset = svm_node_tex_wave(kg, sd, stack, node, offset); break; case NODE_TEX_MAGIC: - svm_node_tex_magic(kg, sd, stack, node, &offset); + offset = svm_node_tex_magic(kg, sd, stack, node, offset); break; case NODE_TEX_CHECKER: svm_node_tex_checker(kg, sd, stack, node); break; case NODE_TEX_BRICK: - svm_node_tex_brick(kg, sd, stack, node, &offset); + offset = svm_node_tex_brick(kg, sd, stack, node, offset); break; case NODE_TEX_WHITE_NOISE: - svm_node_tex_white_noise(kg, sd, stack, node.y, node.z, node.w, &offset); + svm_node_tex_white_noise(kg, sd, stack, node.y, node.z, node.w); break; case NODE_NORMAL: - svm_node_normal(kg, sd, stack, node.y, node.z, node.w, &offset); + offset = svm_node_normal(kg, sd, stack, node.y, node.z, node.w, offset); break; case NODE_LIGHT_FALLOFF: 
svm_node_light_falloff(sd, stack, node); break; case NODE_IES: - svm_node_ies(kg, sd, stack, node, &offset); + svm_node_ies(kg, sd, stack, node); break; -#endif /* NODES_GROUP(NODE_GROUP_LEVEL_2) */ -#if NODES_GROUP(NODE_GROUP_LEVEL_3) case NODE_RGB_CURVES: case NODE_VECTOR_CURVES: - svm_node_curves(kg, sd, stack, node, &offset); + offset = svm_node_curves(kg, sd, stack, node, offset); break; case NODE_TANGENT: svm_node_tangent(kg, sd, stack, node); @@ -492,7 +508,7 @@ ccl_device_noinline void svm_eval_nodes( svm_node_invert(sd, stack, node.y, node.z, node.w); break; case NODE_MIX: - svm_node_mix(kg, sd, stack, node.y, node.z, node.w, &offset); + offset = svm_node_mix(kg, sd, stack, node.y, node.z, node.w, offset); break; case NODE_SEPARATE_VECTOR: svm_node_separate_vector(sd, stack, node.y, node.z, node.w); @@ -501,10 +517,10 @@ ccl_device_noinline void svm_eval_nodes( svm_node_combine_vector(sd, stack, node.y, node.z, node.w); break; case NODE_SEPARATE_HSV: - svm_node_separate_hsv(kg, sd, stack, node.y, node.z, node.w, &offset); + offset = svm_node_separate_hsv(kg, sd, stack, node.y, node.z, node.w, offset); break; case NODE_COMBINE_HSV: - svm_node_combine_hsv(kg, sd, stack, node.y, node.z, node.w, &offset); + offset = svm_node_combine_hsv(kg, sd, stack, node.y, node.z, node.w, offset); break; case NODE_VECTOR_ROTATE: svm_node_vector_rotate(sd, stack, node.y, node.z, node.w); @@ -522,39 +538,36 @@ ccl_device_noinline void svm_eval_nodes( svm_node_blackbody(kg, sd, stack, node.y, node.z); break; case NODE_MAP_RANGE: - svm_node_map_range(kg, sd, stack, node.y, node.z, node.w, &offset); + offset = svm_node_map_range(kg, sd, stack, node.y, node.z, node.w, offset); break; case NODE_CLAMP: - svm_node_clamp(kg, sd, stack, node.y, node.z, node.w, &offset); + offset = svm_node_clamp(kg, sd, stack, node.y, node.z, node.w, offset); break; -# ifdef __SHADER_RAYTRACE__ +#ifdef __SHADER_RAYTRACE__ case NODE_BEVEL: - svm_node_bevel(kg, sd, state, stack, node); + 
svm_node_bevel<node_feature_mask>(INTEGRATOR_STATE_PASS, sd, stack, node); break; case NODE_AMBIENT_OCCLUSION: - svm_node_ao(kg, sd, state, stack, node); + svm_node_ao<node_feature_mask>(INTEGRATOR_STATE_PASS, sd, stack, node); break; -# endif /* __SHADER_RAYTRACE__ */ -#endif /* NODES_GROUP(NODE_GROUP_LEVEL_3) */ +#endif -#if NODES_GROUP(NODE_GROUP_LEVEL_4) -# if NODES_FEATURE(NODE_FEATURE_VOLUME) case NODE_TEX_VOXEL: - svm_node_tex_voxel(kg, sd, stack, node, &offset); + if (KERNEL_NODES_FEATURE(VOLUME)) { + offset = svm_node_tex_voxel(kg, sd, stack, node, offset); + } break; -# endif /* NODES_FEATURE(NODE_FEATURE_VOLUME) */ case NODE_AOV_START: - if (!svm_node_aov_check(state, buffer)) { + if (!svm_node_aov_check(path_flag, render_buffer)) { return; } break; case NODE_AOV_COLOR: - svm_node_aov_color(kg, sd, stack, node, buffer); + svm_node_aov_color(INTEGRATOR_STATE_PASS, sd, stack, node, render_buffer); break; case NODE_AOV_VALUE: - svm_node_aov_value(kg, sd, stack, node, buffer); + svm_node_aov_value(INTEGRATOR_STATE_PASS, sd, stack, node, render_buffer); break; -#endif /* NODES_GROUP(NODE_GROUP_LEVEL_4) */ default: kernel_assert(!"Unknown node type was passed to the SVM machine"); return; diff --git a/intern/cycles/kernel/svm/svm_ao.h b/intern/cycles/kernel/svm/svm_ao.h index 4cb986b897a..34ac2cb8fbf 100644 --- a/intern/cycles/kernel/svm/svm_ao.h +++ b/intern/cycles/kernel/svm/svm_ao.h @@ -14,20 +14,25 @@ * limitations under the License. 
*/ +#include "kernel/bvh/bvh.h" + CCL_NAMESPACE_BEGIN #ifdef __SHADER_RAYTRACE__ -ccl_device_noinline float svm_ao(KernelGlobals *kg, - ShaderData *sd, - float3 N, - ccl_addr_space PathState *state, - float max_dist, - int num_samples, - int flags) +# ifdef __KERNEL_OPTIX__ +extern "C" __device__ float __direct_callable__svm_node_ao(INTEGRATOR_STATE_CONST_ARGS, +# else +ccl_device float svm_ao(INTEGRATOR_STATE_CONST_ARGS, +# endif + ShaderData *sd, + float3 N, + float max_dist, + int num_samples, + int flags) { if (flags & NODE_AO_GLOBAL_RADIUS) { - max_dist = kernel_data.background.ao_distance; + max_dist = kernel_data.integrator.ao_bounces_distance; } /* Early out if no sampling needed. */ @@ -47,11 +52,14 @@ ccl_device_noinline float svm_ao(KernelGlobals *kg, float3 T, B; make_orthonormals(N, &T, &B); + /* TODO: support ray-tracing in shadow shader evaluation? */ + RNGState rng_state; + path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state); + int unoccluded = 0; for (int sample = 0; sample < num_samples; sample++) { float disk_u, disk_v; - path_branched_rng_2D( - kg, state->rng_hash, state, sample, num_samples, PRNG_BEVEL_U, &disk_u, &disk_v); + path_branched_rng_2D(kg, &rng_state, sample, num_samples, PRNG_BEVEL_U, &disk_u, &disk_v); float2 d = concentric_sample_disk(disk_u, disk_v); float3 D = make_float3(d.x, d.y, safe_sqrtf(1.0f - dot(d, d))); @@ -62,8 +70,8 @@ ccl_device_noinline float svm_ao(KernelGlobals *kg, ray.D = D.x * T + D.y * B + D.z * N; ray.t = max_dist; ray.time = sd->time; - ray.dP = sd->dP; - ray.dD = differential3_zero(); + ray.dP = differential_zero_compact(); + ray.dD = differential_zero_compact(); if (flags & NODE_AO_ONLY_LOCAL) { if (!scene_intersect_local(kg, &ray, NULL, sd->object, NULL, 0)) { @@ -81,8 +89,14 @@ ccl_device_noinline float svm_ao(KernelGlobals *kg, return ((float)unoccluded) / num_samples; } -ccl_device void svm_node_ao( - KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, float *stack, uint4 node) 
+template<uint node_feature_mask> +# if defined(__KERNEL_OPTIX__) +ccl_device_inline +# else +ccl_device_noinline +# endif + void + svm_node_ao(INTEGRATOR_STATE_CONST_ARGS, ShaderData *sd, float *stack, uint4 node) { uint flags, dist_offset, normal_offset, out_ao_offset; svm_unpack_node_uchar4(node.y, &flags, &dist_offset, &normal_offset, &out_ao_offset); @@ -92,7 +106,16 @@ ccl_device void svm_node_ao( float dist = stack_load_float_default(stack, dist_offset, node.w); float3 normal = stack_valid(normal_offset) ? stack_load_float3(stack, normal_offset) : sd->N; - float ao = svm_ao(kg, sd, normal, state, dist, samples, flags); + + float ao = 1.0f; + + if (KERNEL_NODES_FEATURE(RAYTRACE)) { +# ifdef __KERNEL_OPTIX__ + ao = optixDirectCall<float>(0, INTEGRATOR_STATE_PASS, sd, normal, dist, samples, flags); +# else + ao = svm_ao(INTEGRATOR_STATE_PASS, sd, normal, dist, samples, flags); +# endif + } if (stack_valid(out_ao_offset)) { stack_store_float(stack, out_ao_offset, ao); diff --git a/intern/cycles/kernel/svm/svm_aov.h b/intern/cycles/kernel/svm/svm_aov.h index 899e466d099..26dec9717b3 100644 --- a/intern/cycles/kernel/svm/svm_aov.h +++ b/intern/cycles/kernel/svm/svm_aov.h @@ -14,36 +14,50 @@ * limitations under the License. 
*/ +#include "kernel/kernel_write_passes.h" + CCL_NAMESPACE_BEGIN -ccl_device_inline bool svm_node_aov_check(ccl_addr_space PathState *state, - ccl_global float *buffer) +ccl_device_inline bool svm_node_aov_check(const int path_flag, ccl_global float *render_buffer) { - int path_flag = state->flag; - bool is_primary = (path_flag & PATH_RAY_CAMERA) && (!(path_flag & PATH_RAY_SINGLE_PASS_DONE)); - return ((buffer != NULL) && is_primary); + return ((render_buffer != NULL) && is_primary); } -ccl_device void svm_node_aov_color( - KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, ccl_global float *buffer) +ccl_device void svm_node_aov_color(INTEGRATOR_STATE_CONST_ARGS, + ShaderData *sd, + float *stack, + uint4 node, + ccl_global float *render_buffer) { float3 val = stack_load_float3(stack, node.y); - if (buffer) { - kernel_write_pass_float4(buffer + kernel_data.film.pass_aov_color + 4 * node.z, - make_float4(val.x, val.y, val.z, 1.0f)); + if (render_buffer && !INTEGRATOR_STATE_IS_NULL) { + const uint32_t render_pixel_index = INTEGRATOR_STATE(path, render_pixel_index); + const uint64_t render_buffer_offset = (uint64_t)render_pixel_index * + kernel_data.film.pass_stride; + ccl_global float *buffer = render_buffer + render_buffer_offset + + (kernel_data.film.pass_aov_color + node.z); + kernel_write_pass_float3(buffer, make_float3(val.x, val.y, val.z)); } } -ccl_device void svm_node_aov_value( - KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, ccl_global float *buffer) +ccl_device void svm_node_aov_value(INTEGRATOR_STATE_CONST_ARGS, + ShaderData *sd, + float *stack, + uint4 node, + ccl_global float *render_buffer) { float val = stack_load_float(stack, node.y); - if (buffer) { - kernel_write_pass_float(buffer + kernel_data.film.pass_aov_value + node.z, val); + if (render_buffer && !INTEGRATOR_STATE_IS_NULL) { + const uint32_t render_pixel_index = INTEGRATOR_STATE(path, render_pixel_index); + const uint64_t render_buffer_offset = 
(uint64_t)render_pixel_index * + kernel_data.film.pass_stride; + ccl_global float *buffer = render_buffer + render_buffer_offset + + (kernel_data.film.pass_aov_value + node.z); + kernel_write_pass_float(buffer, val); } } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_attribute.h b/intern/cycles/kernel/svm/svm_attribute.h index 62740824ad1..5f94b20af73 100644 --- a/intern/cycles/kernel/svm/svm_attribute.h +++ b/intern/cycles/kernel/svm/svm_attribute.h @@ -18,8 +18,11 @@ CCL_NAMESPACE_BEGIN /* Attribute Node */ -ccl_device AttributeDescriptor svm_node_attr_init( - KernelGlobals *kg, ShaderData *sd, uint4 node, NodeAttributeOutputType *type, uint *out_offset) +ccl_device AttributeDescriptor svm_node_attr_init(const KernelGlobals *kg, + ShaderData *sd, + uint4 node, + NodeAttributeOutputType *type, + uint *out_offset) { *out_offset = node.z; *type = (NodeAttributeOutputType)node.w; @@ -44,31 +47,37 @@ ccl_device AttributeDescriptor svm_node_attr_init( return desc; } -ccl_device void svm_node_attr(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node) +template<uint node_feature_mask> +ccl_device_noinline void svm_node_attr(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint4 node) { NodeAttributeOutputType type = NODE_ATTR_OUTPUT_FLOAT; uint out_offset = 0; AttributeDescriptor desc = svm_node_attr_init(kg, sd, node, &type, &out_offset); #ifdef __VOLUME__ - /* Volumes - * NOTE: moving this into its own node type might help improve performance. */ - if (primitive_is_volume_attribute(sd, desc)) { - const float4 value = volume_attribute_float4(kg, sd, desc); + if (KERNEL_NODES_FEATURE(VOLUME)) { + /* Volumes + * NOTE: moving this into its own node type might help improve performance. 
*/ + if (primitive_is_volume_attribute(sd, desc)) { + const float4 value = volume_attribute_float4(kg, sd, desc); - if (type == NODE_ATTR_OUTPUT_FLOAT) { - const float f = volume_attribute_value_to_float(value); - stack_store_float(stack, out_offset, f); - } - else if (type == NODE_ATTR_OUTPUT_FLOAT3) { - const float3 f = volume_attribute_value_to_float3(value); - stack_store_float3(stack, out_offset, f); + if (type == NODE_ATTR_OUTPUT_FLOAT) { + const float f = volume_attribute_value_to_float(value); + stack_store_float(stack, out_offset, f); + } + else if (type == NODE_ATTR_OUTPUT_FLOAT3) { + const float3 f = volume_attribute_value_to_float3(value); + stack_store_float3(stack, out_offset, f); + } + else { + const float f = volume_attribute_value_to_alpha(value); + stack_store_float(stack, out_offset, f); + } + return; } - else { - const float f = volume_attribute_value_to_alpha(value); - stack_store_float(stack, out_offset, f); - } - return; } #endif @@ -139,7 +148,10 @@ ccl_device void svm_node_attr(KernelGlobals *kg, ShaderData *sd, float *stack, u } } -ccl_device void svm_node_attr_bump_dx(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node) +ccl_device_noinline void svm_node_attr_bump_dx(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint4 node) { NodeAttributeOutputType type = NODE_ATTR_OUTPUT_FLOAT; uint out_offset = 0; @@ -232,7 +244,10 @@ ccl_device void svm_node_attr_bump_dx(KernelGlobals *kg, ShaderData *sd, float * } } -ccl_device void svm_node_attr_bump_dy(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node) +ccl_device_noinline void svm_node_attr_bump_dy(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint4 node) { NodeAttributeOutputType type = NODE_ATTR_OUTPUT_FLOAT; uint out_offset = 0; diff --git a/intern/cycles/kernel/svm/svm_bevel.h b/intern/cycles/kernel/svm/svm_bevel.h index bf5957ec9e4..aab089d19ea 100644 --- a/intern/cycles/kernel/svm/svm_bevel.h +++ b/intern/cycles/kernel/svm/svm_bevel.h @@ -14,21 
+14,95 @@ * limitations under the License. */ +#include "kernel/bvh/bvh.h" +#include "kernel/kernel_montecarlo.h" +#include "kernel/kernel_random.h" + CCL_NAMESPACE_BEGIN #ifdef __SHADER_RAYTRACE__ +/* Planar Cubic BSSRDF falloff, reused for bevel. + * + * This is basically (Rm - x)^3, with some factors to normalize it. For sampling + * we integrate 2*pi*x * (Rm - x)^3, which gives us a quintic equation that as + * far as I can tell has no closed form solution. So we get an iterative solution + * instead with newton-raphson. */ + +ccl_device float svm_bevel_cubic_eval(const float radius, float r) +{ + const float Rm = radius; + + if (r >= Rm) + return 0.0f; + + /* integrate (2*pi*r * 10*(R - r)^3)/(pi * R^5) from 0 to R = 1 */ + const float Rm5 = (Rm * Rm) * (Rm * Rm) * Rm; + const float f = Rm - r; + const float num = f * f * f; + + return (10.0f * num) / (Rm5 * M_PI_F); +} + +ccl_device float svm_bevel_cubic_pdf(const float radius, float r) +{ + return svm_bevel_cubic_eval(radius, r); +} + +/* solve 10x^2 - 20x^3 + 15x^4 - 4x^5 - xi == 0 */ +ccl_device_forceinline float svm_bevel_cubic_quintic_root_find(float xi) +{ + /* newton-raphson iteration, usually succeeds in 2-4 iterations, except + * outside 0.02 ... 
0.98 where it can go up to 10, so overall performance + * should not be too bad */ + const float tolerance = 1e-6f; + const int max_iteration_count = 10; + float x = 0.25f; + int i; + + for (i = 0; i < max_iteration_count; i++) { + float x2 = x * x; + float x3 = x2 * x; + float nx = (1.0f - x); + + float f = 10.0f * x2 - 20.0f * x3 + 15.0f * x2 * x2 - 4.0f * x2 * x3 - xi; + float f_ = 20.0f * (x * nx) * (nx * nx); + + if (fabsf(f) < tolerance || f_ == 0.0f) + break; + + x = saturate(x - f / f_); + } + + return x; +} + +ccl_device void svm_bevel_cubic_sample(const float radius, float xi, float *r, float *h) +{ + float Rm = radius; + float r_ = svm_bevel_cubic_quintic_root_find(xi); + + r_ *= Rm; + *r = r_; + + /* h^2 + r^2 = Rm^2 */ + *h = safe_sqrtf(Rm * Rm - r_ * r_); +} + /* Bevel shader averaging normals from nearby surfaces. * * Sampling strategy from: BSSRDF Importance Sampling, SIGGRAPH 2013 * http://library.imageworks.com/pdfs/imageworks-library-BSSRDF-sampling.pdf */ -ccl_device_noinline float3 svm_bevel(KernelGlobals *kg, - ShaderData *sd, - ccl_addr_space PathState *state, - float radius, - int num_samples) +# ifdef __KERNEL_OPTIX__ +extern "C" __device__ float3 __direct_callable__svm_node_bevel(INTEGRATOR_STATE_CONST_ARGS, +# else +ccl_device float3 svm_bevel(INTEGRATOR_STATE_CONST_ARGS, +# endif + ShaderData *sd, + float radius, + int num_samples) { /* Early out if no sampling needed. */ if (radius <= 0.0f || num_samples < 1 || sd->object == OBJECT_NONE) { @@ -41,21 +115,27 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg, } /* Don't bevel for blurry indirect rays. */ - if (state->min_ray_pdf < 8.0f) { + if (INTEGRATOR_STATE(path, min_ray_pdf) < 8.0f) { return sd->N; } /* Setup for multi intersection. 
*/ LocalIntersection isect; - uint lcg_state = lcg_state_init_addrspace(state, 0x64c6a40e); + uint lcg_state = lcg_state_init(INTEGRATOR_STATE(path, rng_hash), + INTEGRATOR_STATE(path, rng_offset), + INTEGRATOR_STATE(path, sample), + 0x64c6a40e); /* Sample normals from surrounding points on surface. */ float3 sum_N = make_float3(0.0f, 0.0f, 0.0f); + /* TODO: support ray-tracing in shadow shader evaluation? */ + RNGState rng_state; + path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state); + for (int sample = 0; sample < num_samples; sample++) { float disk_u, disk_v; - path_branched_rng_2D( - kg, state->rng_hash, state, sample, num_samples, PRNG_BEVEL_U, &disk_u, &disk_v); + path_branched_rng_2D(kg, &rng_state, sample, num_samples, PRNG_BEVEL_U, &disk_u, &disk_v); /* Pick random axis in local frame and point on disk. */ float3 disk_N, disk_T, disk_B; @@ -97,7 +177,7 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg, float disk_height; /* Perhaps find something better than Cubic BSSRDF, but happens to work well. */ - bssrdf_cubic_sample(radius, 0.0f, disk_r, &disk_r, &disk_height); + svm_bevel_cubic_sample(radius, disk_r, &disk_r, &disk_height); float3 disk_P = (disk_r * cosf(phi)) * disk_T + (disk_r * sinf(phi)) * disk_B; @@ -106,8 +186,8 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg, ray->P = sd->P + disk_N * disk_height + disk_P; ray->D = -disk_N; ray->t = 2.0f * disk_height; - ray->dP = sd->dP; - ray->dD = differential3_zero(); + ray->dP = differential_zero_compact(); + ray->dD = differential_zero_compact(); ray->time = sd->time; /* Intersect with the same object. if multiple intersections are found it @@ -120,14 +200,16 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg, /* Quickly retrieve P and Ng without setting up ShaderData. 
*/ float3 hit_P; if (sd->type & PRIMITIVE_TRIANGLE) { - hit_P = triangle_refine_local(kg, sd, &isect.hits[hit], ray); + hit_P = triangle_refine_local( + kg, sd, ray->P, ray->D, ray->t, isect.hits[hit].object, isect.hits[hit].prim); } # ifdef __OBJECT_MOTION__ else if (sd->type & PRIMITIVE_MOTION_TRIANGLE) { float3 verts[3]; motion_triangle_vertices( kg, sd->object, kernel_tex_fetch(__prim_index, isect.hits[hit].prim), sd->time, verts); - hit_P = motion_triangle_refine_local(kg, sd, &isect.hits[hit], ray, verts); + hit_P = motion_triangle_refine_local( + kg, sd, ray->P, ray->D, ray->t, isect.hits[hit].object, isect.hits[hit].prim, verts); } # endif /* __OBJECT_MOTION__ */ @@ -183,8 +265,8 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg, float r = len(hit_P - sd->P); /* Compute weight. */ - float pdf = bssrdf_cubic_pdf(radius, 0.0f, r); - float disk_pdf = bssrdf_cubic_pdf(radius, 0.0f, disk_r); + float pdf = svm_bevel_cubic_pdf(radius, r); + float disk_pdf = svm_bevel_cubic_pdf(radius, disk_r); w *= pdf / disk_pdf; @@ -198,19 +280,34 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg, return is_zero(N) ? sd->N : (sd->flag & SD_BACKFACING) ? -N : N; } -ccl_device void svm_node_bevel( - KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, float *stack, uint4 node) +template<uint node_feature_mask> +# if defined(__KERNEL_OPTIX__) +ccl_device_inline +# else +ccl_device_noinline +# endif + void + svm_node_bevel(INTEGRATOR_STATE_CONST_ARGS, ShaderData *sd, float *stack, uint4 node) { uint num_samples, radius_offset, normal_offset, out_offset; svm_unpack_node_uchar4(node.y, &num_samples, &radius_offset, &normal_offset, &out_offset); float radius = stack_load_float(stack, radius_offset); - float3 bevel_N = svm_bevel(kg, sd, state, radius, num_samples); - if (stack_valid(normal_offset)) { - /* Preserve input normal. 
*/ - float3 ref_N = stack_load_float3(stack, normal_offset); - bevel_N = normalize(ref_N + (bevel_N - sd->N)); + float3 bevel_N = sd->N; + + if (KERNEL_NODES_FEATURE(RAYTRACE)) { +# ifdef __KERNEL_OPTIX__ + bevel_N = optixDirectCall<float3>(1, INTEGRATOR_STATE_PASS, sd, radius, num_samples); +# else + bevel_N = svm_bevel(INTEGRATOR_STATE_PASS, sd, radius, num_samples); +# endif + + if (stack_valid(normal_offset)) { + /* Preserve input normal. */ + float3 ref_N = stack_load_float3(stack, normal_offset); + bevel_N = normalize(ref_N + (bevel_N - sd->N)); + } } stack_store_float3(stack, out_offset, bevel_N); diff --git a/intern/cycles/kernel/svm/svm_blackbody.h b/intern/cycles/kernel/svm/svm_blackbody.h index adfc50d961e..96b3703b954 100644 --- a/intern/cycles/kernel/svm/svm_blackbody.h +++ b/intern/cycles/kernel/svm/svm_blackbody.h @@ -34,8 +34,11 @@ CCL_NAMESPACE_BEGIN /* Blackbody Node */ -ccl_device void svm_node_blackbody( - KernelGlobals *kg, ShaderData *sd, float *stack, uint temperature_offset, uint col_offset) +ccl_device_noinline void svm_node_blackbody(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint temperature_offset, + uint col_offset) { /* Input */ float temperature = stack_load_float(stack, temperature_offset); diff --git a/intern/cycles/kernel/svm/svm_brick.h b/intern/cycles/kernel/svm/svm_brick.h index 6984afa30a5..dca1b220dd5 100644 --- a/intern/cycles/kernel/svm/svm_brick.h +++ b/intern/cycles/kernel/svm/svm_brick.h @@ -72,12 +72,12 @@ ccl_device_noinline_cpu float2 svm_brick(float3 p, return make_float2(tint, mortar); } -ccl_device void svm_node_tex_brick( - KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset) +ccl_device_noinline int svm_node_tex_brick( + const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset) { - uint4 node2 = read_node(kg, offset); - uint4 node3 = read_node(kg, offset); - uint4 node4 = read_node(kg, offset); + uint4 node2 = read_node(kg, &offset); + uint4 node3 = 
read_node(kg, &offset); + uint4 node4 = read_node(kg, &offset); /* Input and Output Sockets */ uint co_offset, color1_offset, color2_offset, mortar_offset, scale_offset; @@ -133,6 +133,7 @@ ccl_device void svm_node_tex_brick( stack_store_float3(stack, color_offset, color1 * (1.0f - f) + mortar * f); if (stack_valid(fac_offset)) stack_store_float(stack, fac_offset, f); + return offset; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_brightness.h b/intern/cycles/kernel/svm/svm_brightness.h index 9554b5946fb..2ed812acd71 100644 --- a/intern/cycles/kernel/svm/svm_brightness.h +++ b/intern/cycles/kernel/svm/svm_brightness.h @@ -16,7 +16,7 @@ CCL_NAMESPACE_BEGIN -ccl_device void svm_node_brightness( +ccl_device_noinline void svm_node_brightness( ShaderData *sd, float *stack, uint in_color, uint out_color, uint node) { uint bright_offset, contrast_offset; diff --git a/intern/cycles/kernel/svm/svm_bump.h b/intern/cycles/kernel/svm/svm_bump.h index c9d430a2bba..8672839dbab 100644 --- a/intern/cycles/kernel/svm/svm_bump.h +++ b/intern/cycles/kernel/svm/svm_bump.h @@ -18,10 +18,10 @@ CCL_NAMESPACE_BEGIN /* Bump Eval Nodes */ -ccl_device void svm_node_enter_bump_eval(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint offset) +ccl_device_noinline void svm_node_enter_bump_eval(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint offset) { /* save state */ stack_store_float3(stack, offset + 0, sd->P); @@ -45,10 +45,10 @@ ccl_device void svm_node_enter_bump_eval(KernelGlobals *kg, } } -ccl_device void svm_node_leave_bump_eval(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint offset) +ccl_device_noinline void svm_node_leave_bump_eval(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint offset) { /* restore state */ sd->P = stack_load_float3(stack, offset + 0); diff --git a/intern/cycles/kernel/svm/svm_camera.h b/intern/cycles/kernel/svm/svm_camera.h index 21a17acf5f1..40c0edcdad0 100644 --- 
a/intern/cycles/kernel/svm/svm_camera.h +++ b/intern/cycles/kernel/svm/svm_camera.h @@ -16,12 +16,12 @@ CCL_NAMESPACE_BEGIN -ccl_device void svm_node_camera(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint out_vector, - uint out_zdepth, - uint out_distance) +ccl_device_noinline void svm_node_camera(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint out_vector, + uint out_zdepth, + uint out_distance) { float distance; float zdepth; diff --git a/intern/cycles/kernel/svm/svm_checker.h b/intern/cycles/kernel/svm/svm_checker.h index d54cb73df91..a9919c9ddc9 100644 --- a/intern/cycles/kernel/svm/svm_checker.h +++ b/intern/cycles/kernel/svm/svm_checker.h @@ -32,7 +32,10 @@ ccl_device float svm_checker(float3 p) return ((xi % 2 == yi % 2) == (zi % 2)) ? 1.0f : 0.0f; } -ccl_device void svm_node_tex_checker(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node) +ccl_device_noinline void svm_node_tex_checker(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint4 node) { uint co_offset, color1_offset, color2_offset, scale_offset; uint color_offset, fac_offset; diff --git a/intern/cycles/kernel/svm/svm_clamp.h b/intern/cycles/kernel/svm/svm_clamp.h index a85fd82754e..656bd31c085 100644 --- a/intern/cycles/kernel/svm/svm_clamp.h +++ b/intern/cycles/kernel/svm/svm_clamp.h @@ -18,18 +18,18 @@ CCL_NAMESPACE_BEGIN /* Clamp Node */ -ccl_device void svm_node_clamp(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint value_stack_offset, - uint parameters_stack_offsets, - uint result_stack_offset, - int *offset) +ccl_device_noinline int svm_node_clamp(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint value_stack_offset, + uint parameters_stack_offsets, + uint result_stack_offset, + int offset) { uint min_stack_offset, max_stack_offset, type; svm_unpack_node_uchar3(parameters_stack_offsets, &min_stack_offset, &max_stack_offset, &type); - uint4 defaults = read_node(kg, offset); + uint4 defaults = read_node(kg, &offset); 
float value = stack_load_float(stack, value_stack_offset); float min = stack_load_float_default(stack, min_stack_offset, defaults.x); @@ -41,6 +41,7 @@ ccl_device void svm_node_clamp(KernelGlobals *kg, else { stack_store_float(stack, result_stack_offset, clamp(value, min, max)); } + return offset; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h index bbe8d72edf0..e2f6dde4ace 100644 --- a/intern/cycles/kernel/svm/svm_closure.h +++ b/intern/cycles/kernel/svm/svm_closure.h @@ -57,13 +57,9 @@ ccl_device void svm_node_glass_setup( } } -ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint4 node, - ShaderType shader_type, - int path_flag, - int *offset) +template<uint node_feature_mask, ShaderType shader_type> +ccl_device_noinline int svm_node_closure_bsdf( + const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int path_flag, int offset) { uint type, param1_offset, param2_offset; @@ -73,19 +69,19 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, 1.0f); /* note we read this extra node before weight check, so offset is added */ - uint4 data_node = read_node(kg, offset); + uint4 data_node = read_node(kg, &offset); /* Only compute BSDF for surfaces, transparent variable is shared with volume extinction. */ - if (mix_weight == 0.0f || shader_type != SHADER_TYPE_SURFACE) { + if ((!KERNEL_NODES_FEATURE(BSDF) || shader_type != SHADER_TYPE_SURFACE) || mix_weight == 0.0f) { if (type == CLOSURE_BSDF_PRINCIPLED_ID) { /* Read all principled BSDF extra data to get the right offset. */ - read_node(kg, offset); - read_node(kg, offset); - read_node(kg, offset); - read_node(kg, offset); + read_node(kg, &offset); + read_node(kg, &offset); + read_node(kg, &offset); + read_node(kg, &offset); } - return; + return offset; } float3 N = stack_valid(data_node.x) ? 
stack_load_float3(stack, data_node.x) : sd->N; @@ -102,7 +98,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, sheen_offset, sheen_tint_offset, clearcoat_offset, clearcoat_roughness_offset, eta_offset, transmission_offset, anisotropic_rotation_offset, transmission_roughness_offset; - uint4 data_node2 = read_node(kg, offset); + uint4 data_node2 = read_node(kg, &offset); float3 T = stack_load_float3(stack, data_node.y); svm_unpack_node_uchar4(data_node.z, @@ -158,7 +154,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, float specular_weight = (1.0f - final_transmission); // get the base color - uint4 data_base_color = read_node(kg, offset); + uint4 data_base_color = read_node(kg, &offset); float3 base_color = stack_valid(data_base_color.x) ? stack_load_float3(stack, data_base_color.x) : make_float3(__uint_as_float(data_base_color.y), @@ -166,16 +162,21 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, __uint_as_float(data_base_color.w)); // get the additional clearcoat normal and subsurface scattering radius - uint4 data_cn_ssr = read_node(kg, offset); + uint4 data_cn_ssr = read_node(kg, &offset); float3 clearcoat_normal = stack_valid(data_cn_ssr.x) ? stack_load_float3(stack, data_cn_ssr.x) : sd->N; float3 subsurface_radius = stack_valid(data_cn_ssr.y) ? stack_load_float3(stack, data_cn_ssr.y) : make_float3(1.0f, 1.0f, 1.0f); + float subsurface_ior = stack_valid(data_cn_ssr.z) ? stack_load_float(stack, data_cn_ssr.z) : + 1.4f; + float subsurface_anisotropy = stack_valid(data_cn_ssr.w) ? + stack_load_float(stack, data_cn_ssr.w) : + 0.0f; // get the subsurface color - uint4 data_subsurface_color = read_node(kg, offset); + uint4 data_subsurface_color = read_node(kg, &offset); float3 subsurface_color = stack_valid(data_subsurface_color.x) ? 
stack_load_float3(stack, data_subsurface_color.x) : make_float3(__uint_as_float(data_subsurface_color.y), @@ -222,16 +223,16 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, if (bssrdf) { bssrdf->radius = subsurface_radius * subsurface; - bssrdf->albedo = (subsurface_method == CLOSURE_BSSRDF_PRINCIPLED_ID) ? - subsurface_color : - mixed_ss_base_color; - bssrdf->texture_blur = 0.0f; - bssrdf->sharpness = 0.0f; + bssrdf->albedo = mixed_ss_base_color; bssrdf->N = N; bssrdf->roughness = roughness; + /* Clamps protecting against bad/extreme and non physical values. */ + subsurface_ior = clamp(subsurface_ior, 1.01f, 3.8f); + bssrdf->anisotropy = clamp(subsurface_anisotropy, 0.0f, 0.9f); + /* setup bsdf */ - sd->flag |= bssrdf_setup(sd, bssrdf, subsurface_method); + sd->flag |= bssrdf_setup(sd, bssrdf, subsurface_method, subsurface_ior); } } } @@ -733,9 +734,9 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, } #ifdef __HAIR__ case CLOSURE_BSDF_HAIR_PRINCIPLED_ID: { - uint4 data_node2 = read_node(kg, offset); - uint4 data_node3 = read_node(kg, offset); - uint4 data_node4 = read_node(kg, offset); + uint4 data_node2 = read_node(kg, &offset); + uint4 data_node3 = read_node(kg, &offset); + uint4 data_node4 = read_node(kg, &offset); float3 weight = sd->svm_closure_weight * mix_weight; @@ -878,10 +879,8 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, #endif /* __HAIR__ */ #ifdef __SUBSURFACE__ - case CLOSURE_BSSRDF_CUBIC_ID: - case CLOSURE_BSSRDF_GAUSSIAN_ID: - case CLOSURE_BSSRDF_BURLEY_ID: - case CLOSURE_BSSRDF_RANDOM_WALK_ID: { + case CLOSURE_BSSRDF_RANDOM_WALK_ID: + case CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID: { float3 weight = sd->svm_closure_weight * mix_weight; Bssrdf *bssrdf = bssrdf_alloc(sd, weight); @@ -894,11 +893,14 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, bssrdf->radius = stack_load_float3(stack, data_node.z) * param1; bssrdf->albedo = sd->svm_closure_weight; - bssrdf->texture_blur = param2; - 
bssrdf->sharpness = stack_load_float(stack, data_node.w); bssrdf->N = N; - bssrdf->roughness = 0.0f; - sd->flag |= bssrdf_setup(sd, bssrdf, (ClosureType)type); + bssrdf->roughness = FLT_MAX; + + const float subsurface_ior = clamp(param2, 1.01f, 3.8f); + const float subsurface_anisotropy = stack_load_float(stack, data_node.w); + bssrdf->anisotropy = clamp(subsurface_anisotropy, 0.0f, 0.9f); + + sd->flag |= bssrdf_setup(sd, bssrdf, (ClosureType)type, subsurface_ior); } break; @@ -907,10 +909,15 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, default: break; } + + return offset; } -ccl_device void svm_node_closure_volume( - KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, ShaderType shader_type) +template<ShaderType shader_type> +ccl_device_noinline void svm_node_closure_volume(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint4 node) { #ifdef __VOLUME__ /* Only sum extinction for volumes, variable is shared with surface transparency. */ @@ -961,21 +968,17 @@ ccl_device void svm_node_closure_volume( #endif } -ccl_device void svm_node_principled_volume(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint4 node, - ShaderType shader_type, - int path_flag, - int *offset) +template<ShaderType shader_type> +ccl_device_noinline int svm_node_principled_volume( + const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int path_flag, int offset) { #ifdef __VOLUME__ - uint4 value_node = read_node(kg, offset); - uint4 attr_node = read_node(kg, offset); + uint4 value_node = read_node(kg, &offset); + uint4 attr_node = read_node(kg, &offset); /* Only sum extinction for volumes, variable is shared with surface transparency. 
*/ if (shader_type != SHADER_TYPE_VOLUME) { - return; + return offset; } uint density_offset, anisotropy_offset, absorption_color_offset, mix_weight_offset; @@ -985,7 +988,7 @@ ccl_device void svm_node_principled_volume(KernelGlobals *kg, 1.0f); if (mix_weight == 0.0f) { - return; + return offset; } /* Compute density. */ @@ -1034,7 +1037,7 @@ ccl_device void svm_node_principled_volume(KernelGlobals *kg, /* Compute emission. */ if (path_flag & PATH_RAY_SHADOW) { /* Don't need emission for shadows. */ - return; + return offset; } uint emission_offset, emission_color_offset, blackbody_offset, temperature_offset; @@ -1074,9 +1077,10 @@ ccl_device void svm_node_principled_volume(KernelGlobals *kg, } } #endif + return offset; } -ccl_device void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 node) +ccl_device_noinline void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 node) { uint mix_weight_offset = node.y; float3 weight = sd->svm_closure_weight; @@ -1093,7 +1097,7 @@ ccl_device void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 no emission_setup(sd, weight); } -ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4 node) +ccl_device_noinline void svm_node_closure_background(ShaderData *sd, float *stack, uint4 node) { uint mix_weight_offset = node.y; float3 weight = sd->svm_closure_weight; @@ -1110,7 +1114,7 @@ ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4 background_setup(sd, weight); } -ccl_device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 node) +ccl_device_noinline void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 node) { uint mix_weight_offset = node.y; @@ -1145,14 +1149,13 @@ ccl_device void svm_node_closure_set_weight(ShaderData *sd, uint r, uint g, uint ccl_device void svm_node_closure_weight(ShaderData *sd, float *stack, uint weight_offset) { float3 weight = stack_load_float3(stack, weight_offset); - 
svm_node_closure_store_weight(sd, weight); } -ccl_device void svm_node_emission_weight(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint4 node) +ccl_device_noinline void svm_node_emission_weight(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint4 node) { uint color_offset = node.y; uint strength_offset = node.z; @@ -1163,7 +1166,7 @@ ccl_device void svm_node_emission_weight(KernelGlobals *kg, svm_node_closure_store_weight(sd, weight); } -ccl_device void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node) +ccl_device_noinline void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node) { /* fetch weight from blend input, previous mix closures, * and write to stack to be used by closure nodes later */ @@ -1186,7 +1189,7 @@ ccl_device void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node) /* (Bump) normal */ ccl_device void svm_node_set_normal( - KernelGlobals *kg, ShaderData *sd, float *stack, uint in_direction, uint out_normal) + const KernelGlobals *kg, ShaderData *sd, float *stack, uint in_direction, uint out_normal) { float3 normal = stack_load_float3(stack, in_direction); sd->N = normal; diff --git a/intern/cycles/kernel/svm/svm_convert.h b/intern/cycles/kernel/svm/svm_convert.h index 5df6c9fb755..37d40167ccc 100644 --- a/intern/cycles/kernel/svm/svm_convert.h +++ b/intern/cycles/kernel/svm/svm_convert.h @@ -18,8 +18,8 @@ CCL_NAMESPACE_BEGIN /* Conversion Nodes */ -ccl_device void svm_node_convert( - KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint from, uint to) +ccl_device_noinline void svm_node_convert( + const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint from, uint to) { switch (type) { case NODE_CONVERT_FI: { diff --git a/intern/cycles/kernel/svm/svm_displace.h b/intern/cycles/kernel/svm/svm_displace.h index 250fac6bcb8..a1d952173d8 100644 --- a/intern/cycles/kernel/svm/svm_displace.h +++ b/intern/cycles/kernel/svm/svm_displace.h @@ -14,11 +14,16 @@ * limitations 
under the License. */ +#include "kernel/kernel_montecarlo.h" + CCL_NAMESPACE_BEGIN /* Bump Node */ -ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node) +ccl_device_noinline void svm_node_set_bump(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint4 node) { #ifdef __RAY_DIFFERENTIALS__ /* get normal input */ @@ -83,7 +88,7 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac /* Displacement Node */ -ccl_device void svm_node_set_displacement(KernelGlobals *kg, +ccl_device void svm_node_set_displacement(const KernelGlobals *kg, ShaderData *sd, float *stack, uint fac_offset) @@ -92,7 +97,10 @@ ccl_device void svm_node_set_displacement(KernelGlobals *kg, sd->P += dP; } -ccl_device void svm_node_displacement(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node) +ccl_device_noinline void svm_node_displacement(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint4 node) { uint height_offset, midlevel_offset, scale_offset, normal_offset; svm_unpack_node_uchar4(node.y, &height_offset, &midlevel_offset, &scale_offset, &normal_offset); @@ -119,10 +127,10 @@ ccl_device void svm_node_displacement(KernelGlobals *kg, ShaderData *sd, float * stack_store_float3(stack, node.z, dP); } -ccl_device void svm_node_vector_displacement( - KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset) +ccl_device_noinline int svm_node_vector_displacement( + const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset) { - uint4 data_node = read_node(kg, offset); + uint4 data_node = read_node(kg, &offset); uint space = data_node.x; uint vector_offset, midlevel_offset, scale_offset, displacement_offset; @@ -164,6 +172,7 @@ ccl_device void svm_node_vector_displacement( } stack_store_float3(stack, displacement_offset, dP); + return offset; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_fresnel.h b/intern/cycles/kernel/svm/svm_fresnel.h index 
96d602e35bf..b5ecdbe2abf 100644 --- a/intern/cycles/kernel/svm/svm_fresnel.h +++ b/intern/cycles/kernel/svm/svm_fresnel.h @@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN /* Fresnel Node */ -ccl_device void svm_node_fresnel( +ccl_device_noinline void svm_node_fresnel( ShaderData *sd, float *stack, uint ior_offset, uint ior_value, uint node) { uint normal_offset, out_offset; @@ -37,7 +37,7 @@ ccl_device void svm_node_fresnel( /* Layer Weight Node */ -ccl_device void svm_node_layer_weight(ShaderData *sd, float *stack, uint4 node) +ccl_device_noinline void svm_node_layer_weight(ShaderData *sd, float *stack, uint4 node) { uint blend_offset = node.y; uint blend_value = node.z; diff --git a/intern/cycles/kernel/svm/svm_gamma.h b/intern/cycles/kernel/svm/svm_gamma.h index 65eb08eb0eb..f6fafdee941 100644 --- a/intern/cycles/kernel/svm/svm_gamma.h +++ b/intern/cycles/kernel/svm/svm_gamma.h @@ -16,7 +16,7 @@ CCL_NAMESPACE_BEGIN -ccl_device void svm_node_gamma( +ccl_device_noinline void svm_node_gamma( ShaderData *sd, float *stack, uint in_gamma, uint in_color, uint out_color) { float3 color = stack_load_float3(stack, in_color); diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h index e48e96dcfa4..10e9f291d0e 100644 --- a/intern/cycles/kernel/svm/svm_geometry.h +++ b/intern/cycles/kernel/svm/svm_geometry.h @@ -18,8 +18,8 @@ CCL_NAMESPACE_BEGIN /* Geometry Node */ -ccl_device_inline void svm_node_geometry( - KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset) +ccl_device_noinline void svm_node_geometry( + const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset) { float3 data; @@ -51,8 +51,8 @@ ccl_device_inline void svm_node_geometry( stack_store_float3(stack, out_offset, data); } -ccl_device void svm_node_geometry_bump_dx( - KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset) +ccl_device_noinline void svm_node_geometry_bump_dx( + const KernelGlobals *kg, ShaderData 
*sd, float *stack, uint type, uint out_offset) { #ifdef __RAY_DIFFERENTIALS__ float3 data; @@ -75,8 +75,8 @@ ccl_device void svm_node_geometry_bump_dx( #endif } -ccl_device void svm_node_geometry_bump_dy( - KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset) +ccl_device_noinline void svm_node_geometry_bump_dy( + const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset) { #ifdef __RAY_DIFFERENTIALS__ float3 data; @@ -101,8 +101,8 @@ ccl_device void svm_node_geometry_bump_dy( /* Object Info */ -ccl_device void svm_node_object_info( - KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset) +ccl_device_noinline void svm_node_object_info( + const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset) { float data; @@ -140,8 +140,8 @@ ccl_device void svm_node_object_info( /* Particle Info */ -ccl_device void svm_node_particle_info( - KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset) +ccl_device_noinline void svm_node_particle_info( + const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset) { switch (type) { case NODE_INFO_PAR_INDEX: { @@ -199,8 +199,8 @@ ccl_device void svm_node_particle_info( /* Hair Info */ -ccl_device void svm_node_hair_info( - KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset) +ccl_device_noinline void svm_node_hair_info( + const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset) { float data; float3 data3; diff --git a/intern/cycles/kernel/svm/svm_gradient.h b/intern/cycles/kernel/svm/svm_gradient.h index 08304bc47e8..cd15f7097e7 100644 --- a/intern/cycles/kernel/svm/svm_gradient.h +++ b/intern/cycles/kernel/svm/svm_gradient.h @@ -60,7 +60,7 @@ ccl_device float svm_gradient(float3 p, NodeGradientType type) return 0.0f; } -ccl_device void svm_node_tex_gradient(ShaderData *sd, float *stack, uint4 node) +ccl_device_noinline void 
svm_node_tex_gradient(ShaderData *sd, float *stack, uint4 node) { uint type, co_offset, color_offset, fac_offset; diff --git a/intern/cycles/kernel/svm/svm_hsv.h b/intern/cycles/kernel/svm/svm_hsv.h index c299cf58c7f..6f49a8385aa 100644 --- a/intern/cycles/kernel/svm/svm_hsv.h +++ b/intern/cycles/kernel/svm/svm_hsv.h @@ -19,8 +19,10 @@ CCL_NAMESPACE_BEGIN -ccl_device void svm_node_hsv( - KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset) +ccl_device_noinline void svm_node_hsv(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint4 node) { uint in_color_offset, fac_offset, out_color_offset; uint hue_offset, sat_offset, val_offset; diff --git a/intern/cycles/kernel/svm/svm_ies.h b/intern/cycles/kernel/svm/svm_ies.h index 56c804b44d0..9c13734ecf0 100644 --- a/intern/cycles/kernel/svm/svm_ies.h +++ b/intern/cycles/kernel/svm/svm_ies.h @@ -19,7 +19,7 @@ CCL_NAMESPACE_BEGIN /* IES Light */ ccl_device_inline float interpolate_ies_vertical( - KernelGlobals *kg, int ofs, int v, int v_num, float v_frac, int h) + const KernelGlobals *kg, int ofs, int v, int v_num, float v_frac, int h) { /* Since lookups are performed in spherical coordinates, clamping the coordinates at the low end * of v (corresponding to the north pole) would result in artifacts. 
The proper way of dealing @@ -39,7 +39,7 @@ ccl_device_inline float interpolate_ies_vertical( return cubic_interp(a, b, c, d, v_frac); } -ccl_device_inline float kernel_ies_interp(KernelGlobals *kg, +ccl_device_inline float kernel_ies_interp(const KernelGlobals *kg, int slot, float h_angle, float v_angle) @@ -98,8 +98,10 @@ ccl_device_inline float kernel_ies_interp(KernelGlobals *kg, return max(cubic_interp(a, b, c, d, h_frac), 0.0f); } -ccl_device void svm_node_ies( - KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset) +ccl_device_noinline void svm_node_ies(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint4 node) { uint vector_offset, strength_offset, fac_offset, slot = node.z; svm_unpack_node_uchar3(node.y, &strength_offset, &vector_offset, &fac_offset); diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h index 9348ddabde5..a344f36977a 100644 --- a/intern/cycles/kernel/svm/svm_image.h +++ b/intern/cycles/kernel/svm/svm_image.h @@ -16,7 +16,7 @@ CCL_NAMESPACE_BEGIN -ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint flags) +ccl_device float4 svm_image_texture(const KernelGlobals *kg, int id, float x, float y, uint flags) { if (id == -1) { return make_float4( @@ -44,8 +44,8 @@ ccl_device_inline float3 texco_remap_square(float3 co) return (co - make_float3(0.5f, 0.5f, 0.5f)) * 2.0f; } -ccl_device void svm_node_tex_image( - KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset) +ccl_device_noinline int svm_node_tex_image( + const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset) { uint co_offset, out_offset, alpha_offset, flags; @@ -71,7 +71,7 @@ ccl_device void svm_node_tex_image( int num_nodes = (int)node.y; if (num_nodes > 0) { /* Remember the offset of the node following the tile nodes. */ - int next_offset = (*offset) + num_nodes; + int next_offset = offset + num_nodes; /* Find the tile that the UV lies in. 
*/ int tx = (int)tex_co.x; @@ -83,7 +83,7 @@ ccl_device void svm_node_tex_image( /* Find the index of the tile. */ for (int i = 0; i < num_nodes; i++) { - uint4 tile_node = read_node(kg, offset); + uint4 tile_node = read_node(kg, &offset); if (tile_node.x == tile) { id = tile_node.y; break; @@ -102,7 +102,7 @@ ccl_device void svm_node_tex_image( } /* Skip over the remaining nodes. */ - *offset = next_offset; + offset = next_offset; } else { id = -num_nodes; @@ -114,9 +114,13 @@ ccl_device void svm_node_tex_image( stack_store_float3(stack, out_offset, make_float3(f.x, f.y, f.z)); if (stack_valid(alpha_offset)) stack_store_float(stack, alpha_offset, f.w); + return offset; } -ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node) +ccl_device_noinline void svm_node_tex_image_box(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint4 node) { /* get object space normal */ float3 N = sd->N; @@ -215,10 +219,10 @@ ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float stack_store_float(stack, alpha_offset, f.w); } -ccl_device void svm_node_tex_environment(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint4 node) +ccl_device_noinline void svm_node_tex_environment(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint4 node) { uint id = node.y; uint co_offset, out_offset, alpha_offset, flags; diff --git a/intern/cycles/kernel/svm/svm_invert.h b/intern/cycles/kernel/svm/svm_invert.h index 02024742b13..27cdaaff473 100644 --- a/intern/cycles/kernel/svm/svm_invert.h +++ b/intern/cycles/kernel/svm/svm_invert.h @@ -21,7 +21,7 @@ ccl_device float invert(float color, float factor) return factor * (1.0f - color) + (1.0f - factor) * color; } -ccl_device void svm_node_invert( +ccl_device_noinline void svm_node_invert( ShaderData *sd, float *stack, uint in_fac, uint in_color, uint out_color) { float factor = stack_load_float(stack, in_fac); diff --git 
a/intern/cycles/kernel/svm/svm_light_path.h b/intern/cycles/kernel/svm/svm_light_path.h index 768c65918cd..49fabad1cc5 100644 --- a/intern/cycles/kernel/svm/svm_light_path.h +++ b/intern/cycles/kernel/svm/svm_light_path.h @@ -18,12 +18,12 @@ CCL_NAMESPACE_BEGIN /* Light Path Node */ -ccl_device void svm_node_light_path(ShaderData *sd, - ccl_addr_space PathState *state, - float *stack, - uint type, - uint out_offset, - int path_flag) +ccl_device_noinline void svm_node_light_path(INTEGRATOR_STATE_CONST_ARGS, + const ShaderData *sd, + float *stack, + uint type, + uint out_offset, + int path_flag) { float info = 0.0f; @@ -58,21 +58,47 @@ ccl_device void svm_node_light_path(ShaderData *sd, case NODE_LP_ray_length: info = sd->ray_length; break; - case NODE_LP_ray_depth: - info = (float)state->bounce; + case NODE_LP_ray_depth: { + /* Read bounce from difference location depending if this is a shadow + * path. It's a bit dubious to have integrate state details leak into + * this function but hard to avoid currently. */ + int bounce = (INTEGRATOR_STATE_IS_NULL) ? 0 : + (path_flag & PATH_RAY_SHADOW) ? INTEGRATOR_STATE(shadow_path, bounce) : + INTEGRATOR_STATE(path, bounce); + + /* For background, light emission and shadow evaluation we from a + * surface or volume we are effective one bounce further. */ + if (path_flag & (PATH_RAY_SHADOW | PATH_RAY_EMISSION)) { + bounce++; + } + + info = (float)bounce; break; + } + /* TODO */ + case NODE_LP_ray_transparent: { + const int bounce = (INTEGRATOR_STATE_IS_NULL) ? + 0 : + (path_flag & PATH_RAY_SHADOW) ? 
+ INTEGRATOR_STATE(shadow_path, transparent_bounce) : + INTEGRATOR_STATE(path, transparent_bounce); + + info = (float)bounce; + break; + } +#if 0 case NODE_LP_ray_diffuse: info = (float)state->diffuse_bounce; break; case NODE_LP_ray_glossy: info = (float)state->glossy_bounce; break; - case NODE_LP_ray_transparent: - info = (float)state->transparent_bounce; - break; +#endif +#if 0 case NODE_LP_ray_transmission: info = (float)state->transmission_bounce; break; +#endif } stack_store_float(stack, out_offset, info); @@ -80,7 +106,7 @@ ccl_device void svm_node_light_path(ShaderData *sd, /* Light Falloff Node */ -ccl_device void svm_node_light_falloff(ShaderData *sd, float *stack, uint4 node) +ccl_device_noinline void svm_node_light_falloff(ShaderData *sd, float *stack, uint4 node) { uint strength_offset, out_offset, smooth_offset; diff --git a/intern/cycles/kernel/svm/svm_magic.h b/intern/cycles/kernel/svm/svm_magic.h index 9c160e6d8cc..8784c760860 100644 --- a/intern/cycles/kernel/svm/svm_magic.h +++ b/intern/cycles/kernel/svm/svm_magic.h @@ -87,8 +87,8 @@ ccl_device_noinline_cpu float3 svm_magic(float3 p, int n, float distortion) return make_float3(0.5f - x, 0.5f - y, 0.5f - z); } -ccl_device void svm_node_tex_magic( - KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset) +ccl_device_noinline int svm_node_tex_magic( + const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset) { uint depth; uint scale_offset, distortion_offset, co_offset, fac_offset, color_offset; @@ -96,7 +96,7 @@ ccl_device void svm_node_tex_magic( svm_unpack_node_uchar3(node.y, &depth, &color_offset, &fac_offset); svm_unpack_node_uchar3(node.z, &co_offset, &scale_offset, &distortion_offset); - uint4 node2 = read_node(kg, offset); + uint4 node2 = read_node(kg, &offset); float3 co = stack_load_float3(stack, co_offset); float scale = stack_load_float_default(stack, scale_offset, node2.x); float distortion = stack_load_float_default(stack, distortion_offset, 
node2.y); @@ -107,6 +107,7 @@ ccl_device void svm_node_tex_magic( stack_store_float(stack, fac_offset, average(color)); if (stack_valid(color_offset)) stack_store_float3(stack, color_offset, color); + return offset; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_map_range.h b/intern/cycles/kernel/svm/svm_map_range.h index 533a631c837..c8684981e31 100644 --- a/intern/cycles/kernel/svm/svm_map_range.h +++ b/intern/cycles/kernel/svm/svm_map_range.h @@ -24,13 +24,13 @@ ccl_device_inline float smootherstep(float edge0, float edge1, float x) return x * x * x * (x * (x * 6.0f - 15.0f) + 10.0f); } -ccl_device void svm_node_map_range(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint value_stack_offset, - uint parameters_stack_offsets, - uint results_stack_offsets, - int *offset) +ccl_device_noinline int svm_node_map_range(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint value_stack_offset, + uint parameters_stack_offsets, + uint results_stack_offsets, + int offset) { uint from_min_stack_offset, from_max_stack_offset, to_min_stack_offset, to_max_stack_offset; uint type_stack_offset, steps_stack_offset, result_stack_offset; @@ -42,8 +42,8 @@ ccl_device void svm_node_map_range(KernelGlobals *kg, svm_unpack_node_uchar3( results_stack_offsets, &type_stack_offset, &steps_stack_offset, &result_stack_offset); - uint4 defaults = read_node(kg, offset); - uint4 defaults2 = read_node(kg, offset); + uint4 defaults = read_node(kg, &offset); + uint4 defaults2 = read_node(kg, &offset); float value = stack_load_float(stack, value_stack_offset); float from_min = stack_load_float_default(stack, from_min_stack_offset, defaults.x); @@ -83,6 +83,7 @@ ccl_device void svm_node_map_range(KernelGlobals *kg, result = 0.0f; } stack_store_float(stack, result_stack_offset, result); + return offset; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_mapping.h b/intern/cycles/kernel/svm/svm_mapping.h index 6e19c859e19..fcc724405f5 100644 --- 
a/intern/cycles/kernel/svm/svm_mapping.h +++ b/intern/cycles/kernel/svm/svm_mapping.h @@ -18,13 +18,12 @@ CCL_NAMESPACE_BEGIN /* Mapping Node */ -ccl_device void svm_node_mapping(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint type, - uint inputs_stack_offsets, - uint result_stack_offset, - int *offset) +ccl_device_noinline void svm_node_mapping(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint type, + uint inputs_stack_offsets, + uint result_stack_offset) { uint vector_stack_offset, location_stack_offset, rotation_stack_offset, scale_stack_offset; svm_unpack_node_uchar4(inputs_stack_offsets, @@ -44,30 +43,40 @@ ccl_device void svm_node_mapping(KernelGlobals *kg, /* Texture Mapping */ -ccl_device void svm_node_texture_mapping( - KernelGlobals *kg, ShaderData *sd, float *stack, uint vec_offset, uint out_offset, int *offset) +ccl_device_noinline int svm_node_texture_mapping(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint vec_offset, + uint out_offset, + int offset) { float3 v = stack_load_float3(stack, vec_offset); Transform tfm; - tfm.x = read_node_float(kg, offset); - tfm.y = read_node_float(kg, offset); - tfm.z = read_node_float(kg, offset); + tfm.x = read_node_float(kg, &offset); + tfm.y = read_node_float(kg, &offset); + tfm.z = read_node_float(kg, &offset); float3 r = transform_point(&tfm, v); stack_store_float3(stack, out_offset, r); + return offset; } -ccl_device void svm_node_min_max( - KernelGlobals *kg, ShaderData *sd, float *stack, uint vec_offset, uint out_offset, int *offset) +ccl_device_noinline int svm_node_min_max(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint vec_offset, + uint out_offset, + int offset) { float3 v = stack_load_float3(stack, vec_offset); - float3 mn = float4_to_float3(read_node_float(kg, offset)); - float3 mx = float4_to_float3(read_node_float(kg, offset)); + float3 mn = float4_to_float3(read_node_float(kg, &offset)); + float3 mx = float4_to_float3(read_node_float(kg, 
&offset)); float3 r = min(max(mn, v), mx); stack_store_float3(stack, out_offset, r); + return offset; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_math.h b/intern/cycles/kernel/svm/svm_math.h index 733ea28f9e5..99e7a8f2bda 100644 --- a/intern/cycles/kernel/svm/svm_math.h +++ b/intern/cycles/kernel/svm/svm_math.h @@ -16,13 +16,12 @@ CCL_NAMESPACE_BEGIN -ccl_device void svm_node_math(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint type, - uint inputs_stack_offsets, - uint result_stack_offset, - int *offset) +ccl_device_noinline void svm_node_math(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint type, + uint inputs_stack_offsets, + uint result_stack_offset) { uint a_stack_offset, b_stack_offset, c_stack_offset; svm_unpack_node_uchar3(inputs_stack_offsets, &a_stack_offset, &b_stack_offset, &c_stack_offset); @@ -35,13 +34,13 @@ ccl_device void svm_node_math(KernelGlobals *kg, stack_store_float(stack, result_stack_offset, result); } -ccl_device void svm_node_vector_math(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint type, - uint inputs_stack_offsets, - uint outputs_stack_offsets, - int *offset) +ccl_device_noinline int svm_node_vector_math(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint type, + uint inputs_stack_offsets, + uint outputs_stack_offsets, + int offset) { uint value_stack_offset, vector_stack_offset; uint a_stack_offset, b_stack_offset, param1_stack_offset; @@ -60,7 +59,7 @@ ccl_device void svm_node_vector_math(KernelGlobals *kg, /* 3 Vector Operators */ if (type == NODE_VECTOR_MATH_WRAP || type == NODE_VECTOR_MATH_FACEFORWARD || type == NODE_VECTOR_MATH_MULTIPLY_ADD) { - uint4 extra_node = read_node(kg, offset); + uint4 extra_node = read_node(kg, &offset); c = stack_load_float3(stack, extra_node.x); } @@ -70,6 +69,7 @@ ccl_device void svm_node_vector_math(KernelGlobals *kg, stack_store_float(stack, value_stack_offset, value); if (stack_valid(vector_stack_offset)) 
stack_store_float3(stack, vector_stack_offset, vector); + return offset; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_mix.h b/intern/cycles/kernel/svm/svm_mix.h index 15114bfd5e4..3e38080977f 100644 --- a/intern/cycles/kernel/svm/svm_mix.h +++ b/intern/cycles/kernel/svm/svm_mix.h @@ -18,16 +18,16 @@ CCL_NAMESPACE_BEGIN /* Node */ -ccl_device void svm_node_mix(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint fac_offset, - uint c1_offset, - uint c2_offset, - int *offset) +ccl_device_noinline int svm_node_mix(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint fac_offset, + uint c1_offset, + uint c2_offset, + int offset) { /* read extra data */ - uint4 node1 = read_node(kg, offset); + uint4 node1 = read_node(kg, &offset); float fac = stack_load_float(stack, fac_offset); float3 c1 = stack_load_float3(stack, c1_offset); @@ -35,6 +35,7 @@ ccl_device void svm_node_mix(KernelGlobals *kg, float3 result = svm_mix((NodeMix)node1.y, fac, c1, c2); stack_store_float3(stack, node1.z, result); + return offset; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_musgrave.h b/intern/cycles/kernel/svm/svm_musgrave.h index 571f62fe27f..03a8b68b3ef 100644 --- a/intern/cycles/kernel/svm/svm_musgrave.h +++ b/intern/cycles/kernel/svm/svm_musgrave.h @@ -700,13 +700,13 @@ ccl_device_noinline_cpu float noise_musgrave_ridged_multi_fractal_4d( return value; } -ccl_device void svm_node_tex_musgrave(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint offsets1, - uint offsets2, - uint offsets3, - int *offset) +ccl_device_noinline int svm_node_tex_musgrave(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint offsets1, + uint offsets2, + uint offsets3, + int offset) { uint type, dimensions, co_stack_offset, w_stack_offset; uint scale_stack_offset, detail_stack_offset, dimension_stack_offset, lacunarity_stack_offset; @@ -720,8 +720,8 @@ ccl_device void svm_node_tex_musgrave(KernelGlobals *kg, &lacunarity_stack_offset); 
svm_unpack_node_uchar3(offsets3, &offset_stack_offset, &gain_stack_offset, &fac_stack_offset); - uint4 defaults1 = read_node(kg, offset); - uint4 defaults2 = read_node(kg, offset); + uint4 defaults1 = read_node(kg, &offset); + uint4 defaults2 = read_node(kg, &offset); float3 co = stack_load_float3(stack, co_stack_offset); float w = stack_load_float_default(stack, w_stack_offset, defaults1.x); @@ -844,6 +844,7 @@ ccl_device void svm_node_tex_musgrave(KernelGlobals *kg, } stack_store_float(stack, fac_stack_offset, fac); + return offset; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_noise.h b/intern/cycles/kernel/svm/svm_noise.h index 94d8bfde555..ecb4df6afdf 100644 --- a/intern/cycles/kernel/svm/svm_noise.h +++ b/intern/cycles/kernel/svm/svm_noise.h @@ -330,7 +330,7 @@ ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y) * |__________________________| * */ -ccl_device_noinline float perlin_2d(float x, float y) +ccl_device_noinline_cpu float perlin_2d(float x, float y) { ssei XY; ssef fxy = floorfrac(ssef(x, y, 0.0f, 0.0f), &XY); @@ -447,7 +447,7 @@ ccl_device_inline ssef quad_mix(ssef p, ssef q, ssef r, ssef s, ssef f) * v7 (1, 1, 1) * */ -ccl_device_noinline float perlin_3d(float x, float y, float z) +ccl_device_noinline_cpu float perlin_3d(float x, float y, float z) { ssei XYZ; ssef fxyz = floorfrac(ssef(x, y, z, 0.0f), &XYZ); @@ -501,7 +501,7 @@ ccl_device_noinline float perlin_3d(float x, float y, float z) * v15 (1, 1, 1, 1) * */ -ccl_device_noinline float perlin_4d(float x, float y, float z, float w) +ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w) { ssei XYZW; ssef fxyzw = floorfrac(ssef(x, y, z, w), &XYZW); @@ -585,7 +585,7 @@ ccl_device_inline ssef quad_mix(avxf p, avxf q, ssef f) * |__________________________| * */ -ccl_device_noinline float perlin_3d(float x, float y, float z) +ccl_device_noinline_cpu float perlin_3d(float x, float y, float z) { ssei XYZ; ssef fxyz = floorfrac(ssef(x, y, z, 
0.0f), &XYZ); @@ -637,7 +637,7 @@ ccl_device_noinline float perlin_3d(float x, float y, float z) * v15 (1, 1, 1, 1) * */ -ccl_device_noinline float perlin_4d(float x, float y, float z, float w) +ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w) { ssei XYZW; ssef fxyzw = floorfrac(ssef(x, y, z, w), &XYZW); diff --git a/intern/cycles/kernel/svm/svm_noisetex.h b/intern/cycles/kernel/svm/svm_noisetex.h index 61fd9553802..29b262ac06e 100644 --- a/intern/cycles/kernel/svm/svm_noisetex.h +++ b/intern/cycles/kernel/svm/svm_noisetex.h @@ -140,13 +140,13 @@ ccl_device void noise_texture_4d(float4 co, } } -ccl_device void svm_node_tex_noise(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint dimensions, - uint offsets1, - uint offsets2, - int *offset) +ccl_device_noinline int svm_node_tex_noise(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint dimensions, + uint offsets1, + uint offsets2, + int offset) { uint vector_stack_offset, w_stack_offset, scale_stack_offset; uint detail_stack_offset, roughness_stack_offset, distortion_stack_offset; @@ -160,8 +160,8 @@ ccl_device void svm_node_tex_noise(KernelGlobals *kg, &value_stack_offset, &color_stack_offset); - uint4 defaults1 = read_node(kg, offset); - uint4 defaults2 = read_node(kg, offset); + uint4 defaults1 = read_node(kg, &offset); + uint4 defaults2 = read_node(kg, &offset); float3 vector = stack_load_float3(stack, vector_stack_offset); float w = stack_load_float_default(stack, w_stack_offset, defaults1.x); @@ -212,6 +212,7 @@ ccl_device void svm_node_tex_noise(KernelGlobals *kg, if (stack_valid(color_stack_offset)) { stack_store_float3(stack, color_stack_offset, color); } + return offset; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_normal.h b/intern/cycles/kernel/svm/svm_normal.h index 4cd3eab0ed2..724b5f281f9 100644 --- a/intern/cycles/kernel/svm/svm_normal.h +++ b/intern/cycles/kernel/svm/svm_normal.h @@ -16,16 +16,16 @@ CCL_NAMESPACE_BEGIN -ccl_device void 
svm_node_normal(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint in_normal_offset, - uint out_normal_offset, - uint out_dot_offset, - int *offset) +ccl_device_noinline int svm_node_normal(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint in_normal_offset, + uint out_normal_offset, + uint out_dot_offset, + int offset) { /* read extra data */ - uint4 node1 = read_node(kg, offset); + uint4 node1 = read_node(kg, &offset); float3 normal = stack_load_float3(stack, in_normal_offset); float3 direction; @@ -39,6 +39,7 @@ ccl_device void svm_node_normal(KernelGlobals *kg, if (stack_valid(out_dot_offset)) stack_store_float(stack, out_dot_offset, dot(direction, normalize(normal))); + return offset; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_ramp.h b/intern/cycles/kernel/svm/svm_ramp.h index 85ccf39144b..e92df3c093c 100644 --- a/intern/cycles/kernel/svm/svm_ramp.h +++ b/intern/cycles/kernel/svm/svm_ramp.h @@ -21,8 +21,12 @@ CCL_NAMESPACE_BEGIN /* NOTE: svm_ramp.h, svm_ramp_util.h and node_ramp_util.h must stay consistent */ -ccl_device_inline float4 rgb_ramp_lookup( - KernelGlobals *kg, int offset, float f, bool interpolate, bool extrapolate, int table_size) +ccl_device_inline float4 rgb_ramp_lookup(const KernelGlobals *kg, + int offset, + float f, + bool interpolate, + bool extrapolate, + int table_size) { if ((f < 0.0f || f > 1.0f) && extrapolate) { float4 t0, dy; @@ -53,34 +57,35 @@ ccl_device_inline float4 rgb_ramp_lookup( return a; } -ccl_device void svm_node_rgb_ramp( - KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset) +ccl_device_noinline int svm_node_rgb_ramp( + const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset) { uint fac_offset, color_offset, alpha_offset; uint interpolate = node.z; svm_unpack_node_uchar3(node.y, &fac_offset, &color_offset, &alpha_offset); - uint table_size = read_node(kg, offset).x; + uint table_size = read_node(kg, &offset).x; float fac = 
stack_load_float(stack, fac_offset); - float4 color = rgb_ramp_lookup(kg, *offset, fac, interpolate, false, table_size); + float4 color = rgb_ramp_lookup(kg, offset, fac, interpolate, false, table_size); if (stack_valid(color_offset)) stack_store_float3(stack, color_offset, float4_to_float3(color)); if (stack_valid(alpha_offset)) stack_store_float(stack, alpha_offset, color.w); - *offset += table_size; + offset += table_size; + return offset; } -ccl_device void svm_node_curves( - KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset) +ccl_device_noinline int svm_node_curves( + const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset) { uint fac_offset, color_offset, out_offset; svm_unpack_node_uchar3(node.y, &fac_offset, &color_offset, &out_offset); - uint table_size = read_node(kg, offset).x; + uint table_size = read_node(kg, &offset).x; float fac = stack_load_float(stack, fac_offset); float3 color = stack_load_float3(stack, color_offset); @@ -89,14 +94,15 @@ ccl_device void svm_node_curves( const float range_x = max_x - min_x; const float3 relpos = (color - make_float3(min_x, min_x, min_x)) / range_x; - float r = rgb_ramp_lookup(kg, *offset, relpos.x, true, true, table_size).x; - float g = rgb_ramp_lookup(kg, *offset, relpos.y, true, true, table_size).y; - float b = rgb_ramp_lookup(kg, *offset, relpos.z, true, true, table_size).z; + float r = rgb_ramp_lookup(kg, offset, relpos.x, true, true, table_size).x; + float g = rgb_ramp_lookup(kg, offset, relpos.y, true, true, table_size).y; + float b = rgb_ramp_lookup(kg, offset, relpos.z, true, true, table_size).z; color = (1.0f - fac) * color + fac * make_float3(r, g, b); stack_store_float3(stack, out_offset, color); - *offset += table_size; + offset += table_size; + return offset; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_sepcomb_hsv.h b/intern/cycles/kernel/svm/svm_sepcomb_hsv.h index f501252062e..8d52845ea3d 100644 --- 
a/intern/cycles/kernel/svm/svm_sepcomb_hsv.h +++ b/intern/cycles/kernel/svm/svm_sepcomb_hsv.h @@ -16,15 +16,15 @@ CCL_NAMESPACE_BEGIN -ccl_device void svm_node_combine_hsv(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint hue_in, - uint saturation_in, - uint value_in, - int *offset) +ccl_device_noinline int svm_node_combine_hsv(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint hue_in, + uint saturation_in, + uint value_in, + int offset) { - uint4 node1 = read_node(kg, offset); + uint4 node1 = read_node(kg, &offset); uint color_out = node1.y; float hue = stack_load_float(stack, hue_in); @@ -36,17 +36,18 @@ ccl_device void svm_node_combine_hsv(KernelGlobals *kg, if (stack_valid(color_out)) stack_store_float3(stack, color_out, color); + return offset; } -ccl_device void svm_node_separate_hsv(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint color_in, - uint hue_out, - uint saturation_out, - int *offset) +ccl_device_noinline int svm_node_separate_hsv(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint color_in, + uint hue_out, + uint saturation_out, + int offset) { - uint4 node1 = read_node(kg, offset); + uint4 node1 = read_node(kg, &offset); uint value_out = node1.y; float3 color = stack_load_float3(stack, color_in); @@ -60,6 +61,7 @@ ccl_device void svm_node_separate_hsv(KernelGlobals *kg, stack_store_float(stack, saturation_out, color.y); if (stack_valid(value_out)) stack_store_float(stack, value_out, color.z); + return offset; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_sky.h b/intern/cycles/kernel/svm/svm_sky.h index b908732f026..b77c4311e72 100644 --- a/intern/cycles/kernel/svm/svm_sky.h +++ b/intern/cycles/kernel/svm/svm_sky.h @@ -37,7 +37,7 @@ ccl_device float sky_perez_function(float *lam, float theta, float gamma) (1.0f + lam[2] * expf(lam[3] * gamma) + lam[4] * cgamma * cgamma); } -ccl_device float3 sky_radiance_preetham(KernelGlobals *kg, +ccl_device float3 sky_radiance_preetham(const 
KernelGlobals *kg, float3 dir, float sunphi, float suntheta, @@ -90,7 +90,7 @@ ccl_device float sky_radiance_internal(float *configuration, float theta, float configuration[6] * mieM + configuration[7] * zenith); } -ccl_device float3 sky_radiance_hosek(KernelGlobals *kg, +ccl_device float3 sky_radiance_hosek(const KernelGlobals *kg, float3 dir, float sunphi, float suntheta, @@ -127,7 +127,7 @@ ccl_device float3 geographical_to_direction(float lat, float lon) return make_float3(cos(lat) * cos(lon), cos(lat) * sin(lon), sin(lat)); } -ccl_device float3 sky_radiance_nishita(KernelGlobals *kg, +ccl_device float3 sky_radiance_nishita(const KernelGlobals *kg, float3 dir, float *nishita_data, uint texture_id) @@ -209,8 +209,8 @@ ccl_device float3 sky_radiance_nishita(KernelGlobals *kg, return xyz_to_rgb(kg, xyz); } -ccl_device void svm_node_tex_sky( - KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset) +ccl_device_noinline int svm_node_tex_sky( + const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset) { /* Load data */ uint dir_offset = node.y; @@ -226,49 +226,49 @@ ccl_device void svm_node_tex_sky( float sunphi, suntheta, radiance_x, radiance_y, radiance_z; float config_x[9], config_y[9], config_z[9]; - float4 data = read_node_float(kg, offset); + float4 data = read_node_float(kg, &offset); sunphi = data.x; suntheta = data.y; radiance_x = data.z; radiance_y = data.w; - data = read_node_float(kg, offset); + data = read_node_float(kg, &offset); radiance_z = data.x; config_x[0] = data.y; config_x[1] = data.z; config_x[2] = data.w; - data = read_node_float(kg, offset); + data = read_node_float(kg, &offset); config_x[3] = data.x; config_x[4] = data.y; config_x[5] = data.z; config_x[6] = data.w; - data = read_node_float(kg, offset); + data = read_node_float(kg, &offset); config_x[7] = data.x; config_x[8] = data.y; config_y[0] = data.z; config_y[1] = data.w; - data = read_node_float(kg, offset); + data = read_node_float(kg, &offset); 
config_y[2] = data.x; config_y[3] = data.y; config_y[4] = data.z; config_y[5] = data.w; - data = read_node_float(kg, offset); + data = read_node_float(kg, &offset); config_y[6] = data.x; config_y[7] = data.y; config_y[8] = data.z; config_z[0] = data.w; - data = read_node_float(kg, offset); + data = read_node_float(kg, &offset); config_z[1] = data.x; config_z[2] = data.y; config_z[3] = data.z; config_z[4] = data.w; - data = read_node_float(kg, offset); + data = read_node_float(kg, &offset); config_z[5] = data.x; config_z[6] = data.y; config_z[7] = data.z; @@ -305,19 +305,19 @@ ccl_device void svm_node_tex_sky( /* Define variables */ float nishita_data[10]; - float4 data = read_node_float(kg, offset); + float4 data = read_node_float(kg, &offset); nishita_data[0] = data.x; nishita_data[1] = data.y; nishita_data[2] = data.z; nishita_data[3] = data.w; - data = read_node_float(kg, offset); + data = read_node_float(kg, &offset); nishita_data[4] = data.x; nishita_data[5] = data.y; nishita_data[6] = data.z; nishita_data[7] = data.w; - data = read_node_float(kg, offset); + data = read_node_float(kg, &offset); nishita_data[8] = data.x; nishita_data[9] = data.y; uint texture_id = __float_as_uint(data.z); @@ -327,6 +327,7 @@ ccl_device void svm_node_tex_sky( } stack_store_float3(stack, out_offset, f); + return offset; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_tex_coord.h b/intern/cycles/kernel/svm/svm_tex_coord.h index 46600551cc4..a35253080da 100644 --- a/intern/cycles/kernel/svm/svm_tex_coord.h +++ b/intern/cycles/kernel/svm/svm_tex_coord.h @@ -14,12 +14,16 @@ * limitations under the License. 
*/ +#include "kernel/geom/geom.h" +#include "kernel/kernel_camera.h" +#include "kernel/kernel_montecarlo.h" + CCL_NAMESPACE_BEGIN /* Texture Coordinate Node */ -ccl_device void svm_node_tex_coord( - KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint4 node, int *offset) +ccl_device_noinline int svm_node_tex_coord( + const KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint4 node, int offset) { float3 data; uint type = node.y; @@ -35,9 +39,9 @@ ccl_device void svm_node_tex_coord( } else { Transform tfm; - tfm.x = read_node_float(kg, offset); - tfm.y = read_node_float(kg, offset); - tfm.z = read_node_float(kg, offset); + tfm.x = read_node_float(kg, &offset); + tfm.y = read_node_float(kg, &offset); + tfm.z = read_node_float(kg, &offset); data = transform_point(&tfm, data); } break; @@ -92,10 +96,11 @@ ccl_device void svm_node_tex_coord( } stack_store_float3(stack, out_offset, data); + return offset; } -ccl_device void svm_node_tex_coord_bump_dx( - KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint4 node, int *offset) +ccl_device_noinline int svm_node_tex_coord_bump_dx( + const KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint4 node, int offset) { #ifdef __RAY_DIFFERENTIALS__ float3 data; @@ -112,9 +117,9 @@ ccl_device void svm_node_tex_coord_bump_dx( } else { Transform tfm; - tfm.x = read_node_float(kg, offset); - tfm.y = read_node_float(kg, offset); - tfm.z = read_node_float(kg, offset); + tfm.x = read_node_float(kg, &offset); + tfm.y = read_node_float(kg, &offset); + tfm.z = read_node_float(kg, &offset); data = transform_point(&tfm, data); } break; @@ -136,7 +141,7 @@ ccl_device void svm_node_tex_coord_bump_dx( case NODE_TEXCO_WINDOW: { if ((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) - data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dx); + data = camera_world_to_ndc(kg, sd, sd->ray_P + make_float3(sd->ray_dP, 0.0f, 0.0f)); 
else data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dx); data.z = 0.0f; @@ -169,13 +174,14 @@ ccl_device void svm_node_tex_coord_bump_dx( } stack_store_float3(stack, out_offset, data); + return offset; #else - svm_node_tex_coord(kg, sd, path_flag, stack, node, offset); + return svm_node_tex_coord(kg, sd, path_flag, stack, node, offset); #endif } -ccl_device void svm_node_tex_coord_bump_dy( - KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint4 node, int *offset) +ccl_device_noinline int svm_node_tex_coord_bump_dy( + const KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint4 node, int offset) { #ifdef __RAY_DIFFERENTIALS__ float3 data; @@ -192,9 +198,9 @@ ccl_device void svm_node_tex_coord_bump_dy( } else { Transform tfm; - tfm.x = read_node_float(kg, offset); - tfm.y = read_node_float(kg, offset); - tfm.z = read_node_float(kg, offset); + tfm.x = read_node_float(kg, &offset); + tfm.y = read_node_float(kg, &offset); + tfm.z = read_node_float(kg, &offset); data = transform_point(&tfm, data); } break; @@ -216,7 +222,7 @@ ccl_device void svm_node_tex_coord_bump_dy( case NODE_TEXCO_WINDOW: { if ((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) - data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dy); + data = camera_world_to_ndc(kg, sd, sd->ray_P + make_float3(0.0f, sd->ray_dP, 0.0f)); else data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dy); data.z = 0.0f; @@ -249,12 +255,16 @@ ccl_device void svm_node_tex_coord_bump_dy( } stack_store_float3(stack, out_offset, data); + return offset; #else - svm_node_tex_coord(kg, sd, path_flag, stack, node, offset); + return svm_node_tex_coord(kg, sd, path_flag, stack, node, offset); #endif } -ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node) +ccl_device_noinline void svm_node_normal_map(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint4 node) { uint color_offset, 
strength_offset, normal_offset, space; svm_unpack_node_uchar4(node.y, &color_offset, &strength_offset, &normal_offset, &space); @@ -346,7 +356,10 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st stack_store_float3(stack, normal_offset, N); } -ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node) +ccl_device_noinline void svm_node_tangent(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint4 node) { uint tangent_offset, direction_type, axis; svm_unpack_node_uchar3(node.y, &tangent_offset, &direction_type, &axis); diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h index 062afcfa5ac..c053be96c51 100644 --- a/intern/cycles/kernel/svm/svm_types.h +++ b/intern/cycles/kernel/svm/svm_types.h @@ -30,37 +30,6 @@ CCL_NAMESPACE_BEGIN /* Nodes */ -/* Known frequencies of used nodes, used for selective nodes compilation - * in the kernel. Currently only affects split OpenCL kernel. - * - * Keep as defines so it's easy to check which nodes are to be compiled - * from preprocessor. - * - * Lower the number of group more often the node is used. - */ -#define NODE_GROUP_LEVEL_0 0 -#define NODE_GROUP_LEVEL_1 1 -#define NODE_GROUP_LEVEL_2 2 -#define NODE_GROUP_LEVEL_3 3 -#define NODE_GROUP_LEVEL_4 4 -#define NODE_GROUP_LEVEL_MAX NODE_GROUP_LEVEL_4 - -#define NODE_FEATURE_VOLUME (1 << 0) -#define NODE_FEATURE_HAIR (1 << 1) -#define NODE_FEATURE_BUMP (1 << 2) -#define NODE_FEATURE_BUMP_STATE (1 << 3) -#define NODE_FEATURE_VORONOI_EXTRA (1 << 4) -/* TODO(sergey): Consider using something like ((uint)(-1)). - * Need to check carefully operand types around usage of this - * define first. 
- */ -#define NODE_FEATURE_ALL \ - (NODE_FEATURE_VOLUME | NODE_FEATURE_HAIR | NODE_FEATURE_BUMP | NODE_FEATURE_BUMP_STATE | \ - NODE_FEATURE_VORONOI_EXTRA) - -#define NODES_GROUP(group) ((group) <= __NODES_MAX_GROUP__) -#define NODES_FEATURE(feature) ((__NODES_FEATURES__ & (feature)) != 0) - typedef enum ShaderNodeType { NODE_END = 0, NODE_SHADER_JUMP, @@ -572,12 +541,8 @@ typedef enum ClosureType { CLOSURE_BSDF_TRANSPARENT_ID, /* BSSRDF */ - CLOSURE_BSSRDF_CUBIC_ID, - CLOSURE_BSSRDF_GAUSSIAN_ID, - CLOSURE_BSSRDF_PRINCIPLED_ID, - CLOSURE_BSSRDF_BURLEY_ID, CLOSURE_BSSRDF_RANDOM_WALK_ID, - CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID, + CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID, /* Other */ CLOSURE_HOLDOUT_ID, @@ -620,11 +585,9 @@ typedef enum ClosureType { type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID || \ type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID || \ type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID) -#define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID) +#define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID) #define CLOSURE_IS_BSSRDF(type) \ - (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID) -#define CLOSURE_IS_DISK_BSSRDF(type) \ - (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_BURLEY_ID) + (type >= CLOSURE_BSSRDF_RANDOM_WALK_ID && type <= CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID) #define CLOSURE_IS_VOLUME(type) \ (type >= CLOSURE_VOLUME_ID && type <= CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) #define CLOSURE_IS_VOLUME_SCATTER(type) (type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) diff --git a/intern/cycles/kernel/svm/svm_value.h b/intern/cycles/kernel/svm/svm_value.h index 5b76f2c8832..d0478660094 100644 --- a/intern/cycles/kernel/svm/svm_value.h +++ b/intern/cycles/kernel/svm/svm_value.h @@ -19,20 +19,21 @@ CCL_NAMESPACE_BEGIN /* Value Nodes */ ccl_device void svm_node_value_f( - KernelGlobals *kg, ShaderData *sd, float *stack, 
uint ivalue, uint out_offset) + const KernelGlobals *kg, ShaderData *sd, float *stack, uint ivalue, uint out_offset) { stack_store_float(stack, out_offset, __uint_as_float(ivalue)); } -ccl_device void svm_node_value_v( - KernelGlobals *kg, ShaderData *sd, float *stack, uint out_offset, int *offset) +ccl_device int svm_node_value_v( + const KernelGlobals *kg, ShaderData *sd, float *stack, uint out_offset, int offset) { /* read extra data */ - uint4 node1 = read_node(kg, offset); + uint4 node1 = read_node(kg, &offset); float3 p = make_float3( __uint_as_float(node1.y), __uint_as_float(node1.z), __uint_as_float(node1.w)); stack_store_float3(stack, out_offset, p); + return offset; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_vector_rotate.h b/intern/cycles/kernel/svm/svm_vector_rotate.h index 50045752484..55e1bce0158 100644 --- a/intern/cycles/kernel/svm/svm_vector_rotate.h +++ b/intern/cycles/kernel/svm/svm_vector_rotate.h @@ -18,11 +18,11 @@ CCL_NAMESPACE_BEGIN /* Vector Rotate */ -ccl_device void svm_node_vector_rotate(ShaderData *sd, - float *stack, - uint input_stack_offsets, - uint axis_stack_offsets, - uint result_stack_offset) +ccl_device_noinline void svm_node_vector_rotate(ShaderData *sd, + float *stack, + uint input_stack_offsets, + uint axis_stack_offsets, + uint result_stack_offset) { uint type, vector_stack_offset, rotation_stack_offset, center_stack_offset, axis_stack_offset, angle_stack_offset, invert; diff --git a/intern/cycles/kernel/svm/svm_vector_transform.h b/intern/cycles/kernel/svm/svm_vector_transform.h index 1e95492cf1b..8aedb7e0f54 100644 --- a/intern/cycles/kernel/svm/svm_vector_transform.h +++ b/intern/cycles/kernel/svm/svm_vector_transform.h @@ -18,10 +18,10 @@ CCL_NAMESPACE_BEGIN /* Vector Transform */ -ccl_device void svm_node_vector_transform(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint4 node) +ccl_device_noinline void svm_node_vector_transform(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + 
uint4 node) { uint itype, ifrom, ito; uint vector_in, vector_out; diff --git a/intern/cycles/kernel/svm/svm_vertex_color.h b/intern/cycles/kernel/svm/svm_vertex_color.h index 0aa45835522..986ea244f3a 100644 --- a/intern/cycles/kernel/svm/svm_vertex_color.h +++ b/intern/cycles/kernel/svm/svm_vertex_color.h @@ -16,12 +16,12 @@ CCL_NAMESPACE_BEGIN -ccl_device void svm_node_vertex_color(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint layer_id, - uint color_offset, - uint alpha_offset) +ccl_device_noinline void svm_node_vertex_color(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint layer_id, + uint color_offset, + uint alpha_offset) { AttributeDescriptor descriptor = find_attribute(kg, sd, layer_id); if (descriptor.offset != ATTR_STD_NOT_FOUND) { @@ -35,18 +35,12 @@ ccl_device void svm_node_vertex_color(KernelGlobals *kg, } } -#ifndef __KERNEL_CUDA__ -ccl_device -#else -ccl_device_noinline -#endif - void - svm_node_vertex_color_bump_dx(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint layer_id, - uint color_offset, - uint alpha_offset) +ccl_device_noinline void svm_node_vertex_color_bump_dx(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint layer_id, + uint color_offset, + uint alpha_offset) { AttributeDescriptor descriptor = find_attribute(kg, sd, layer_id); if (descriptor.offset != ATTR_STD_NOT_FOUND) { @@ -62,18 +56,12 @@ ccl_device_noinline } } -#ifndef __KERNEL_CUDA__ -ccl_device -#else -ccl_device_noinline -#endif - void - svm_node_vertex_color_bump_dy(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint layer_id, - uint color_offset, - uint alpha_offset) +ccl_device_noinline void svm_node_vertex_color_bump_dy(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint layer_id, + uint color_offset, + uint alpha_offset) { AttributeDescriptor descriptor = find_attribute(kg, sd, layer_id); if (descriptor.offset != ATTR_STD_NOT_FOUND) { diff --git a/intern/cycles/kernel/svm/svm_voronoi.h 
b/intern/cycles/kernel/svm/svm_voronoi.h index d0e7db35fab..b1d2eff7f37 100644 --- a/intern/cycles/kernel/svm/svm_voronoi.h +++ b/intern/cycles/kernel/svm/svm_voronoi.h @@ -902,16 +902,17 @@ ccl_device void voronoi_n_sphere_radius_4d(float4 coord, float randomness, float *outRadius = distance(closestPointToClosestPoint, closestPoint) / 2.0f; } -ccl_device void svm_node_tex_voronoi(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint dimensions, - uint feature, - uint metric, - int *offset) +template<uint node_feature_mask> +ccl_device_noinline int svm_node_tex_voronoi(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint dimensions, + uint feature, + uint metric, + int offset) { - uint4 stack_offsets = read_node(kg, offset); - uint4 defaults = read_node(kg, offset); + uint4 stack_offsets = read_node(kg, &offset); + uint4 defaults = read_node(kg, &offset); uint coord_stack_offset, w_stack_offset, scale_stack_offset, smoothness_stack_offset; uint exponent_stack_offset, randomness_stack_offset, distance_out_stack_offset, @@ -997,18 +998,18 @@ ccl_device void svm_node_tex_voronoi(KernelGlobals *kg, &color_out, &position_out_2d); break; -#if NODES_FEATURE(NODE_FEATURE_VORONOI_EXTRA) case NODE_VORONOI_SMOOTH_F1: - voronoi_smooth_f1_2d(coord_2d, - smoothness, - exponent, - randomness, - voronoi_metric, - &distance_out, - &color_out, - &position_out_2d); + if (KERNEL_NODES_FEATURE(VORONOI_EXTRA)) { + voronoi_smooth_f1_2d(coord_2d, + smoothness, + exponent, + randomness, + voronoi_metric, + &distance_out, + &color_out, + &position_out_2d); + } break; -#endif case NODE_VORONOI_F2: voronoi_f2_2d(coord_2d, exponent, @@ -1042,18 +1043,18 @@ ccl_device void svm_node_tex_voronoi(KernelGlobals *kg, &color_out, &position_out); break; -#if NODES_FEATURE(NODE_FEATURE_VORONOI_EXTRA) case NODE_VORONOI_SMOOTH_F1: - voronoi_smooth_f1_3d(coord, - smoothness, - exponent, - randomness, - voronoi_metric, - &distance_out, - &color_out, - &position_out); + if 
(KERNEL_NODES_FEATURE(VORONOI_EXTRA)) { + voronoi_smooth_f1_3d(coord, + smoothness, + exponent, + randomness, + voronoi_metric, + &distance_out, + &color_out, + &position_out); + } break; -#endif case NODE_VORONOI_F2: voronoi_f2_3d(coord, exponent, @@ -1076,54 +1077,54 @@ ccl_device void svm_node_tex_voronoi(KernelGlobals *kg, break; } -#if NODES_FEATURE(NODE_FEATURE_VORONOI_EXTRA) case 4: { - float4 coord_4d = make_float4(coord.x, coord.y, coord.z, w); - float4 position_out_4d; - switch (voronoi_feature) { - case NODE_VORONOI_F1: - voronoi_f1_4d(coord_4d, - exponent, - randomness, - voronoi_metric, - &distance_out, - &color_out, - &position_out_4d); - break; - case NODE_VORONOI_SMOOTH_F1: - voronoi_smooth_f1_4d(coord_4d, - smoothness, - exponent, - randomness, - voronoi_metric, - &distance_out, - &color_out, - &position_out_4d); - break; - case NODE_VORONOI_F2: - voronoi_f2_4d(coord_4d, - exponent, - randomness, - voronoi_metric, - &distance_out, - &color_out, - &position_out_4d); - break; - case NODE_VORONOI_DISTANCE_TO_EDGE: - voronoi_distance_to_edge_4d(coord_4d, randomness, &distance_out); - break; - case NODE_VORONOI_N_SPHERE_RADIUS: - voronoi_n_sphere_radius_4d(coord_4d, randomness, &radius_out); - break; - default: - kernel_assert(0); + if (KERNEL_NODES_FEATURE(VORONOI_EXTRA)) { + float4 coord_4d = make_float4(coord.x, coord.y, coord.z, w); + float4 position_out_4d; + switch (voronoi_feature) { + case NODE_VORONOI_F1: + voronoi_f1_4d(coord_4d, + exponent, + randomness, + voronoi_metric, + &distance_out, + &color_out, + &position_out_4d); + break; + case NODE_VORONOI_SMOOTH_F1: + voronoi_smooth_f1_4d(coord_4d, + smoothness, + exponent, + randomness, + voronoi_metric, + &distance_out, + &color_out, + &position_out_4d); + break; + case NODE_VORONOI_F2: + voronoi_f2_4d(coord_4d, + exponent, + randomness, + voronoi_metric, + &distance_out, + &color_out, + &position_out_4d); + break; + case NODE_VORONOI_DISTANCE_TO_EDGE: + voronoi_distance_to_edge_4d(coord_4d, 
randomness, &distance_out); + break; + case NODE_VORONOI_N_SPHERE_RADIUS: + voronoi_n_sphere_radius_4d(coord_4d, randomness, &radius_out); + break; + default: + kernel_assert(0); + } + position_out_4d = safe_divide_float4_float(position_out_4d, scale); + position_out = make_float3(position_out_4d.x, position_out_4d.y, position_out_4d.z); + w_out = position_out_4d.w; } - position_out_4d = safe_divide_float4_float(position_out_4d, scale); - position_out = make_float3(position_out_4d.x, position_out_4d.y, position_out_4d.z); - w_out = position_out_4d.w; break; } -#endif default: kernel_assert(0); } @@ -1138,6 +1139,7 @@ ccl_device void svm_node_tex_voronoi(KernelGlobals *kg, stack_store_float(stack, w_out_stack_offset, w_out); if (stack_valid(radius_out_stack_offset)) stack_store_float(stack, radius_out_stack_offset, radius_out); + return offset; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_voxel.h b/intern/cycles/kernel/svm/svm_voxel.h index 4bc14f82382..78b75405356 100644 --- a/intern/cycles/kernel/svm/svm_voxel.h +++ b/intern/cycles/kernel/svm/svm_voxel.h @@ -19,8 +19,8 @@ CCL_NAMESPACE_BEGIN /* TODO(sergey): Think of making it more generic volume-type attribute * sampler. 
*/ -ccl_device void svm_node_tex_voxel( - KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset) +ccl_device_noinline int svm_node_tex_voxel( + const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset) { uint co_offset, density_out_offset, color_out_offset, space; svm_unpack_node_uchar4(node.z, &co_offset, &density_out_offset, &color_out_offset, &space); @@ -33,9 +33,9 @@ ccl_device void svm_node_tex_voxel( else { kernel_assert(space == NODE_TEX_VOXEL_SPACE_WORLD); Transform tfm; - tfm.x = read_node_float(kg, offset); - tfm.y = read_node_float(kg, offset); - tfm.z = read_node_float(kg, offset); + tfm.x = read_node_float(kg, &offset); + tfm.y = read_node_float(kg, &offset); + tfm.z = read_node_float(kg, &offset); co = transform_point(&tfm, co); } @@ -47,6 +47,7 @@ ccl_device void svm_node_tex_voxel( stack_store_float(stack, density_out_offset, r.w); if (stack_valid(color_out_offset)) stack_store_float3(stack, color_out_offset, make_float3(r.x, r.y, r.z)); + return offset; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_wave.h b/intern/cycles/kernel/svm/svm_wave.h index c4763475b47..00f980c16df 100644 --- a/intern/cycles/kernel/svm/svm_wave.h +++ b/intern/cycles/kernel/svm/svm_wave.h @@ -82,11 +82,11 @@ ccl_device_noinline_cpu float svm_wave(NodeWaveType type, } } -ccl_device void svm_node_tex_wave( - KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset) +ccl_device_noinline int svm_node_tex_wave( + const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset) { - uint4 node2 = read_node(kg, offset); - uint4 node3 = read_node(kg, offset); + uint4 node2 = read_node(kg, &offset); + uint4 node3 = read_node(kg, &offset); /* RNA properties */ uint type_offset, bands_dir_offset, rings_dir_offset, profile_offset; @@ -125,6 +125,7 @@ ccl_device void svm_node_tex_wave( stack_store_float(stack, fac_offset, f); if (stack_valid(color_offset)) stack_store_float3(stack, color_offset, 
make_float3(f, f, f)); + return offset; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_wavelength.h b/intern/cycles/kernel/svm/svm_wavelength.h index d6144802559..fba8aa63d31 100644 --- a/intern/cycles/kernel/svm/svm_wavelength.h +++ b/intern/cycles/kernel/svm/svm_wavelength.h @@ -69,8 +69,8 @@ ccl_static_constant float cie_colour_match[81][3] = { {0.0002f, 0.0001f, 0.0000f}, {0.0002f, 0.0001f, 0.0000f}, {0.0001f, 0.0000f, 0.0000f}, {0.0001f, 0.0000f, 0.0000f}, {0.0001f, 0.0000f, 0.0000f}, {0.0000f, 0.0000f, 0.0000f}}; -ccl_device void svm_node_wavelength( - KernelGlobals *kg, ShaderData *sd, float *stack, uint wavelength, uint color_out) +ccl_device_noinline void svm_node_wavelength( + const KernelGlobals *kg, ShaderData *sd, float *stack, uint wavelength, uint color_out) { float lambda_nm = stack_load_float(stack, wavelength); float ii = (lambda_nm - 380.0f) * (1.0f / 5.0f); // scaled 0..80 diff --git a/intern/cycles/kernel/svm/svm_white_noise.h b/intern/cycles/kernel/svm/svm_white_noise.h index b30d85acaec..0306d2e7b9c 100644 --- a/intern/cycles/kernel/svm/svm_white_noise.h +++ b/intern/cycles/kernel/svm/svm_white_noise.h @@ -16,13 +16,12 @@ CCL_NAMESPACE_BEGIN -ccl_device void svm_node_tex_white_noise(KernelGlobals *kg, - ShaderData *sd, - float *stack, - uint dimensions, - uint inputs_stack_offsets, - uint ouptuts_stack_offsets, - int *offset) +ccl_device_noinline void svm_node_tex_white_noise(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint dimensions, + uint inputs_stack_offsets, + uint ouptuts_stack_offsets) { uint vector_stack_offset, w_stack_offset, value_stack_offset, color_stack_offset; svm_unpack_node_uchar2(inputs_stack_offsets, &vector_stack_offset, &w_stack_offset); diff --git a/intern/cycles/kernel/svm/svm_wireframe.h b/intern/cycles/kernel/svm/svm_wireframe.h index 49158bd86d5..7ec913789d2 100644 --- a/intern/cycles/kernel/svm/svm_wireframe.h +++ b/intern/cycles/kernel/svm/svm_wireframe.h @@ -35,7 +35,7 @@ 
CCL_NAMESPACE_BEGIN /* Wireframe Node */ ccl_device_inline float wireframe( - KernelGlobals *kg, ShaderData *sd, float size, int pixel_size, float3 *P) + const KernelGlobals *kg, ShaderData *sd, float size, int pixel_size, float3 *P) { #ifdef __HAIR__ if (sd->prim != PRIM_NONE && sd->type & PRIMITIVE_ALL_TRIANGLE) @@ -88,7 +88,10 @@ ccl_device_inline float wireframe( return 0.0f; } -ccl_device void svm_node_wireframe(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node) +ccl_device_noinline void svm_node_wireframe(const KernelGlobals *kg, + ShaderData *sd, + float *stack, + uint4 node) { uint in_size = node.y; uint out_fac = node.z; @@ -100,18 +103,7 @@ ccl_device void svm_node_wireframe(KernelGlobals *kg, ShaderData *sd, float *sta int pixel_size = (int)use_pixel_size; /* Calculate wireframe */ -#ifdef __SPLIT_KERNEL__ - /* TODO(sergey): This is because sd is actually a global space, - * which makes it difficult to re-use same wireframe() function. - * - * With OpenCL 2.0 it's possible to avoid this change, but for until - * then we'll be living with such an exception. - */ - float3 P = sd->P; - float f = wireframe(kg, sd, size, pixel_size, &P); -#else float f = wireframe(kg, sd, size, pixel_size, &sd->P); -#endif /* TODO(sergey): Think of faster way to calculate derivatives. 
*/ if (bump_offset == NODE_BUMP_OFFSET_DX) { diff --git a/intern/cycles/render/CMakeLists.txt b/intern/cycles/render/CMakeLists.txt index feead27c5ca..6edb5261b32 100644 --- a/intern/cycles/render/CMakeLists.txt +++ b/intern/cycles/render/CMakeLists.txt @@ -32,10 +32,10 @@ set(SRC camera.cpp colorspace.cpp constant_fold.cpp - coverage.cpp denoising.cpp film.cpp geometry.cpp + gpu_display.cpp graph.cpp hair.cpp image.cpp @@ -54,6 +54,7 @@ set(SRC object.cpp osl.cpp particles.cpp + pass.cpp curves.cpp scene.cpp session.cpp @@ -76,10 +77,10 @@ set(SRC_HEADERS camera.h colorspace.h constant_fold.h - coverage.h denoising.h film.h geometry.h + gpu_display.h graph.h hair.h image.h @@ -95,6 +96,7 @@ set(SRC_HEADERS object.h osl.h particles.h + pass.h procedural.h curves.h scene.h @@ -111,6 +113,7 @@ set(SRC_HEADERS set(LIB cycles_bvh cycles_device + cycles_integrator cycles_subd cycles_util ) diff --git a/intern/cycles/render/background.cpp b/intern/cycles/render/background.cpp index b925e755434..ae6290ac27b 100644 --- a/intern/cycles/render/background.cpp +++ b/intern/cycles/render/background.cpp @@ -34,11 +34,7 @@ NODE_DEFINE(Background) { NodeType *type = NodeType::add("background", create); - SOCKET_FLOAT(ao_factor, "AO Factor", 0.0f); - SOCKET_FLOAT(ao_distance, "AO Distance", FLT_MAX); - SOCKET_BOOLEAN(use_shader, "Use Shader", true); - SOCKET_BOOLEAN(use_ao, "Use AO", false); SOCKET_UINT(visibility, "Visibility", PATH_RAY_ALL_VISIBILITY); SOCKET_BOOLEAN(transparent, "Transparent", false); @@ -80,10 +76,6 @@ void Background::device_update(Device *device, DeviceScene *dscene, Scene *scene /* set shader index and transparent option */ KernelBackground *kbackground = &dscene->data.background; - kbackground->ao_factor = (use_ao) ? 
ao_factor : 0.0f; - kbackground->ao_bounces_factor = ao_factor; - kbackground->ao_distance = ao_distance; - kbackground->transparent = transparent; kbackground->surface_shader = scene->shader_manager->get_shader_id(bg_shader); @@ -138,10 +130,6 @@ void Background::tag_update(Scene *scene) * and to avoid doing unnecessary updates anywhere else. */ tag_use_shader_modified(); } - - if (ao_factor_is_modified() || use_ao_is_modified()) { - scene->integrator->tag_update(scene, Integrator::BACKGROUND_AO_MODIFIED); - } } Shader *Background::get_shader(const Scene *scene) diff --git a/intern/cycles/render/background.h b/intern/cycles/render/background.h index e89ffbc2445..2f7ef0f7737 100644 --- a/intern/cycles/render/background.h +++ b/intern/cycles/render/background.h @@ -32,11 +32,7 @@ class Background : public Node { public: NODE_DECLARE - NODE_SOCKET_API(float, ao_factor) - NODE_SOCKET_API(float, ao_distance) - NODE_SOCKET_API(bool, use_shader) - NODE_SOCKET_API(bool, use_ao) NODE_SOCKET_API(uint, visibility) NODE_SOCKET_API(Shader *, shader) diff --git a/intern/cycles/render/bake.cpp b/intern/cycles/render/bake.cpp index 317a3937cab..54e496caed6 100644 --- a/intern/cycles/render/bake.cpp +++ b/intern/cycles/render/bake.cpp @@ -26,58 +26,8 @@ CCL_NAMESPACE_BEGIN -static int aa_samples(Scene *scene, Object *object, ShaderEvalType type) -{ - if (type == SHADER_EVAL_UV || type == SHADER_EVAL_ROUGHNESS) { - return 1; - } - else if (type == SHADER_EVAL_NORMAL) { - /* Only antialias normal if mesh has bump mapping. 
*/ - if (object->get_geometry()) { - foreach (Node *node, object->get_geometry()->get_used_shaders()) { - Shader *shader = static_cast<Shader *>(node); - if (shader->has_bump) { - return scene->integrator->get_aa_samples(); - } - } - } - - return 1; - } - else { - return scene->integrator->get_aa_samples(); - } -} - -/* Keep it synced with kernel_bake.h logic */ -static int shader_type_to_pass_filter(ShaderEvalType type, int pass_filter) -{ - const int component_flags = pass_filter & - (BAKE_FILTER_DIRECT | BAKE_FILTER_INDIRECT | BAKE_FILTER_COLOR); - - switch (type) { - case SHADER_EVAL_AO: - return BAKE_FILTER_AO; - case SHADER_EVAL_SHADOW: - return BAKE_FILTER_DIRECT; - case SHADER_EVAL_DIFFUSE: - return BAKE_FILTER_DIFFUSE | component_flags; - case SHADER_EVAL_GLOSSY: - return BAKE_FILTER_GLOSSY | component_flags; - case SHADER_EVAL_TRANSMISSION: - return BAKE_FILTER_TRANSMISSION | component_flags; - case SHADER_EVAL_COMBINED: - return pass_filter; - default: - return 0; - } -} - BakeManager::BakeManager() { - type = SHADER_EVAL_BAKE; - pass_filter = 0; - need_update_ = true; } @@ -85,32 +35,14 @@ BakeManager::~BakeManager() { } -bool BakeManager::get_baking() +bool BakeManager::get_baking() const { return !object_name.empty(); } -void BakeManager::set(Scene *scene, - const std::string &object_name_, - ShaderEvalType type_, - int pass_filter_) +void BakeManager::set(Scene *scene, const std::string &object_name_) { object_name = object_name_; - type = type_; - pass_filter = shader_type_to_pass_filter(type_, pass_filter_); - - Pass::add(PASS_BAKE_PRIMITIVE, scene->passes); - Pass::add(PASS_BAKE_DIFFERENTIAL, scene->passes); - - if (type == SHADER_EVAL_UV) { - /* force UV to be available */ - Pass::add(PASS_UV, scene->passes); - } - - /* force use_light_pass to be true if we bake more than just colors */ - if (pass_filter & ~BAKE_FILTER_COLOR) { - Pass::add(PASS_LIGHT, scene->passes); - } /* create device and update scene */ scene->film->tag_modified(); @@ -127,29 
+59,29 @@ void BakeManager::device_update(Device * /*device*/, if (!need_update()) return; - scoped_callback_timer timer([scene](double time) { - if (scene->update_stats) { - scene->update_stats->bake.times.add_entry({"device_update", time}); - } - }); - - KernelIntegrator *kintegrator = &dscene->data.integrator; KernelBake *kbake = &dscene->data.bake; + memset(kbake, 0, sizeof(*kbake)); - kbake->type = type; - kbake->pass_filter = pass_filter; - - int object_index = 0; - foreach (Object *object, scene->objects) { - const Geometry *geom = object->get_geometry(); - if (object->name == object_name && geom->geometry_type == Geometry::MESH) { - kbake->object_index = object_index; - kbake->tri_offset = geom->prim_offset; - kintegrator->aa_samples = aa_samples(scene, object, type); - break; - } + if (!object_name.empty()) { + scoped_callback_timer timer([scene](double time) { + if (scene->update_stats) { + scene->update_stats->bake.times.add_entry({"device_update", time}); + } + }); + + kbake->use = true; - object_index++; + int object_index = 0; + foreach (Object *object, scene->objects) { + const Geometry *geom = object->get_geometry(); + if (object->name == object_name && geom->geometry_type == Geometry::MESH) { + kbake->object_index = object_index; + kbake->tri_offset = geom->prim_offset; + break; + } + + object_index++; + } } need_update_ = false; diff --git a/intern/cycles/render/bake.h b/intern/cycles/render/bake.h index 655b9b1cf7e..39e504490c2 100644 --- a/intern/cycles/render/bake.h +++ b/intern/cycles/render/bake.h @@ -30,8 +30,8 @@ class BakeManager { BakeManager(); ~BakeManager(); - void set(Scene *scene, const std::string &object_name, ShaderEvalType type, int pass_filter); - bool get_baking(); + void set(Scene *scene, const std::string &object_name); + bool get_baking() const; void device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress &progress); void device_free(Device *device, DeviceScene *dscene); @@ -42,8 +42,6 @@ class BakeManager 
{ private: bool need_update_; - ShaderEvalType type; - int pass_filter; std::string object_name; }; diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp index fcfad58995e..1cdae3af7f5 100644 --- a/intern/cycles/render/buffers.cpp +++ b/intern/cycles/render/buffers.cpp @@ -28,537 +28,334 @@ CCL_NAMESPACE_BEGIN -/* Buffer Params */ +/* -------------------------------------------------------------------- + * Convert part information to an index of `BufferParams::pass_offset_`. + */ -BufferParams::BufferParams() +static int pass_type_mode_to_index(PassType pass_type, PassMode mode) { - width = 0; - height = 0; - - full_x = 0; - full_y = 0; - full_width = 0; - full_height = 0; + int index = static_cast<int>(pass_type) * 2; - denoising_data_pass = false; - denoising_clean_pass = false; - denoising_prefiltered_pass = false; + if (mode == PassMode::DENOISED) { + ++index; + } - Pass::add(PASS_COMBINED, passes); + return index; } -void BufferParams::get_offset_stride(int &offset, int &stride) +static int pass_to_index(const BufferPass &pass) { - offset = -(full_x + full_y * width); - stride = width; + return pass_type_mode_to_index(pass.type, pass.mode); } -bool BufferParams::modified(const BufferParams ¶ms) -{ - return !(full_x == params.full_x && full_y == params.full_y && width == params.width && - height == params.height && full_width == params.full_width && - full_height == params.full_height && Pass::equals(passes, params.passes) && - denoising_data_pass == params.denoising_data_pass && - denoising_clean_pass == params.denoising_clean_pass && - denoising_prefiltered_pass == params.denoising_prefiltered_pass); -} +/* -------------------------------------------------------------------- + * Buffer pass. 
+ */ -int BufferParams::get_passes_size() +NODE_DEFINE(BufferPass) { - int size = 0; + NodeType *type = NodeType::add("buffer_pass", create); - for (size_t i = 0; i < passes.size(); i++) - size += passes[i].components; + const NodeEnum *pass_type_enum = Pass::get_type_enum(); + const NodeEnum *pass_mode_enum = Pass::get_mode_enum(); - if (denoising_data_pass) { - size += DENOISING_PASS_SIZE_BASE; - if (denoising_clean_pass) - size += DENOISING_PASS_SIZE_CLEAN; - if (denoising_prefiltered_pass) - size += DENOISING_PASS_SIZE_PREFILTERED; - } + SOCKET_ENUM(type, "Type", *pass_type_enum, PASS_COMBINED); + SOCKET_ENUM(mode, "Mode", *pass_mode_enum, static_cast<int>(PassMode::DENOISED)); + SOCKET_STRING(name, "Name", ustring()); + SOCKET_BOOLEAN(include_albedo, "Include Albedo", false); - return align_up(size, 4); -} + SOCKET_INT(offset, "Offset", -1); -int BufferParams::get_denoising_offset() -{ - int offset = 0; - - for (size_t i = 0; i < passes.size(); i++) - offset += passes[i].components; - - return offset; + return type; } -int BufferParams::get_denoising_prefiltered_offset() +BufferPass::BufferPass() : Node(get_node_type()) { - assert(denoising_prefiltered_pass); - - int offset = get_denoising_offset(); - - offset += DENOISING_PASS_SIZE_BASE; - if (denoising_clean_pass) { - offset += DENOISING_PASS_SIZE_CLEAN; - } - - return offset; } -/* Render Buffer Task */ - -RenderTile::RenderTile() +BufferPass::BufferPass(const Pass *scene_pass) + : Node(get_node_type()), + type(scene_pass->get_type()), + mode(scene_pass->get_mode()), + name(scene_pass->get_name()), + include_albedo(scene_pass->get_include_albedo()) { - x = 0; - y = 0; - w = 0; - h = 0; - - sample = 0; - start_sample = 0; - num_samples = 0; - resolution = 0; - - offset = 0; - stride = 0; - - buffer = 0; - - buffers = NULL; - stealing_state = NO_STEALING; } -/* Render Buffers */ - -RenderBuffers::RenderBuffers(Device *device) - : buffer(device, "RenderBuffers", MEM_READ_WRITE), - map_neighbor_copied(false), - 
render_time(0.0f) +PassInfo BufferPass::get_info() const { + return Pass::get_info(type, include_albedo); } -RenderBuffers::~RenderBuffers() -{ - buffer.free(); -} +/* -------------------------------------------------------------------- + * Buffer Params. + */ -void RenderBuffers::reset(BufferParams ¶ms_) +NODE_DEFINE(BufferParams) { - params = params_; - - /* re-allocate buffer */ - buffer.alloc(params.width * params.get_passes_size(), params.height); - buffer.zero_to_device(); + NodeType *type = NodeType::add("buffer_params", create); + + SOCKET_INT(width, "Width", 0); + SOCKET_INT(height, "Height", 0); + + SOCKET_INT(full_x, "Full X", 0); + SOCKET_INT(full_y, "Full Y", 0); + SOCKET_INT(full_width, "Full Width", 0); + SOCKET_INT(full_height, "Full Height", 0); + + SOCKET_STRING(layer, "Layer", ustring()); + SOCKET_STRING(view, "View", ustring()); + SOCKET_FLOAT(exposure, "Exposure", 1.0f); + SOCKET_BOOLEAN(use_approximate_shadow_catcher, "Use Approximate Shadow Catcher", false); + SOCKET_BOOLEAN(use_transparent_background, "Transparent Background", false); + + /* Notes: + * - Skip passes since they do not follow typical container socket definition. + * Might look into covering those as a socket in the future. + * + * - Skip offset, stride, and pass stride since those can be delivered from the passes and + * rest of the sockets. 
*/ + + return type; } -void RenderBuffers::zero() +BufferParams::BufferParams() : Node(get_node_type()) { - buffer.zero_to_device(); + reset_pass_offset(); } -bool RenderBuffers::copy_from_device() +void BufferParams::update_passes() { - if (!buffer.device_pointer) - return false; - - buffer.copy_from_device(0, params.width * params.get_passes_size(), params.height); - - return true; -} - -bool RenderBuffers::get_denoising_pass_rect( - int type, float exposure, int sample, int components, float *pixels) -{ - if (buffer.data() == NULL) { - return false; - } - - float scale = 1.0f; - float alpha_scale = 1.0f / sample; - if (type == DENOISING_PASS_PREFILTERED_COLOR || type == DENOISING_PASS_CLEAN || - type == DENOISING_PASS_PREFILTERED_INTENSITY) { - scale *= exposure; - } - else if (type == DENOISING_PASS_PREFILTERED_VARIANCE) { - scale *= exposure * exposure * (sample - 1); - } + update_offset_stride(); + reset_pass_offset(); + + pass_stride = 0; + for (const BufferPass &pass : passes) { + if (pass.offset != PASS_UNUSED) { + const int index = pass_to_index(pass); + if (pass_offset_[index] == PASS_UNUSED) { + pass_offset_[index] = pass_stride; + } - int offset; - if (type == DENOISING_PASS_CLEAN) { - /* The clean pass isn't changed by prefiltering, so we use the original one there. */ - offset = type + params.get_denoising_offset(); - scale /= sample; - } - else if (params.denoising_prefiltered_pass) { - offset = type + params.get_denoising_prefiltered_offset(); - } - else { - switch (type) { - case DENOISING_PASS_PREFILTERED_DEPTH: - offset = params.get_denoising_offset() + DENOISING_PASS_DEPTH; - break; - case DENOISING_PASS_PREFILTERED_NORMAL: - offset = params.get_denoising_offset() + DENOISING_PASS_NORMAL; - break; - case DENOISING_PASS_PREFILTERED_ALBEDO: - offset = params.get_denoising_offset() + DENOISING_PASS_ALBEDO; - break; - case DENOISING_PASS_PREFILTERED_COLOR: - /* If we're not saving the prefiltering result, return the original noisy pass. 
*/ - offset = params.get_denoising_offset() + DENOISING_PASS_COLOR; - break; - default: - return false; + pass_stride += pass.get_info().num_components; } - scale /= sample; } +} - int pass_stride = params.get_passes_size(); - int size = params.width * params.height; +void BufferParams::update_passes(const vector<Pass *> &scene_passes) +{ + passes.clear(); - float *in = buffer.data() + offset; + pass_stride = 0; + for (const Pass *scene_pass : scene_passes) { + BufferPass buffer_pass(scene_pass); - if (components == 1) { - for (int i = 0; i < size; i++, in += pass_stride, pixels++) { - pixels[0] = in[0] * scale; + if (scene_pass->is_written()) { + buffer_pass.offset = pass_stride; + pass_stride += scene_pass->get_info().num_components; } - } - else if (components == 3) { - for (int i = 0; i < size; i++, in += pass_stride, pixels += 3) { - pixels[0] = in[0] * scale; - pixels[1] = in[1] * scale; - pixels[2] = in[2] * scale; - } - } - else if (components == 4) { - /* Since the alpha channel is not involved in denoising, output the Combined alpha channel. */ - assert(params.passes[0].type == PASS_COMBINED); - float *in_combined = buffer.data(); - - for (int i = 0; i < size; i++, in += pass_stride, in_combined += pass_stride, pixels += 4) { - float3 val = make_float3(in[0], in[1], in[2]); - if (type == DENOISING_PASS_PREFILTERED_COLOR && params.denoising_prefiltered_pass) { - /* Remove highlight compression from the image. 
*/ - val = color_highlight_uncompress(val); - } - pixels[0] = val.x * scale; - pixels[1] = val.y * scale; - pixels[2] = val.z * scale; - pixels[3] = saturate(in_combined[3] * alpha_scale); + else { + buffer_pass.offset = PASS_UNUSED; } - } - else { - return false; + + passes.emplace_back(std::move(buffer_pass)); } - return true; + update_passes(); } -bool RenderBuffers::get_pass_rect( - const string &name, float exposure, int sample, int components, float *pixels) +void BufferParams::reset_pass_offset() { - if (buffer.data() == NULL) { - return false; + for (int i = 0; i < kNumPassOffsets; ++i) { + pass_offset_[i] = PASS_UNUSED; } +} - float *sample_count = NULL; - if (name == "Combined") { - int sample_offset = 0; - for (size_t j = 0; j < params.passes.size(); j++) { - Pass &pass = params.passes[j]; - if (pass.type != PASS_SAMPLE_COUNT) { - sample_offset += pass.components; - continue; - } - else { - sample_count = buffer.data() + sample_offset; - break; - } - } +int BufferParams::get_pass_offset(PassType pass_type, PassMode mode) const +{ + if (pass_type == PASS_NONE || pass_type == PASS_UNUSED) { + return PASS_UNUSED; } - int pass_offset = 0; - - for (size_t j = 0; j < params.passes.size(); j++) { - Pass &pass = params.passes[j]; + const int index = pass_type_mode_to_index(pass_type, mode); + return pass_offset_[index]; +} - /* Pass is identified by both type and name, multiple of the same type - * may exist with a different name. */ - if (pass.name != name) { - pass_offset += pass.components; - continue; +const BufferPass *BufferParams::find_pass(string_view name) const +{ + for (const BufferPass &pass : passes) { + if (pass.name == name) { + return &pass; } + } - PassType type = pass.type; - - float *in = buffer.data() + pass_offset; - int pass_stride = params.get_passes_size(); - - float scale = (pass.filter) ? 1.0f / (float)sample : 1.0f; - float scale_exposure = (pass.exposure) ? 
scale * exposure : scale; - - int size = params.width * params.height; + return nullptr; +} - if (components == 1 && type == PASS_RENDER_TIME) { - /* Render time is not stored by kernel, but measured per tile. */ - float val = (float)(1000.0 * render_time / (params.width * params.height * sample)); - for (int i = 0; i < size; i++, pixels++) { - pixels[0] = val; - } - } - else if (components == 1) { - assert(pass.components == components); - - /* Scalar */ - if (type == PASS_DEPTH) { - for (int i = 0; i < size; i++, in += pass_stride, pixels++) { - float f = *in; - pixels[0] = (f == 0.0f) ? 1e10f : f * scale_exposure; - } - } - else if (type == PASS_MIST) { - for (int i = 0; i < size; i++, in += pass_stride, pixels++) { - float f = *in; - pixels[0] = saturate(f * scale_exposure); - } - } - else { - for (int i = 0; i < size; i++, in += pass_stride, pixels++) { - float f = *in; - pixels[0] = f * scale_exposure; - } - } - } - else if (components == 3) { - assert(pass.components == 4); - - /* RGBA */ - if (type == PASS_SHADOW) { - for (int i = 0; i < size; i++, in += pass_stride, pixels += 3) { - float4 f = make_float4(in[0], in[1], in[2], in[3]); - float invw = (f.w > 0.0f) ? 
1.0f / f.w : 1.0f; - - pixels[0] = f.x * invw; - pixels[1] = f.y * invw; - pixels[2] = f.z * invw; - } - } - else if (pass.divide_type != PASS_NONE) { - /* RGB lighting passes that need to divide out color */ - pass_offset = 0; - for (size_t k = 0; k < params.passes.size(); k++) { - Pass &color_pass = params.passes[k]; - if (color_pass.type == pass.divide_type) - break; - pass_offset += color_pass.components; - } - - float *in_divide = buffer.data() + pass_offset; - - for (int i = 0; i < size; i++, in += pass_stride, in_divide += pass_stride, pixels += 3) { - float3 f = make_float3(in[0], in[1], in[2]); - float3 f_divide = make_float3(in_divide[0], in_divide[1], in_divide[2]); - - f = safe_divide_even_color(f * exposure, f_divide); - - pixels[0] = f.x; - pixels[1] = f.y; - pixels[2] = f.z; - } - } - else { - /* RGB/vector */ - for (int i = 0; i < size; i++, in += pass_stride, pixels += 3) { - float3 f = make_float3(in[0], in[1], in[2]); - - pixels[0] = f.x * scale_exposure; - pixels[1] = f.y * scale_exposure; - pixels[2] = f.z * scale_exposure; - } - } - } - else if (components == 4) { - assert(pass.components == components); - - /* RGBA */ - if (type == PASS_SHADOW) { - for (int i = 0; i < size; i++, in += pass_stride, pixels += 4) { - float4 f = make_float4(in[0], in[1], in[2], in[3]); - float invw = (f.w > 0.0f) ? 
1.0f / f.w : 1.0f; - - pixels[0] = f.x * invw; - pixels[1] = f.y * invw; - pixels[2] = f.z * invw; - pixels[3] = 1.0f; - } - } - else if (type == PASS_MOTION) { - /* need to normalize by number of samples accumulated for motion */ - pass_offset = 0; - for (size_t k = 0; k < params.passes.size(); k++) { - Pass &color_pass = params.passes[k]; - if (color_pass.type == PASS_MOTION_WEIGHT) - break; - pass_offset += color_pass.components; - } - - float *in_weight = buffer.data() + pass_offset; - - for (int i = 0; i < size; i++, in += pass_stride, in_weight += pass_stride, pixels += 4) { - float4 f = make_float4(in[0], in[1], in[2], in[3]); - float w = in_weight[0]; - float invw = (w > 0.0f) ? 1.0f / w : 0.0f; - - pixels[0] = f.x * invw; - pixels[1] = f.y * invw; - pixels[2] = f.z * invw; - pixels[3] = f.w * invw; - } - } - else if (type == PASS_CRYPTOMATTE) { - for (int i = 0; i < size; i++, in += pass_stride, pixels += 4) { - float4 f = make_float4(in[0], in[1], in[2], in[3]); - /* x and z contain integer IDs, don't rescale them. - y and w contain matte weights, they get scaled. */ - pixels[0] = f.x; - pixels[1] = f.y * scale; - pixels[2] = f.z; - pixels[3] = f.w * scale; - } - } - else { - for (int i = 0; i < size; i++, in += pass_stride, pixels += 4) { - if (sample_count && sample_count[i * pass_stride] < 0.0f) { - scale = (pass.filter) ? -1.0f / (sample_count[i * pass_stride]) : 1.0f; - scale_exposure = (pass.exposure) ? scale * exposure : scale; - } - - float4 f = make_float4(in[0], in[1], in[2], in[3]); - - pixels[0] = f.x * scale_exposure; - pixels[1] = f.y * scale_exposure; - pixels[2] = f.z * scale_exposure; - - /* Clamp since alpha might be > 1.0 due to Russian roulette. 
*/ - pixels[3] = saturate(f.w * scale); - } - } +const BufferPass *BufferParams::find_pass(PassType type, PassMode mode) const +{ + for (const BufferPass &pass : passes) { + if (pass.type == type && pass.mode == mode) { + return &pass; } - - return true; } - return false; + return nullptr; } -bool RenderBuffers::set_pass_rect(PassType type, int components, float *pixels, int samples) +const BufferPass *BufferParams::get_actual_display_pass(PassType type, PassMode mode) const { - if (buffer.data() == NULL) { - return false; - } - - int pass_offset = 0; + const BufferPass *pass = find_pass(type, mode); + return get_actual_display_pass(pass); +} - for (size_t j = 0; j < params.passes.size(); j++) { - Pass &pass = params.passes[j]; +const BufferPass *BufferParams::get_actual_display_pass(const BufferPass *pass) const +{ + if (!pass) { + return nullptr; + } - if (pass.type != type) { - pass_offset += pass.components; - continue; + if (pass->type == PASS_COMBINED) { + const BufferPass *shadow_catcher_matte_pass = find_pass(PASS_SHADOW_CATCHER_MATTE, pass->mode); + if (shadow_catcher_matte_pass) { + pass = shadow_catcher_matte_pass; } + } - float *out = buffer.data() + pass_offset; - int pass_stride = params.get_passes_size(); - int size = params.width * params.height; - - assert(pass.components == components); + return pass; +} - for (int i = 0; i < size; i++, out += pass_stride, pixels += components) { - if (pass.filter) { - /* Scale by the number of samples, inverse of what we do in get_pass_rect. - * A better solution would be to remove the need for set_pass_rect entirely, - * and change baking to bake multiple objects in a tile at once. */ - for (int j = 0; j < components; j++) { - out[j] = pixels[j] * samples; - } - } - else { - /* For non-filtered passes just straight copy, these may contain non-float data. 
*/ - memcpy(out, pixels, sizeof(float) * components); - } - } +void BufferParams::update_offset_stride() +{ + offset = -(full_x + full_y * width); + stride = width; +} +bool BufferParams::modified(const BufferParams &other) const +{ + if (!(width == other.width && height == other.height && full_x == other.full_x && + full_y == other.full_y && full_width == other.full_width && + full_height == other.full_height && offset == other.offset && stride == other.stride && + pass_stride == other.pass_stride && layer == other.layer && view == other.view && + exposure == other.exposure && + use_approximate_shadow_catcher == other.use_approximate_shadow_catcher && + use_transparent_background == other.use_transparent_background)) { return true; } - return false; + return !(passes == other.passes); } -/* Display Buffer */ +/* -------------------------------------------------------------------- + * Render Buffers. + */ -DisplayBuffer::DisplayBuffer(Device *device, bool linear) - : draw_width(0), - draw_height(0), - transparent(true), /* todo: determine from background */ - half_float(linear), - rgba_byte(device, "display buffer byte"), - rgba_half(device, "display buffer half") +RenderBuffers::RenderBuffers(Device *device) : buffer(device, "RenderBuffers", MEM_READ_WRITE) { } -DisplayBuffer::~DisplayBuffer() +RenderBuffers::~RenderBuffers() { - rgba_byte.free(); - rgba_half.free(); + buffer.free(); } -void DisplayBuffer::reset(BufferParams ¶ms_) +void RenderBuffers::reset(const BufferParams ¶ms_) { - draw_width = 0; - draw_height = 0; + DCHECK(params_.pass_stride != -1); params = params_; - /* allocate display pixels */ - if (half_float) { - rgba_half.alloc_to_device(params.width, params.height); - } - else { - rgba_byte.alloc_to_device(params.width, params.height); - } + /* re-allocate buffer */ + buffer.alloc(params.width * params.pass_stride, params.height); } -void DisplayBuffer::draw_set(int width, int height) +void RenderBuffers::zero() { - assert(width <= params.width && 
height <= params.height); + buffer.zero_to_device(); +} - draw_width = width; - draw_height = height; +bool RenderBuffers::copy_from_device() +{ + DCHECK(params.pass_stride != -1); + + if (!buffer.device_pointer) + return false; + + buffer.copy_from_device(0, params.width * params.pass_stride, params.height); + + return true; } -void DisplayBuffer::draw(Device *device, const DeviceDrawParams &draw_params) +void RenderBuffers::copy_to_device() { - if (draw_width != 0 && draw_height != 0) { - device_memory &rgba = (half_float) ? (device_memory &)rgba_half : (device_memory &)rgba_byte; - - device->draw_pixels(rgba, - 0, - draw_width, - draw_height, - params.width, - params.height, - params.full_x, - params.full_y, - params.full_width, - params.full_height, - transparent, - draw_params); - } + buffer.copy_to_device(); } -bool DisplayBuffer::draw_ready() +void render_buffers_host_copy_denoised(RenderBuffers *dst, + const BufferParams &dst_params, + const RenderBuffers *src, + const BufferParams &src_params, + const size_t src_offset) { - return (draw_width != 0 && draw_height != 0); + DCHECK_EQ(dst_params.width, src_params.width); + /* TODO(sergey): More sanity checks to avoid buffer overrun. */ + + /* Create a map of pass ofsets to be copied. + * Assume offsets are different to allow copying passes between buffers with different set of + * passes. 
*/ + + struct { + int dst_offset; + int src_offset; + } pass_offsets[PASS_NUM]; + + int num_passes = 0; + + for (int i = 0; i < PASS_NUM; ++i) { + const PassType pass_type = static_cast<PassType>(i); + + const int dst_pass_offset = dst_params.get_pass_offset(pass_type, PassMode::DENOISED); + if (dst_pass_offset == PASS_UNUSED) { + continue; + } + + const int src_pass_offset = src_params.get_pass_offset(pass_type, PassMode::DENOISED); + if (src_pass_offset == PASS_UNUSED) { + continue; + } + + pass_offsets[num_passes].dst_offset = dst_pass_offset; + pass_offsets[num_passes].src_offset = src_pass_offset; + ++num_passes; + } + + /* Copy passes. */ + /* TODO(sergey): Make it more reusable, allowing implement copy of noisy passes. */ + + const int64_t dst_width = dst_params.width; + const int64_t dst_height = dst_params.height; + const int64_t dst_pass_stride = dst_params.pass_stride; + const int64_t dst_num_pixels = dst_width * dst_height; + + const int64_t src_pass_stride = src_params.pass_stride; + const int64_t src_offset_in_floats = src_offset * src_pass_stride; + + const float *src_pixel = src->buffer.data() + src_offset_in_floats; + float *dst_pixel = dst->buffer.data(); + + for (int i = 0; i < dst_num_pixels; + ++i, src_pixel += src_pass_stride, dst_pixel += dst_pass_stride) { + for (int pass_offset_idx = 0; pass_offset_idx < num_passes; ++pass_offset_idx) { + const int dst_pass_offset = pass_offsets[pass_offset_idx].dst_offset; + const int src_pass_offset = pass_offsets[pass_offset_idx].src_offset; + + /* TODO(sergey): Support non-RGBA passes. 
*/ + dst_pixel[dst_pass_offset + 0] = src_pixel[src_pass_offset + 0]; + dst_pixel[dst_pass_offset + 1] = src_pixel[src_pass_offset + 1]; + dst_pixel[dst_pass_offset + 2] = src_pixel[src_pass_offset + 2]; + dst_pixel[dst_pass_offset + 3] = src_pixel[src_pass_offset + 3]; + } + } } CCL_NAMESPACE_END diff --git a/intern/cycles/render/buffers.h b/intern/cycles/render/buffers.h index 4ffc628bb52..c048234167d 100644 --- a/intern/cycles/render/buffers.h +++ b/intern/cycles/render/buffers.h @@ -18,8 +18,8 @@ #define __BUFFERS_H__ #include "device/device_memory.h" - -#include "render/film.h" +#include "graph/node.h" +#include "render/pass.h" #include "kernel/kernel_types.h" @@ -34,170 +34,156 @@ class Device; struct DeviceDrawParams; struct float4; +/* NOTE: Is not a real scene node. Using Node API for ease of (de)serialization. */ +class BufferPass : public Node { + public: + NODE_DECLARE + + PassType type = PASS_NONE; + PassMode mode = PassMode::NOISY; + ustring name; + bool include_albedo = false; + + int offset = -1; + + BufferPass(); + explicit BufferPass(const Pass *scene_pass); + + BufferPass(BufferPass &&other) noexcept = default; + BufferPass(const BufferPass &other) = default; + + BufferPass &operator=(BufferPass &&other) = default; + BufferPass &operator=(const BufferPass &other) = default; + + ~BufferPass() = default; + + PassInfo get_info() const; + + inline bool operator==(const BufferPass &other) const + { + return type == other.type && mode == other.mode && name == other.name && + include_albedo == other.include_albedo && offset == other.offset; + } + inline bool operator!=(const BufferPass &other) const + { + return !(*this == other); + } +}; + /* Buffer Parameters * Size of render buffer and how it fits in the full image (border render). */ -class BufferParams { +/* NOTE: Is not a real scene node. Using Node API for ease of (de)serialization. 
*/ +class BufferParams : public Node { public: - /* width/height of the physical buffer */ - int width; - int height; - - /* offset into and width/height of the full buffer */ - int full_x; - int full_y; - int full_width; - int full_height; - - /* passes */ - vector<Pass> passes; - bool denoising_data_pass; - /* If only some light path types should be target, an additional pass is needed. */ - bool denoising_clean_pass; - /* When we're prefiltering the passes during rendering, we need to keep both the - * original and the prefiltered data around because neighboring tiles might still - * need the original data. */ - bool denoising_prefiltered_pass; - - /* functions */ - BufferParams(); + NODE_DECLARE - void get_offset_stride(int &offset, int &stride); - bool modified(const BufferParams ¶ms); - int get_passes_size(); - int get_denoising_offset(); - int get_denoising_prefiltered_offset(); -}; + /* Width/height of the physical buffer. */ + int width = 0; + int height = 0; -/* Render Buffers */ + /* Offset into and width/height of the full buffer. */ + int full_x = 0; + int full_y = 0; + int full_width = 0; + int full_height = 0; -class RenderBuffers { - public: - /* buffer parameters */ - BufferParams params; + /* Runtime fields, only valid after `update_passes()` or `update_offset_stride()`. */ + int offset = -1, stride = -1; - /* float buffer */ - device_vector<float> buffer; - bool map_neighbor_copied; - double render_time; + /* Runtime fields, only valid after `update_passes()`. */ + int pass_stride = -1; - explicit RenderBuffers(Device *device); - ~RenderBuffers(); + /* Properties which are used for accessing buffer pixels outside of scene graph. 
*/ + vector<BufferPass> passes; + ustring layer; + ustring view; + float exposure = 1.0f; + bool use_approximate_shadow_catcher = false; + bool use_transparent_background = false; - void reset(BufferParams ¶ms); - void zero(); + BufferParams(); - bool copy_from_device(); - bool get_pass_rect( - const string &name, float exposure, int sample, int components, float *pixels); - bool get_denoising_pass_rect( - int offset, float exposure, int sample, int components, float *pixels); - bool set_pass_rect(PassType type, int components, float *pixels, int samples); -}; + BufferParams(BufferParams &&other) noexcept = default; + BufferParams(const BufferParams &other) = default; -/* Display Buffer - * - * The buffer used for drawing during render, filled by converting the render - * buffers to byte of half float storage */ + BufferParams &operator=(BufferParams &&other) = default; + BufferParams &operator=(const BufferParams &other) = default; -class DisplayBuffer { - public: - /* buffer parameters */ - BufferParams params; - /* dimensions for how much of the buffer is actually ready for display. - * with progressive render we can be using only a subset of the buffer. - * if these are zero, it means nothing can be drawn yet */ - int draw_width, draw_height; - /* draw alpha channel? */ - bool transparent; - /* use half float? */ - bool half_float; - /* byte buffer for converted result */ - device_pixels<uchar4> rgba_byte; - device_pixels<half4> rgba_half; - - DisplayBuffer(Device *device, bool linear = false); - ~DisplayBuffer(); - - void reset(BufferParams ¶ms); - - void draw_set(int width, int height); - void draw(Device *device, const DeviceDrawParams &draw_params); - bool draw_ready(); -}; + ~BufferParams() = default; -/* Render Tile - * Rendering task on a buffer */ + /* Pre-calculate all fields which depends on the passes. 
+ * + * When the scene passes are given, the buffer passes will be created from them and stored in + * this params, and then params are updated for those passes. + * The `update_passes()` without parameters updates offsets and stries which are stored outside + * of the passes. */ + void update_passes(); + void update_passes(const vector<Pass *> &scene_passes); -class RenderTile { - public: - typedef enum { PATH_TRACE = (1 << 0), BAKE = (1 << 1), DENOISE = (1 << 2) } Task; + /* Returns PASS_UNUSED if there is no such pass in the buffer. */ + int get_pass_offset(PassType type, PassMode mode = PassMode::NOISY) const; - Task task; - int x, y, w, h; - int start_sample; - int num_samples; - int sample; - int resolution; - int offset; - int stride; - int tile_index; + /* Returns nullptr if pass with given name does not exist. */ + const BufferPass *find_pass(string_view name) const; + const BufferPass *find_pass(PassType type, PassMode mode = PassMode::NOISY) const; - device_ptr buffer; - int device_size; + /* Get display pass from its name. + * Will do special logic to replace combined pass with shadow catcher matte. */ + const BufferPass *get_actual_display_pass(PassType type, PassMode mode = PassMode::NOISY) const; + const BufferPass *get_actual_display_pass(const BufferPass *pass) const; - typedef enum { NO_STEALING = 0, CAN_BE_STOLEN = 1, WAS_STOLEN = 2 } StealingState; - StealingState stealing_state; + void update_offset_stride(); - RenderBuffers *buffers; + bool modified(const BufferParams &other) const; - RenderTile(); + protected: + void reset_pass_offset(); - int4 bounds() const - { - return make_int4(x, /* xmin */ - y, /* ymin */ - x + w, /* xmax */ - y + h); /* ymax */ - } + /* Multipled by 2 to be able to store noisy and denoised pass types. */ + static constexpr int kNumPassOffsets = PASS_NUM * 2; + + /* Indexed by an index derived from pass type and mode, indicates offset of the corresponding + * pass in the buffer. 
+ * If there are multiple passes with same type and mode contains lowest offset of all of them. */ + int pass_offset_[kNumPassOffsets]; }; -/* Render Tile Neighbors - * Set of neighboring tiles used for denoising. Tile order: - * 0 1 2 - * 3 4 5 - * 6 7 8 */ +/* Render Buffers */ -class RenderTileNeighbors { +class RenderBuffers { public: - static const int SIZE = 9; - static const int CENTER = 4; + /* buffer parameters */ + BufferParams params; - RenderTile tiles[SIZE]; - RenderTile target; + /* float buffer */ + device_vector<float> buffer; - RenderTileNeighbors(const RenderTile ¢er) - { - tiles[CENTER] = center; - } + explicit RenderBuffers(Device *device); + ~RenderBuffers(); - int4 bounds() const - { - return make_int4(tiles[3].x, /* xmin */ - tiles[1].y, /* ymin */ - tiles[5].x + tiles[5].w, /* xmax */ - tiles[7].y + tiles[7].h); /* ymax */ - } + void reset(const BufferParams ¶ms); + void zero(); - void set_bounds_from_center() - { - tiles[3].x = tiles[CENTER].x; - tiles[1].y = tiles[CENTER].y; - tiles[5].x = tiles[CENTER].x + tiles[CENTER].w; - tiles[7].y = tiles[CENTER].y + tiles[CENTER].h; - } + bool copy_from_device(); + void copy_to_device(); }; +/* Copy denoised passes form source to destination. + * + * Buffer parameters are provided explicitly, allowing to copy pixelks between render buffers which + * content corresponds to a render result at a non-unit resolution divider. + * + * `src_offset` allows to offset source pixel index which is used when a fraction of the source + * buffer is to be copied. + * + * Copy happens of the number of pixels in the destination. 
*/ +void render_buffers_host_copy_denoised(RenderBuffers *dst, + const BufferParams &dst_params, + const RenderBuffers *src, + const BufferParams &src_params, + const size_t src_offset = 0); + CCL_NAMESPACE_END #endif /* __BUFFERS_H__ */ diff --git a/intern/cycles/render/camera.cpp b/intern/cycles/render/camera.cpp index 327f166f9d8..8b69c971991 100644 --- a/intern/cycles/render/camera.cpp +++ b/intern/cycles/render/camera.cpp @@ -33,9 +33,9 @@ /* needed for calculating differentials */ // clang-format off -#include "kernel/kernel_compat_cpu.h" -#include "kernel/split/kernel_split_data.h" -#include "kernel/kernel_globals.h" +#include "kernel/device/cpu/compat.h" +#include "kernel/device/cpu/globals.h" + #include "kernel/kernel_projection.h" #include "kernel/kernel_differential.h" #include "kernel/kernel_montecarlo.h" @@ -169,7 +169,6 @@ Camera::Camera() : Node(get_node_type()) width = 1024; height = 512; - resolution = 1; use_perspective_motion = false; @@ -455,7 +454,6 @@ void Camera::update(Scene *scene) /* render size */ kcam->width = width; kcam->height = height; - kcam->resolution = resolution; /* store differentials */ kcam->dx = float3_to_float4(dx); @@ -776,9 +774,11 @@ float Camera::world_to_raster_size(float3 P) &ray); #endif - differential_transfer(&ray.dP, ray.dP, ray.D, ray.dD, ray.D, dist); + /* TODO: would it help to use more accurate differentials here? 
*/ + differential3 dP; + differential_transfer_compact(&dP, ray.dP, ray.D, ray.dD, ray.D, dist); - return max(len(ray.dP.dx), len(ray.dP.dy)); + return max(len(dP.dx), len(dP.dy)); } return res; @@ -789,12 +789,11 @@ bool Camera::use_motion() const return motion.size() > 1; } -void Camera::set_screen_size_and_resolution(int width_, int height_, int resolution_) +void Camera::set_screen_size(int width_, int height_) { - if (width_ != width || height_ != height || resolution_ != resolution) { + if (width_ != width || height_ != height) { width = width_; height = height_; - resolution = resolution_; tag_modified(); } } diff --git a/intern/cycles/render/camera.h b/intern/cycles/render/camera.h index 5abb4750764..cb8ecac1a7e 100644 --- a/intern/cycles/render/camera.h +++ b/intern/cycles/render/camera.h @@ -199,7 +199,6 @@ class Camera : public Node { private: int width; int height; - int resolution; public: /* functions */ @@ -225,7 +224,7 @@ class Camera : public Node { int motion_step(float time) const; bool use_motion() const; - void set_screen_size_and_resolution(int width_, int height_, int resolution_); + void set_screen_size(int width_, int height_); private: /* Private utility functions. */ diff --git a/intern/cycles/render/coverage.cpp b/intern/cycles/render/coverage.cpp deleted file mode 100644 index 99d4daa6961..00000000000 --- a/intern/cycles/render/coverage.cpp +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Copyright 2018 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "render/coverage.h" -#include "render/buffers.h" - -#include "kernel/kernel_compat_cpu.h" -#include "kernel/kernel_types.h" -#include "kernel/split/kernel_split_data.h" - -#include "kernel/kernel_globals.h" -#include "kernel/kernel_id_passes.h" - -#include "util/util_map.h" - -CCL_NAMESPACE_BEGIN - -static bool crypomatte_comp(const pair<float, float> &i, const pair<float, float> j) -{ - return i.first > j.first; -} - -void Coverage::finalize() -{ - int pass_offset = 0; - if (kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) { - finalize_buffer(coverage_object, pass_offset); - pass_offset += kernel_data.film.cryptomatte_depth * 4; - } - if (kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) { - finalize_buffer(coverage_material, pass_offset); - pass_offset += kernel_data.film.cryptomatte_depth * 4; - } - if (kernel_data.film.cryptomatte_passes & CRYPT_ASSET) { - finalize_buffer(coverage_asset, pass_offset); - } -} - -void Coverage::init_path_trace() -{ - kg->coverage_object = kg->coverage_material = kg->coverage_asset = NULL; - - if (kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE) { - if (kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) { - coverage_object.clear(); - coverage_object.resize(tile.w * tile.h); - } - if (kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) { - coverage_material.clear(); - coverage_material.resize(tile.w * tile.h); - } - if (kernel_data.film.cryptomatte_passes & CRYPT_ASSET) { - coverage_asset.clear(); - coverage_asset.resize(tile.w * tile.h); - } - } -} - -void Coverage::init_pixel(int x, int y) -{ - if (kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE) { - const int pixel_index = tile.w * (y - tile.y) + x - tile.x; - if (kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) { - kg->coverage_object = &coverage_object[pixel_index]; - } - if (kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) { - 
kg->coverage_material = &coverage_material[pixel_index]; - } - if (kernel_data.film.cryptomatte_passes & CRYPT_ASSET) { - kg->coverage_asset = &coverage_asset[pixel_index]; - } - } -} - -void Coverage::finalize_buffer(vector<CoverageMap> &coverage, const int pass_offset) -{ - if (kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE) { - flatten_buffer(coverage, pass_offset); - } - else { - sort_buffer(pass_offset); - } -} - -void Coverage::flatten_buffer(vector<CoverageMap> &coverage, const int pass_offset) -{ - /* Sort the coverage map and write it to the output */ - int pixel_index = 0; - int pass_stride = tile.buffers->params.get_passes_size(); - for (int y = 0; y < tile.h; ++y) { - for (int x = 0; x < tile.w; ++x) { - const CoverageMap &pixel = coverage[pixel_index]; - if (!pixel.empty()) { - /* buffer offset */ - int index = x + y * tile.stride; - float *buffer = (float *)tile.buffer + index * pass_stride; - - /* sort the cryptomatte pixel */ - vector<pair<float, float>> sorted_pixel; - for (CoverageMap::const_iterator it = pixel.begin(); it != pixel.end(); ++it) { - sorted_pixel.push_back(std::make_pair(it->second, it->first)); - } - sort(sorted_pixel.begin(), sorted_pixel.end(), crypomatte_comp); - int num_slots = 2 * (kernel_data.film.cryptomatte_depth); - if (sorted_pixel.size() > num_slots) { - float leftover = 0.0f; - for (vector<pair<float, float>>::iterator it = sorted_pixel.begin() + num_slots; - it != sorted_pixel.end(); - ++it) { - leftover += it->first; - } - sorted_pixel[num_slots - 1].first += leftover; - } - int limit = min(num_slots, sorted_pixel.size()); - for (int i = 0; i < limit; ++i) { - kernel_write_id_slots(buffer + kernel_data.film.pass_cryptomatte + pass_offset, - 2 * (kernel_data.film.cryptomatte_depth), - sorted_pixel[i].second, - sorted_pixel[i].first); - } - } - ++pixel_index; - } - } -} - -void Coverage::sort_buffer(const int pass_offset) -{ - /* Sort the coverage map and write it to the output */ - int pass_stride = 
tile.buffers->params.get_passes_size(); - for (int y = 0; y < tile.h; ++y) { - for (int x = 0; x < tile.w; ++x) { - /* buffer offset */ - int index = x + y * tile.stride; - float *buffer = (float *)tile.buffer + index * pass_stride; - kernel_sort_id_slots(buffer + kernel_data.film.pass_cryptomatte + pass_offset, - 2 * (kernel_data.film.cryptomatte_depth)); - } - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/render/coverage.h b/intern/cycles/render/coverage.h deleted file mode 100644 index 12182c614da..00000000000 --- a/intern/cycles/render/coverage.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright 2018 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __COVERAGE_H__ -#define __COVERAGE_H__ - -#include "util/util_map.h" -#include "util/util_vector.h" - -CCL_NAMESPACE_BEGIN - -struct KernelGlobals; -class RenderTile; - -typedef unordered_map<float, float> CoverageMap; - -class Coverage { - public: - Coverage(KernelGlobals *kg_, RenderTile &tile_) : kg(kg_), tile(tile_) - { - } - void init_path_trace(); - void init_pixel(int x, int y); - void finalize(); - - private: - vector<CoverageMap> coverage_object; - vector<CoverageMap> coverage_material; - vector<CoverageMap> coverage_asset; - KernelGlobals *kg; - RenderTile &tile; - void finalize_buffer(vector<CoverageMap> &coverage, const int pass_offset); - void flatten_buffer(vector<CoverageMap> &coverage, const int pass_offset); - void sort_buffer(const int pass_offset); -}; - -CCL_NAMESPACE_END - -#endif /* __COVERAGE_H__ */ diff --git a/intern/cycles/render/denoising.cpp b/intern/cycles/render/denoising.cpp index ddbe7484800..bcf8d3fa204 100644 --- a/intern/cycles/render/denoising.cpp +++ b/intern/cycles/render/denoising.cpp @@ -16,15 +16,17 @@ #include "render/denoising.h" -#include "kernel/filter/filter_defines.h" +#if 0 -#include "util/util_foreach.h" -#include "util/util_map.h" -#include "util/util_system.h" -#include "util/util_task.h" -#include "util/util_time.h" +# include "kernel/filter/filter_defines.h" -#include <OpenImageIO/filesystem.h> +# include "util/util_foreach.h" +# include "util/util_map.h" +# include "util/util_system.h" +# include "util/util_task.h" +# include "util/util_time.h" + +# include <OpenImageIO/filesystem.h> CCL_NAMESPACE_BEGIN @@ -225,7 +227,7 @@ bool DenoiseImageLayer::match_channels(int neighbor, /* Denoise Task */ DenoiseTask::DenoiseTask(Device *device, - Denoiser *denoiser, + DenoiserPipeline *denoiser, int frame, const vector<int> &neighbor_frames) : denoiser(denoiser), @@ -386,7 +388,6 @@ void DenoiseTask::create_task(DeviceTask &task) task.denoising = denoiser->params; task.denoising.type = DENOISER_NLM; 
task.denoising.use = true; - task.denoising.store_passes = false; task.denoising_from_render = false; task.denoising_frames.resize(neighbor_frames.size()); @@ -863,7 +864,7 @@ bool DenoiseImage::save_output(const string &out_filepath, string &error) /* File pattern handling and outer loop over frames */ -Denoiser::Denoiser(DeviceInfo &device_info) +DenoiserPipeline::DenoiserPipeline(DeviceInfo &device_info) { samples_override = 0; tile_size = make_int2(64, 64); @@ -876,18 +877,16 @@ Denoiser::Denoiser(DeviceInfo &device_info) /* Initialize device. */ device = Device::create(device_info, stats, profiler, true); - DeviceRequestedFeatures req; - req.use_denoising = true; - device->load_kernels(req); + device->load_kernels(KERNEL_FEATURE_DENOISING); } -Denoiser::~Denoiser() +DenoiserPipeline::~DenoiserPipeline() { delete device; TaskScheduler::exit(); } -bool Denoiser::run() +bool DenoiserPipeline::run() { assert(input.size() == output.size()); @@ -931,3 +930,5 @@ bool Denoiser::run() } CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/render/denoising.h b/intern/cycles/render/denoising.h index c1b4d0a5596..097cc570d06 100644 --- a/intern/cycles/render/denoising.h +++ b/intern/cycles/render/denoising.h @@ -17,27 +17,31 @@ #ifndef __DENOISING_H__ #define __DENOISING_H__ -#include "device/device.h" -#include "device/device_denoising.h" +#if 0 -#include "render/buffers.h" +/* TODO(sergey): Make it explicit and clear when something is a denoiser, its pipeline or + * parameters. Currently it is an annoying mixture of terms used interchangeably. 
*/ -#include "util/util_string.h" -#include "util/util_unique_ptr.h" -#include "util/util_vector.h" +# include "device/device.h" -#include <OpenImageIO/imageio.h> +# include "render/buffers.h" + +# include "util/util_string.h" +# include "util/util_unique_ptr.h" +# include "util/util_vector.h" + +# include <OpenImageIO/imageio.h> OIIO_NAMESPACE_USING CCL_NAMESPACE_BEGIN -/* Denoiser */ +/* Denoiser pipeline */ -class Denoiser { +class DenoiserPipeline { public: - Denoiser(DeviceInfo &device_info); - ~Denoiser(); + DenoiserPipeline(DeviceInfo &device_info); + ~DenoiserPipeline(); bool run(); @@ -155,7 +159,10 @@ class DenoiseImage { class DenoiseTask { public: - DenoiseTask(Device *device, Denoiser *denoiser, int frame, const vector<int> &neighbor_frames); + DenoiseTask(Device *device, + DenoiserPipeline *denoiser, + int frame, + const vector<int> &neighbor_frames); ~DenoiseTask(); /* Task stages */ @@ -168,7 +175,7 @@ class DenoiseTask { protected: /* Denoiser parameters and device */ - Denoiser *denoiser; + DenoiserPipeline *denoiser; Device *device; /* Frame number to be denoised */ @@ -204,4 +211,6 @@ class DenoiseTask { CCL_NAMESPACE_END +#endif + #endif /* __DENOISING_H__ */ diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp index 5df396394c4..8e14b338bd3 100644 --- a/intern/cycles/render/film.cpp +++ b/intern/cycles/render/film.cpp @@ -16,9 +16,12 @@ #include "render/film.h" #include "device/device.h" +#include "render/background.h" +#include "render/bake.h" #include "render/camera.h" #include "render/integrator.h" #include "render/mesh.h" +#include "render/object.h" #include "render/scene.h" #include "render/stats.h" #include "render/tables.h" @@ -31,261 +34,6 @@ CCL_NAMESPACE_BEGIN -/* Pass */ - -static bool compare_pass_order(const Pass &a, const Pass &b) -{ - if (a.components == b.components) - return (a.type < b.type); - return (a.components > b.components); -} - -static NodeEnum *get_pass_type_enum() -{ - static NodeEnum 
pass_type_enum; - pass_type_enum.insert("combined", PASS_COMBINED); - pass_type_enum.insert("depth", PASS_DEPTH); - pass_type_enum.insert("normal", PASS_NORMAL); - pass_type_enum.insert("uv", PASS_UV); - pass_type_enum.insert("object_id", PASS_OBJECT_ID); - pass_type_enum.insert("material_id", PASS_MATERIAL_ID); - pass_type_enum.insert("motion", PASS_MOTION); - pass_type_enum.insert("motion_weight", PASS_MOTION_WEIGHT); - pass_type_enum.insert("render_time", PASS_RENDER_TIME); - pass_type_enum.insert("cryptomatte", PASS_CRYPTOMATTE); - pass_type_enum.insert("aov_color", PASS_AOV_COLOR); - pass_type_enum.insert("aov_value", PASS_AOV_VALUE); - pass_type_enum.insert("adaptive_aux_buffer", PASS_ADAPTIVE_AUX_BUFFER); - pass_type_enum.insert("sample_count", PASS_SAMPLE_COUNT); - pass_type_enum.insert("mist", PASS_MIST); - pass_type_enum.insert("emission", PASS_EMISSION); - pass_type_enum.insert("background", PASS_BACKGROUND); - pass_type_enum.insert("ambient_occlusion", PASS_AO); - pass_type_enum.insert("shadow", PASS_SHADOW); - pass_type_enum.insert("diffuse_direct", PASS_DIFFUSE_DIRECT); - pass_type_enum.insert("diffuse_indirect", PASS_DIFFUSE_INDIRECT); - pass_type_enum.insert("diffuse_color", PASS_DIFFUSE_COLOR); - pass_type_enum.insert("glossy_direct", PASS_GLOSSY_DIRECT); - pass_type_enum.insert("glossy_indirect", PASS_GLOSSY_INDIRECT); - pass_type_enum.insert("glossy_color", PASS_GLOSSY_COLOR); - pass_type_enum.insert("transmission_direct", PASS_TRANSMISSION_DIRECT); - pass_type_enum.insert("transmission_indirect", PASS_TRANSMISSION_INDIRECT); - pass_type_enum.insert("transmission_color", PASS_TRANSMISSION_COLOR); - pass_type_enum.insert("volume_direct", PASS_VOLUME_DIRECT); - pass_type_enum.insert("volume_indirect", PASS_VOLUME_INDIRECT); - pass_type_enum.insert("bake_primitive", PASS_BAKE_PRIMITIVE); - pass_type_enum.insert("bake_differential", PASS_BAKE_DIFFERENTIAL); - - return &pass_type_enum; -} - -NODE_DEFINE(Pass) -{ - NodeType *type = 
NodeType::add("pass", create); - - NodeEnum *pass_type_enum = get_pass_type_enum(); - SOCKET_ENUM(type, "Type", *pass_type_enum, PASS_COMBINED); - SOCKET_STRING(name, "Name", ustring()); - - return type; -} - -Pass::Pass() : Node(get_node_type()) -{ -} - -void Pass::add(PassType type, vector<Pass> &passes, const char *name) -{ - for (size_t i = 0; i < passes.size(); i++) { - if (passes[i].type != type) { - continue; - } - - /* An empty name is used as a placeholder to signal that any pass of - * that type is fine (because the content always is the same). - * This is important to support divide_type: If the pass that has a - * divide_type is added first, a pass for divide_type with an empty - * name will be added. Then, if a matching pass with a name is later - * requested, the existing placeholder will be renamed to that. - * If the divide_type is explicitly allocated with a name first and - * then again as part of another pass, the second one will just be - * skipped because that type already exists. */ - - /* If no name is specified, any pass of the correct type will match. */ - if (name == NULL) { - return; - } - - /* If we already have a placeholder pass, rename that one. */ - if (passes[i].name.empty()) { - passes[i].name = name; - return; - } - - /* If neither existing nor requested pass have placeholder name, they - * must match. 
*/ - if (name == passes[i].name) { - return; - } - } - - Pass pass; - - pass.type = type; - pass.filter = true; - pass.exposure = false; - pass.divide_type = PASS_NONE; - if (name) { - pass.name = name; - } - - switch (type) { - case PASS_NONE: - pass.components = 0; - break; - case PASS_COMBINED: - pass.components = 4; - pass.exposure = true; - break; - case PASS_DEPTH: - pass.components = 1; - pass.filter = false; - break; - case PASS_MIST: - pass.components = 1; - break; - case PASS_NORMAL: - pass.components = 4; - break; - case PASS_UV: - pass.components = 4; - break; - case PASS_MOTION: - pass.components = 4; - pass.divide_type = PASS_MOTION_WEIGHT; - break; - case PASS_MOTION_WEIGHT: - pass.components = 1; - break; - case PASS_OBJECT_ID: - case PASS_MATERIAL_ID: - pass.components = 1; - pass.filter = false; - break; - - case PASS_EMISSION: - case PASS_BACKGROUND: - pass.components = 4; - pass.exposure = true; - break; - case PASS_AO: - pass.components = 4; - break; - case PASS_SHADOW: - pass.components = 4; - pass.exposure = false; - break; - case PASS_LIGHT: - /* This isn't a real pass, used by baking to see whether - * light data is needed or not. - * - * Set components to 0 so pass sort below happens in a - * determined way. - */ - pass.components = 0; - break; - case PASS_RENDER_TIME: - /* This pass is handled entirely on the host side. 
*/ - pass.components = 0; - break; - - case PASS_DIFFUSE_COLOR: - case PASS_GLOSSY_COLOR: - case PASS_TRANSMISSION_COLOR: - pass.components = 4; - break; - case PASS_DIFFUSE_DIRECT: - case PASS_DIFFUSE_INDIRECT: - pass.components = 4; - pass.exposure = true; - pass.divide_type = PASS_DIFFUSE_COLOR; - break; - case PASS_GLOSSY_DIRECT: - case PASS_GLOSSY_INDIRECT: - pass.components = 4; - pass.exposure = true; - pass.divide_type = PASS_GLOSSY_COLOR; - break; - case PASS_TRANSMISSION_DIRECT: - case PASS_TRANSMISSION_INDIRECT: - pass.components = 4; - pass.exposure = true; - pass.divide_type = PASS_TRANSMISSION_COLOR; - break; - case PASS_VOLUME_DIRECT: - case PASS_VOLUME_INDIRECT: - pass.components = 4; - pass.exposure = true; - break; - case PASS_CRYPTOMATTE: - pass.components = 4; - break; - case PASS_ADAPTIVE_AUX_BUFFER: - pass.components = 4; - break; - case PASS_SAMPLE_COUNT: - pass.components = 1; - pass.exposure = false; - break; - case PASS_AOV_COLOR: - pass.components = 4; - break; - case PASS_AOV_VALUE: - pass.components = 1; - break; - case PASS_BAKE_PRIMITIVE: - case PASS_BAKE_DIFFERENTIAL: - pass.components = 4; - pass.exposure = false; - pass.filter = false; - break; - default: - assert(false); - break; - } - - passes.push_back(pass); - - /* Order from by components, to ensure alignment so passes with size 4 - * come first and then passes with size 1. Note this must use stable sort - * so cryptomatte passes remain in the right order. 
*/ - stable_sort(&passes[0], &passes[0] + passes.size(), compare_pass_order); - - if (pass.divide_type != PASS_NONE) - Pass::add(pass.divide_type, passes); -} - -bool Pass::equals(const vector<Pass> &A, const vector<Pass> &B) -{ - if (A.size() != B.size()) - return false; - - for (int i = 0; i < A.size(); i++) - if (A[i].type != B[i].type || A[i].name != B[i].name) - return false; - - return true; -} - -bool Pass::contains(const vector<Pass> &passes, PassType type) -{ - for (size_t i = 0; i < passes.size(); i++) - if (passes[i].type == type) - return true; - - return false; -} - /* Pixel Filter */ static float filter_func_box(float /*v*/, float /*width*/) @@ -368,17 +116,11 @@ NODE_DEFINE(Film) SOCKET_FLOAT(mist_depth, "Mist Depth", 100.0f); SOCKET_FLOAT(mist_falloff, "Mist Falloff", 1.0f); - SOCKET_BOOLEAN(denoising_data_pass, "Generate Denoising Data Pass", false); - SOCKET_BOOLEAN(denoising_clean_pass, "Generate Denoising Clean Pass", false); - SOCKET_BOOLEAN(denoising_prefiltered_pass, "Generate Denoising Prefiltered Pass", false); - SOCKET_INT(denoising_flags, "Denoising Flags", 0); - SOCKET_BOOLEAN(use_adaptive_sampling, "Use Adaptive Sampling", false); - - SOCKET_BOOLEAN(use_light_visibility, "Use Light Visibility", false); - - NodeEnum *pass_type_enum = get_pass_type_enum(); + const NodeEnum *pass_type_enum = Pass::get_type_enum(); SOCKET_ENUM(display_pass, "Display Pass", *pass_type_enum, PASS_COMBINED); + SOCKET_BOOLEAN(show_active_pixels, "Show Active Pixels", false); + static NodeEnum cryptomatte_passes_enum; cryptomatte_passes_enum.insert("none", CRYPT_NONE); cryptomatte_passes_enum.insert("object", CRYPT_OBJECT); @@ -389,15 +131,13 @@ NODE_DEFINE(Film) SOCKET_INT(cryptomatte_depth, "Cryptomatte Depth", 0); + SOCKET_BOOLEAN(use_approximate_shadow_catcher, "Use Approximate Shadow Catcher", false); + return type; } -Film::Film() : Node(get_node_type()) +Film::Film() : Node(get_node_type()), filter_table_offset_(TABLE_OFFSET_INVALID) { - 
use_light_visibility = false; - filter_table_offset = TABLE_OFFSET_INVALID; - cryptomatte_passes = CRYPT_NONE; - display_pass = PASS_COMBINED; } Film::~Film() @@ -406,7 +146,8 @@ Film::~Film() void Film::add_default(Scene *scene) { - Pass::add(PASS_COMBINED, scene->passes); + Pass *pass = scene->create_node<Pass>(); + pass->set_type(PASS_COMBINED); } void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene) @@ -426,50 +167,77 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene) /* update __data */ kfilm->exposure = exposure; + kfilm->pass_alpha_threshold = pass_alpha_threshold; kfilm->pass_flag = 0; - kfilm->display_pass_stride = -1; - kfilm->display_pass_components = 0; - kfilm->display_divide_pass_stride = -1; - kfilm->use_display_exposure = false; - kfilm->use_display_pass_alpha = (display_pass == PASS_COMBINED); + kfilm->use_approximate_shadow_catcher = get_use_approximate_shadow_catcher(); kfilm->light_pass_flag = 0; kfilm->pass_stride = 0; - kfilm->use_light_pass = use_light_visibility; - kfilm->pass_aov_value_num = 0; - kfilm->pass_aov_color_num = 0; + + /* Mark with PASS_UNUSED to avoid mask test in the kernel. */ + kfilm->pass_background = PASS_UNUSED; + kfilm->pass_emission = PASS_UNUSED; + kfilm->pass_ao = PASS_UNUSED; + kfilm->pass_diffuse_direct = PASS_UNUSED; + kfilm->pass_diffuse_indirect = PASS_UNUSED; + kfilm->pass_glossy_direct = PASS_UNUSED; + kfilm->pass_glossy_indirect = PASS_UNUSED; + kfilm->pass_transmission_direct = PASS_UNUSED; + kfilm->pass_transmission_indirect = PASS_UNUSED; + kfilm->pass_volume_direct = PASS_UNUSED; + kfilm->pass_volume_indirect = PASS_UNUSED; + kfilm->pass_volume_direct = PASS_UNUSED; + kfilm->pass_volume_indirect = PASS_UNUSED; + kfilm->pass_shadow = PASS_UNUSED; + + /* Mark passes as unused so that the kernel knows the pass is inaccessible. 
*/ + kfilm->pass_denoising_normal = PASS_UNUSED; + kfilm->pass_denoising_albedo = PASS_UNUSED; + kfilm->pass_sample_count = PASS_UNUSED; + kfilm->pass_adaptive_aux_buffer = PASS_UNUSED; + kfilm->pass_shadow_catcher = PASS_UNUSED; + kfilm->pass_shadow_catcher_sample_count = PASS_UNUSED; + kfilm->pass_shadow_catcher_matte = PASS_UNUSED; bool have_cryptomatte = false; + bool have_aov_color = false; + bool have_aov_value = false; for (size_t i = 0; i < scene->passes.size(); i++) { - Pass &pass = scene->passes[i]; + const Pass *pass = scene->passes[i]; - if (pass.type == PASS_NONE) { + if (pass->get_type() == PASS_NONE || !pass->is_written()) { + continue; + } + + if (pass->get_mode() == PassMode::DENOISED) { + /* Generally we only storing offsets of the noisy passes. The display pass is an exception + * since it is a read operation and not a write. */ + kfilm->pass_stride += pass->get_info().num_components; continue; } /* Can't do motion pass if no motion vectors are available. */ - if (pass.type == PASS_MOTION || pass.type == PASS_MOTION_WEIGHT) { + if (pass->get_type() == PASS_MOTION || pass->get_type() == PASS_MOTION_WEIGHT) { if (scene->need_motion() != Scene::MOTION_PASS) { - kfilm->pass_stride += pass.components; + kfilm->pass_stride += pass->get_info().num_components; continue; } } - int pass_flag = (1 << (pass.type % 32)); - if (pass.type <= PASS_CATEGORY_MAIN_END) { - kfilm->pass_flag |= pass_flag; - } - else if (pass.type <= PASS_CATEGORY_LIGHT_END) { - kfilm->use_light_pass = 1; + const int pass_flag = (1 << (pass->get_type() % 32)); + if (pass->get_type() <= PASS_CATEGORY_LIGHT_END) { kfilm->light_pass_flag |= pass_flag; } + else if (pass->get_type() <= PASS_CATEGORY_DATA_END) { + kfilm->pass_flag |= pass_flag; + } else { - assert(pass.type <= PASS_CATEGORY_BAKE_END); + assert(pass->get_type() <= PASS_CATEGORY_BAKE_END); } - switch (pass.type) { + switch (pass->get_type()) { case PASS_COMBINED: kfilm->pass_combined = kfilm->pass_stride; break; @@ -479,6 
+247,12 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene) case PASS_NORMAL: kfilm->pass_normal = kfilm->pass_stride; break; + case PASS_POSITION: + kfilm->pass_position = kfilm->pass_stride; + break; + case PASS_ROUGHNESS: + kfilm->pass_roughness = kfilm->pass_stride; + break; case PASS_UV: kfilm->pass_uv = kfilm->pass_stride; break; @@ -511,9 +285,6 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene) kfilm->pass_shadow = kfilm->pass_stride; break; - case PASS_LIGHT: - break; - case PASS_DIFFUSE_COLOR: kfilm->pass_diffuse_color = kfilm->pass_stride; break; @@ -563,78 +334,56 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene) kfilm->pass_stride; have_cryptomatte = true; break; + + case PASS_DENOISING_NORMAL: + kfilm->pass_denoising_normal = kfilm->pass_stride; + break; + case PASS_DENOISING_ALBEDO: + kfilm->pass_denoising_albedo = kfilm->pass_stride; + break; + + case PASS_SHADOW_CATCHER: + kfilm->pass_shadow_catcher = kfilm->pass_stride; + break; + case PASS_SHADOW_CATCHER_SAMPLE_COUNT: + kfilm->pass_shadow_catcher_sample_count = kfilm->pass_stride; + break; + case PASS_SHADOW_CATCHER_MATTE: + kfilm->pass_shadow_catcher_matte = kfilm->pass_stride; + break; + case PASS_ADAPTIVE_AUX_BUFFER: kfilm->pass_adaptive_aux_buffer = kfilm->pass_stride; break; case PASS_SAMPLE_COUNT: kfilm->pass_sample_count = kfilm->pass_stride; break; + case PASS_AOV_COLOR: - if (kfilm->pass_aov_color_num == 0) { + if (!have_aov_color) { kfilm->pass_aov_color = kfilm->pass_stride; + have_aov_color = true; } - kfilm->pass_aov_color_num++; break; case PASS_AOV_VALUE: - if (kfilm->pass_aov_value_num == 0) { + if (!have_aov_value) { kfilm->pass_aov_value = kfilm->pass_stride; + have_aov_value = true; } - kfilm->pass_aov_value_num++; break; default: assert(false); break; } - if (pass.type == display_pass) { - kfilm->display_pass_stride = kfilm->pass_stride; - kfilm->display_pass_components = pass.components; - 
kfilm->use_display_exposure = pass.exposure && (kfilm->exposure != 1.0f); - } - else if (pass.type == PASS_DIFFUSE_COLOR || pass.type == PASS_TRANSMISSION_COLOR || - pass.type == PASS_GLOSSY_COLOR) { - kfilm->display_divide_pass_stride = kfilm->pass_stride; - } - - kfilm->pass_stride += pass.components; - } - - kfilm->pass_denoising_data = 0; - kfilm->pass_denoising_clean = 0; - kfilm->denoising_flags = 0; - if (denoising_data_pass) { - kfilm->pass_denoising_data = kfilm->pass_stride; - kfilm->pass_stride += DENOISING_PASS_SIZE_BASE; - kfilm->denoising_flags = denoising_flags; - if (denoising_clean_pass) { - kfilm->pass_denoising_clean = kfilm->pass_stride; - kfilm->pass_stride += DENOISING_PASS_SIZE_CLEAN; - kfilm->use_light_pass = 1; - } - if (denoising_prefiltered_pass) { - kfilm->pass_stride += DENOISING_PASS_SIZE_PREFILTERED; - } - } - - kfilm->pass_stride = align_up(kfilm->pass_stride, 4); - - /* When displaying the normal/uv pass in the viewport we need to disable - * transparency. - * - * We also don't need to perform light accumulations. Later we want to optimize this to suppress - * light calculations. 
*/ - if (display_pass == PASS_NORMAL || display_pass == PASS_UV) { - kfilm->use_light_pass = 0; - } - else { - kfilm->pass_alpha_threshold = pass_alpha_threshold; + kfilm->pass_stride += pass->get_info().num_components; } /* update filter table */ vector<float> table = filter_table(filter_type, filter_width); - scene->lookup_tables->remove_table(&filter_table_offset); - filter_table_offset = scene->lookup_tables->add_table(dscene, table); - kfilm->filter_table_offset = (int)filter_table_offset; + scene->lookup_tables->remove_table(&filter_table_offset_); + filter_table_offset_ = scene->lookup_tables->add_table(dscene, table); + kfilm->filter_table_offset = (int)filter_table_offset_; /* mist pass parameters */ kfilm->mist_start = mist_start; @@ -644,79 +393,298 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene) kfilm->cryptomatte_passes = cryptomatte_passes; kfilm->cryptomatte_depth = cryptomatte_depth; - pass_stride = kfilm->pass_stride; - denoising_data_offset = kfilm->pass_denoising_data; - denoising_clean_offset = kfilm->pass_denoising_clean; - clear_modified(); } void Film::device_free(Device * /*device*/, DeviceScene * /*dscene*/, Scene *scene) { - scene->lookup_tables->remove_table(&filter_table_offset); + scene->lookup_tables->remove_table(&filter_table_offset_); } -void Film::tag_passes_update(Scene *scene, const vector<Pass> &passes_, bool update_passes) +int Film::get_aov_offset(Scene *scene, string name, bool &is_color) { - if (Pass::contains(scene->passes, PASS_UV) != Pass::contains(passes_, PASS_UV)) { - scene->geometry_manager->tag_update(scene, GeometryManager::UV_PASS_NEEDED); + int offset_color = 0, offset_value = 0; + foreach (const Pass *pass, scene->passes) { + if (pass->get_name() == name) { + if (pass->get_type() == PASS_AOV_VALUE) { + is_color = false; + return offset_value; + } + else if (pass->get_type() == PASS_AOV_COLOR) { + is_color = true; + return offset_color; + } + } + + if (pass->get_type() == 
PASS_AOV_VALUE) { + offset_value += pass->get_info().num_components; + } + else if (pass->get_type() == PASS_AOV_COLOR) { + offset_color += pass->get_info().num_components; + } + } + + return -1; +} + +void Film::update_passes(Scene *scene, bool add_sample_count_pass) +{ + const Background *background = scene->background; + const BakeManager *bake_manager = scene->bake_manager; + const ObjectManager *object_manager = scene->object_manager; + Integrator *integrator = scene->integrator; + + if (!is_modified() && !object_manager->need_update() && !integrator->is_modified()) { + return; + } + + /* Remove auto generated passes and recreate them. */ + remove_auto_passes(scene); + + /* Display pass for viewport. */ + const PassType display_pass = get_display_pass(); + add_auto_pass(scene, display_pass); + + /* Assumption is that a combined pass always exists for now, for example + * adaptive sampling is always based on a combined pass. But we should + * try to lift this limitation in the future for faster rendering of + * individual passes. */ + if (display_pass != PASS_COMBINED) { + add_auto_pass(scene, PASS_COMBINED); + } + + /* Create passes needed for adaptive sampling. */ + const AdaptiveSampling adaptive_sampling = integrator->get_adaptive_sampling(); + if (adaptive_sampling.use) { + add_auto_pass(scene, PASS_SAMPLE_COUNT); + add_auto_pass(scene, PASS_ADAPTIVE_AUX_BUFFER); + } + + /* Create passes needed for denoising. */ + const bool use_denoise = integrator->get_use_denoise(); + if (use_denoise) { + if (integrator->get_use_denoise_pass_normal()) { + add_auto_pass(scene, PASS_DENOISING_NORMAL); + } + if (integrator->get_use_denoise_pass_albedo()) { + add_auto_pass(scene, PASS_DENOISING_ALBEDO); + } + } + + /* Create passes for shadow catcher. 
*/ + if (scene->has_shadow_catcher()) { + const bool need_background = get_use_approximate_shadow_catcher() && + !background->get_transparent(); + + add_auto_pass(scene, PASS_SHADOW_CATCHER); + add_auto_pass(scene, PASS_SHADOW_CATCHER_SAMPLE_COUNT); + add_auto_pass(scene, PASS_SHADOW_CATCHER_MATTE); + + if (need_background) { + add_auto_pass(scene, PASS_BACKGROUND); + } + } + else if (Pass::contains(scene->passes, PASS_SHADOW_CATCHER)) { + add_auto_pass(scene, PASS_SHADOW_CATCHER); + add_auto_pass(scene, PASS_SHADOW_CATCHER_SAMPLE_COUNT); + } + + const vector<Pass *> passes_immutable = scene->passes; + for (const Pass *pass : passes_immutable) { + const PassInfo info = pass->get_info(); + /* Add utility passes needed to generate some light passes. */ + if (info.divide_type != PASS_NONE) { + add_auto_pass(scene, info.divide_type); + } + if (info.direct_type != PASS_NONE) { + add_auto_pass(scene, info.direct_type); + } + if (info.indirect_type != PASS_NONE) { + add_auto_pass(scene, info.indirect_type); + } + + /* NOTE: Enable all denoised passes when storage is requested. + * This way it is possible to tweak denoiser parameters later on. */ + if (info.support_denoise && use_denoise) { + add_auto_pass(scene, pass->get_type(), PassMode::DENOISED); + } + } + + if (bake_manager->get_baking()) { + add_auto_pass(scene, PASS_BAKE_PRIMITIVE, "BakePrimitive"); + add_auto_pass(scene, PASS_BAKE_DIFFERENTIAL, "BakeDifferential"); + } + + if (add_sample_count_pass) { + if (!Pass::contains(scene->passes, PASS_SAMPLE_COUNT)) { + add_auto_pass(scene, PASS_SAMPLE_COUNT); + } + } + + /* Remove duplicates and initialize internal pass info. */ + finalize_passes(scene, use_denoise); + /* Flush scene updates. 
*/ + const bool have_uv_pass = Pass::contains(scene->passes, PASS_UV); + const bool have_motion_pass = Pass::contains(scene->passes, PASS_MOTION); + const bool have_ao_pass = Pass::contains(scene->passes, PASS_AO); + + if (have_uv_pass != prev_have_uv_pass) { + scene->geometry_manager->tag_update(scene, GeometryManager::UV_PASS_NEEDED); foreach (Shader *shader, scene->shaders) shader->need_update_uvs = true; } - else if (Pass::contains(scene->passes, PASS_MOTION) != Pass::contains(passes_, PASS_MOTION)) { + if (have_motion_pass != prev_have_motion_pass) { scene->geometry_manager->tag_update(scene, GeometryManager::MOTION_PASS_NEEDED); } - else if (Pass::contains(scene->passes, PASS_AO) != Pass::contains(passes_, PASS_AO)) { + if (have_ao_pass != prev_have_ao_pass) { scene->integrator->tag_update(scene, Integrator::AO_PASS_MODIFIED); } - if (update_passes) { - scene->passes = passes_; + prev_have_uv_pass = have_uv_pass; + prev_have_motion_pass = have_motion_pass; + prev_have_ao_pass = have_ao_pass; + + tag_modified(); + + /* Debug logging. */ + if (VLOG_IS_ON(2)) { + VLOG(2) << "Effective scene passes:"; + for (const Pass *pass : scene->passes) { + VLOG(2) << "- " << *pass; + } } } -int Film::get_aov_offset(Scene *scene, string name, bool &is_color) +void Film::add_auto_pass(Scene *scene, PassType type, const char *name) { - int num_color = 0, num_value = 0; - foreach (const Pass &pass, scene->passes) { - if (pass.type == PASS_AOV_COLOR) { - num_color++; - } - else if (pass.type == PASS_AOV_VALUE) { - num_value++; + add_auto_pass(scene, type, PassMode::NOISY, name); +} + +void Film::add_auto_pass(Scene *scene, PassType type, PassMode mode, const char *name) +{ + Pass *pass = new Pass(); + pass->set_type(type); + pass->set_mode(mode); + pass->set_name(ustring((name) ? 
name : "")); + pass->is_auto_ = true; + + pass->set_owner(scene); + scene->passes.push_back(pass); +} + +void Film::remove_auto_passes(Scene *scene) +{ + /* Remove all passes which were automatically created. */ + vector<Pass *> new_passes; + + for (Pass *pass : scene->passes) { + if (!pass->is_auto_) { + new_passes.push_back(pass); } else { - continue; - } - - if (pass.name == name) { - is_color = (pass.type == PASS_AOV_COLOR); - return (is_color ? num_color : num_value) - 1; + delete pass; } } - return -1; + scene->passes = new_passes; } -int Film::get_pass_stride() const +static bool compare_pass_order(const Pass *a, const Pass *b) { - return pass_stride; -} + const int num_components_a = a->get_info().num_components; + const int num_components_b = b->get_info().num_components; -int Film::get_denoising_data_offset() const -{ - return denoising_data_offset; + if (num_components_a == num_components_b) { + return (a->get_type() < b->get_type()); + } + + return num_components_a > num_components_b; } -int Film::get_denoising_clean_offset() const +void Film::finalize_passes(Scene *scene, const bool use_denoise) { - return denoising_clean_offset; + /* Remove duplicate passes. */ + vector<Pass *> new_passes; + + for (Pass *pass : scene->passes) { + /* Disable denoising on passes if denoising is disabled, or if the + * pass does not support it. */ + pass->set_mode((use_denoise && pass->get_info().support_denoise) ? pass->get_mode() : + PassMode::NOISY); + + /* Merge duplicate passes. */ + bool duplicate_found = false; + for (Pass *new_pass : new_passes) { + /* If different type or denoising, don't merge. */ + if (new_pass->get_type() != pass->get_type() || new_pass->get_mode() != pass->get_mode()) { + continue; + } + + /* If both passes have a name and the names are different, don't merge. + * If either pass has a name, we'll use that name. 
*/ + if (!pass->get_name().empty() && !new_pass->get_name().empty() && + pass->get_name() != new_pass->get_name()) { + continue; + } + + if (!pass->get_name().empty() && new_pass->get_name().empty()) { + new_pass->set_name(pass->get_name()); + } + + new_pass->is_auto_ &= pass->is_auto_; + duplicate_found = true; + break; + } + + if (!duplicate_found) { + new_passes.push_back(pass); + } + else { + delete pass; + } + } + + /* Order from by components and type, This is required to for AOVs and cryptomatte passes, + * which the kernel assumes to be in order. Note this must use stable sort so cryptomatte + * passes remain in the right order. */ + stable_sort(new_passes.begin(), new_passes.end(), compare_pass_order); + + scene->passes = new_passes; } -size_t Film::get_filter_table_offset() const +uint Film::get_kernel_features(const Scene *scene) const { - return filter_table_offset; + uint kernel_features = 0; + + for (const Pass *pass : scene->passes) { + if (!pass->is_written()) { + continue; + } + + const PassType pass_type = pass->get_type(); + const PassMode pass_mode = pass->get_mode(); + + if (pass_mode == PassMode::DENOISED || pass_type == PASS_DENOISING_NORMAL || + pass_type == PASS_DENOISING_ALBEDO) { + kernel_features |= KERNEL_FEATURE_DENOISING; + } + + if (pass_type != PASS_NONE && pass_type != PASS_COMBINED && + pass_type <= PASS_CATEGORY_LIGHT_END) { + kernel_features |= KERNEL_FEATURE_LIGHT_PASSES; + + if (pass_type == PASS_SHADOW) { + kernel_features |= KERNEL_FEATURE_SHADOW_PASS; + } + } + + if (pass_type == PASS_AO) { + kernel_features |= KERNEL_FEATURE_NODE_RAYTRACE; + } + } + + return kernel_features; } CCL_NAMESPACE_END diff --git a/intern/cycles/render/film.h b/intern/cycles/render/film.h index 462a7275491..5d327353361 100644 --- a/intern/cycles/render/film.h +++ b/intern/cycles/render/film.h @@ -17,6 +17,7 @@ #ifndef __FILM_H__ #define __FILM_H__ +#include "render/pass.h" #include "util/util_string.h" #include "util/util_vector.h" @@ -38,36 
+39,15 @@ typedef enum FilterType { FILTER_NUM_TYPES, } FilterType; -class Pass : public Node { - public: - NODE_DECLARE - - Pass(); - - PassType type; - int components; - bool filter; - bool exposure; - PassType divide_type; - ustring name; - - static void add(PassType type, vector<Pass> &passes, const char *name = NULL); - static bool equals(const vector<Pass> &A, const vector<Pass> &B); - static bool contains(const vector<Pass> &passes, PassType); -}; - class Film : public Node { public: NODE_DECLARE NODE_SOCKET_API(float, exposure) - NODE_SOCKET_API(bool, denoising_data_pass) - NODE_SOCKET_API(bool, denoising_clean_pass) - NODE_SOCKET_API(bool, denoising_prefiltered_pass) - NODE_SOCKET_API(int, denoising_flags) NODE_SOCKET_API(float, pass_alpha_threshold) NODE_SOCKET_API(PassType, display_pass) + NODE_SOCKET_API(bool, show_active_pixels) NODE_SOCKET_API(FilterType, filter_type) NODE_SOCKET_API(float, filter_width) @@ -76,17 +56,18 @@ class Film : public Node { NODE_SOCKET_API(float, mist_depth) NODE_SOCKET_API(float, mist_falloff) - NODE_SOCKET_API(bool, use_light_visibility) NODE_SOCKET_API(CryptomatteType, cryptomatte_passes) NODE_SOCKET_API(int, cryptomatte_depth) - NODE_SOCKET_API(bool, use_adaptive_sampling) + /* Approximate shadow catcher pass into its matte pass, so that both artificial objects and + * shadows can be alpha-overed onto a backdrop. 
*/ + NODE_SOCKET_API(bool, use_approximate_shadow_catcher) private: - int pass_stride; - int denoising_data_offset; - int denoising_clean_offset; - size_t filter_table_offset; + size_t filter_table_offset_; + bool prev_have_uv_pass = false; + bool prev_have_motion_pass = false; + bool prev_have_ao_pass = false; public: Film(); @@ -98,14 +79,20 @@ class Film : public Node { void device_update(Device *device, DeviceScene *dscene, Scene *scene); void device_free(Device *device, DeviceScene *dscene, Scene *scene); - void tag_passes_update(Scene *scene, const vector<Pass> &passes_, bool update_passes = true); - int get_aov_offset(Scene *scene, string name, bool &is_color); - int get_pass_stride() const; - int get_denoising_data_offset() const; - int get_denoising_clean_offset() const; - size_t get_filter_table_offset() const; + /* Update passes so that they contain all passes required for the configured functionality. + * + * If `add_sample_count_pass` is true then the SAMPLE_COUNT pass is ensured to be added. */ + void update_passes(Scene *scene, bool add_sample_count_pass); + + uint get_kernel_features(const Scene *scene) const; + + private: + void add_auto_pass(Scene *scene, PassType type, const char *name = nullptr); + void add_auto_pass(Scene *scene, PassType type, PassMode mode, const char *name = nullptr); + void remove_auto_passes(Scene *scene); + void finalize_passes(Scene *scene, const bool use_denoise); }; CCL_NAMESPACE_END diff --git a/intern/cycles/render/geometry.cpp b/intern/cycles/render/geometry.cpp index 7ec1d2d9abb..6804a006fe6 100644 --- a/intern/cycles/render/geometry.cpp +++ b/intern/cycles/render/geometry.cpp @@ -215,6 +215,12 @@ void Geometry::compute_bvh( msg += string_printf("%s %u/%u", name.c_str(), (uint)(n + 1), (uint)total); Object object; + + /* Ensure all visibility bits are set at the geometry level BVH. In + * the object level BVH is where actual visibility is tested. 
*/ + object.set_is_shadow_catcher(true); + object.set_visibility(~0); + object.set_geometry(this); vector<Geometry *> geometry; @@ -315,7 +321,7 @@ void GeometryManager::update_osl_attributes(Device *device, { #ifdef WITH_OSL /* for OSL, a hash map is used to lookup the attribute by name. */ - OSLGlobals *og = (OSLGlobals *)device->osl_memory(); + OSLGlobals *og = (OSLGlobals *)device->get_cpu_osl_memory(); og->object_name_map.clear(); og->attribute_map.clear(); @@ -1855,8 +1861,8 @@ void GeometryManager::device_update(Device *device, }); Camera *dicing_camera = scene->dicing_camera; - dicing_camera->set_screen_size_and_resolution( - dicing_camera->get_full_width(), dicing_camera->get_full_height(), 1); + dicing_camera->set_screen_size(dicing_camera->get_full_width(), + dicing_camera->get_full_height()); dicing_camera->update(scene); size_t i = 0; @@ -2157,7 +2163,7 @@ void GeometryManager::device_free(Device *device, DeviceScene *dscene, bool forc dscene->data.bvh.bvh_layout = BVH_LAYOUT_NONE; #ifdef WITH_OSL - OSLGlobals *og = (OSLGlobals *)device->osl_memory(); + OSLGlobals *og = (OSLGlobals *)device->get_cpu_osl_memory(); if (og) { og->object_name_map.clear(); diff --git a/intern/cycles/render/gpu_display.cpp b/intern/cycles/render/gpu_display.cpp new file mode 100644 index 00000000000..a8f0cc50583 --- /dev/null +++ b/intern/cycles/render/gpu_display.cpp @@ -0,0 +1,227 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "render/gpu_display.h" + +#include "render/buffers.h" +#include "util/util_logging.h" + +CCL_NAMESPACE_BEGIN + +void GPUDisplay::reset(const BufferParams &buffer_params) +{ + thread_scoped_lock lock(mutex_); + + const GPUDisplayParams old_params = params_; + + params_.offset = make_int2(buffer_params.full_x, buffer_params.full_y); + params_.full_size = make_int2(buffer_params.full_width, buffer_params.full_height); + params_.size = make_int2(buffer_params.width, buffer_params.height); + + /* If the parameters did change tag texture as unusable. This avoids drawing old texture content + * in an updated configuration of the viewport. For example, avoids drawing old frame when render + * border did change. + * If the parameters did not change, allow drawing the current state of the texture, which will + * not count as an up-to-date redraw. This will avoid flickering when doping camera navigation by + * showing a previously rendered frame for until the new one is ready. */ + if (old_params.modified(params_)) { + texture_state_.is_usable = false; + } + + texture_state_.is_outdated = true; +} + +void GPUDisplay::mark_texture_updated() +{ + texture_state_.is_outdated = false; + texture_state_.is_usable = true; +} + +/* -------------------------------------------------------------------- + * Update procedure. + */ + +bool GPUDisplay::update_begin(int texture_width, int texture_height) +{ + DCHECK(!update_state_.is_active); + + if (update_state_.is_active) { + LOG(ERROR) << "Attempt to re-activate update process."; + return false; + } + + /* Get parameters within a mutex lock, to avoid reset() modifying them at the same time. + * The update itself is non-blocking however, for better performance and to avoid + * potential deadlocks due to locks held by the subclass. 
*/ + GPUDisplayParams params; + { + thread_scoped_lock lock(mutex_); + params = params_; + texture_state_.size = make_int2(texture_width, texture_height); + } + + if (!do_update_begin(params, texture_width, texture_height)) { + LOG(ERROR) << "GPUDisplay implementation could not begin update."; + return false; + } + + update_state_.is_active = true; + + return true; +} + +void GPUDisplay::update_end() +{ + DCHECK(update_state_.is_active); + + if (!update_state_.is_active) { + LOG(ERROR) << "Attempt to deactivate inactive update process."; + return; + } + + do_update_end(); + + update_state_.is_active = false; +} + +int2 GPUDisplay::get_texture_size() const +{ + return texture_state_.size; +} + +/* -------------------------------------------------------------------- + * Texture update from CPU buffer. + */ + +void GPUDisplay::copy_pixels_to_texture( + const half4 *rgba_pixels, int texture_x, int texture_y, int pixels_width, int pixels_height) +{ + DCHECK(update_state_.is_active); + + if (!update_state_.is_active) { + LOG(ERROR) << "Attempt to copy pixels data outside of GPUDisplay update."; + return; + } + + mark_texture_updated(); + do_copy_pixels_to_texture(rgba_pixels, texture_x, texture_y, pixels_width, pixels_height); +} + +/* -------------------------------------------------------------------- + * Texture buffer mapping. 
+ */ + +half4 *GPUDisplay::map_texture_buffer() +{ + DCHECK(!texture_buffer_state_.is_mapped); + DCHECK(update_state_.is_active); + + if (texture_buffer_state_.is_mapped) { + LOG(ERROR) << "Attempt to re-map an already mapped texture buffer."; + return nullptr; + } + + if (!update_state_.is_active) { + LOG(ERROR) << "Attempt to copy pixels data outside of GPUDisplay update."; + return nullptr; + } + + half4 *mapped_rgba_pixels = do_map_texture_buffer(); + + if (mapped_rgba_pixels) { + texture_buffer_state_.is_mapped = true; + } + + return mapped_rgba_pixels; +} + +void GPUDisplay::unmap_texture_buffer() +{ + DCHECK(texture_buffer_state_.is_mapped); + + if (!texture_buffer_state_.is_mapped) { + LOG(ERROR) << "Attempt to unmap non-mapped texture buffer."; + return; + } + + texture_buffer_state_.is_mapped = false; + + mark_texture_updated(); + do_unmap_texture_buffer(); +} + +/* -------------------------------------------------------------------- + * Graphics interoperability. + */ + +DeviceGraphicsInteropDestination GPUDisplay::graphics_interop_get() +{ + DCHECK(!texture_buffer_state_.is_mapped); + DCHECK(update_state_.is_active); + + if (texture_buffer_state_.is_mapped) { + LOG(ERROR) + << "Attempt to use graphics interoperability mode while the texture buffer is mapped."; + return DeviceGraphicsInteropDestination(); + } + + if (!update_state_.is_active) { + LOG(ERROR) << "Attempt to use graphics interoperability outside of GPUDisplay update."; + return DeviceGraphicsInteropDestination(); + } + + /* Assume that interop will write new values to the texture. */ + mark_texture_updated(); + + return do_graphics_interop_get(); +} + +void GPUDisplay::graphics_interop_activate() +{ +} + +void GPUDisplay::graphics_interop_deactivate() +{ +} + +/* -------------------------------------------------------------------- + * Drawing. + */ + +bool GPUDisplay::draw() +{ + /* Get parameters within a mutex lock, to avoid reset() modifying them at the same time. 
+ * The drawing itself is non-blocking however, for better performance and to avoid + * potential deadlocks due to locks held by the subclass. */ + GPUDisplayParams params; + bool is_usable; + bool is_outdated; + + { + thread_scoped_lock lock(mutex_); + params = params_; + is_usable = texture_state_.is_usable; + is_outdated = texture_state_.is_outdated; + } + + if (is_usable) { + do_draw(params); + } + + return !is_outdated; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/render/gpu_display.h b/intern/cycles/render/gpu_display.h new file mode 100644 index 00000000000..cbe347895a1 --- /dev/null +++ b/intern/cycles/render/gpu_display.h @@ -0,0 +1,247 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "device/device_graphics_interop.h" +#include "util/util_half.h" +#include "util/util_thread.h" +#include "util/util_types.h" + +CCL_NAMESPACE_BEGIN + +class BufferParams; + +/* GPUDisplay class takes care of drawing render result in a viewport. The render result is stored + * in a GPU-side texture, which is updated from a path tracer and drawn by an application. + * + * The base GPUDisplay does some special texture state tracking, which allows render Session to + * make decisions on whether reset for an updated state is possible or not. This state should only + * be tracked in a base class and a particular implementation should not worry about it. 
+ * + * The subclasses should only implement the pure virtual methods, which allows them to not worry + * about parent method calls, which helps them to be as small and reliable as possible. */ + +class GPUDisplayParams { + public: + /* Offset of the display within a viewport. + * For example, set to a lower-bottom corner of border render in Blender's viewport. */ + int2 offset = make_int2(0, 0); + + /* Full viewport size. + * + * NOTE: Is not affected by the resolution divider. */ + int2 full_size = make_int2(0, 0); + + /* Effective vieport size. + * In the case of border render, size of the border rectangle. + * + * NOTE: Is not affected by the resolution divider. */ + int2 size = make_int2(0, 0); + + bool modified(const GPUDisplayParams &other) const + { + return !(offset == other.offset && full_size == other.full_size && size == other.size); + } +}; + +class GPUDisplay { + public: + GPUDisplay() = default; + virtual ~GPUDisplay() = default; + + /* Reset the display for the new state of render session. Is called whenever session is reset, + * which happens on changes like viewport navigation or viewport dimension change. + * + * This call will configure parameters for a changed buffer and reset the texture state. */ + void reset(const BufferParams &buffer_params); + + const GPUDisplayParams &get_params() const + { + return params_; + } + + /* -------------------------------------------------------------------- + * Update procedure. + * + * These calls indicates a desire of the caller to update content of the displayed texture. */ + + /* Returns true when update is ready. Update should be finished with update_end(). + * + * If false is returned then no update is possible, and no update_end() call is needed. + * + * The texture width and height denotes an actual resolution of the underlying render result. 
*/ + bool update_begin(int texture_width, int texture_height); + + void update_end(); + + /* Get currently configured texture size of the display (as configured by `update_begin()`. */ + int2 get_texture_size() const; + + /* -------------------------------------------------------------------- + * Texture update from CPU buffer. + * + * NOTE: The GPUDisplay should be marked for an update being in process with `update_begin()`. + * + * Most portable implementation, which must be supported by all platforms. Might not be the most + * efficient one. + */ + + /* Copy buffer of rendered pixels of a given size into a given position of the texture. + * + * This function does not acquire a lock. The reason for this is is to allow use of this function + * for partial updates from different devices. In this case the caller will acquire the lock + * once, update all the slices and release + * the lock once. This will ensure that draw() will never use partially updated texture. */ + void copy_pixels_to_texture( + const half4 *rgba_pixels, int texture_x, int texture_y, int pixels_width, int pixels_height); + + /* -------------------------------------------------------------------- + * Texture buffer mapping. + * + * This functionality is used to update GPU-side texture content without need to maintain CPU + * side buffer on the caller. + * + * NOTE: The GPUDisplay should be marked for an update being in process with `update_begin()`. + * + * NOTE: Texture buffer can not be mapped while graphics interopeability is active. This means + * that `map_texture_buffer()` is not allowed between `graphics_interop_begin()` and + * `graphics_interop_end()` calls. + */ + + /* Map pixels memory form texture to a buffer available for write from CPU. Width and height will + * define a requested size of the texture to write to. + * Upon success a non-null pointer is returned and the texture buffer is to be unmapped. 
+ * If an error happens during mapping, or if mapoping is not supported by this GPU display a + * null pointer is returned and the buffer is NOT to be unmapped. + * + * NOTE: Usually the implementation will rely on a GPU context of some sort, and the GPU context + * is often can not be bound to two threads simultaneously, and can not be released from a + * different thread. This means that the mapping API should be used from the single thread only, + */ + half4 *map_texture_buffer(); + void unmap_texture_buffer(); + + /* -------------------------------------------------------------------- + * Graphics interoperability. + * + * A special code path which allows to update texture content directly from the GPU compute + * device. Complementary part of DeviceGraphicsInterop. + * + * NOTE: Graphics interoperability can not be used while the texture buffer is mapped. This means + * that `graphics_interop_get()` is not allowed between `map_texture_buffer()` and + * `unmap_texture_buffer()` calls. */ + + /* Get GPUDisplay graphics interoperability information which acts as a destination for the + * device API. */ + DeviceGraphicsInteropDestination graphics_interop_get(); + + /* (De)activate GPU display for graphics interoperability outside of regular display udpate + * routines. */ + virtual void graphics_interop_activate(); + virtual void graphics_interop_deactivate(); + + /* -------------------------------------------------------------------- + * Drawing. + */ + + /* Clear the texture by filling it with all zeroes. + * + * This call might happen in parallel with draw, but can never happen in parallel with the + * update. + * + * The actual zero-ing can be deferred to a later moment. What is important is that after clear + * and before pixels update the drawing texture will be fully empty, and that partial update + * after clear will write new pixel values for an updating area, leaving everything else zeroed. 
+ * + * If the GPU display supports graphics interoperability then the zeroing the display is to be + * delegated to the device via the `DeviceGraphicsInteropDestination`. */ + virtual void clear() = 0; + + /* Draw the current state of the texture. + * + * Returns true if this call did draw an updated state of the texture. */ + bool draw(); + + protected: + /* Implementation-specific calls which subclasses are to implement. + * These `do_foo()` method corresponds to their `foo()` calls, but they are purely virtual to + * simplify their particular implementation. */ + virtual bool do_update_begin(const GPUDisplayParams ¶ms, + int texture_width, + int texture_height) = 0; + virtual void do_update_end() = 0; + + virtual void do_copy_pixels_to_texture(const half4 *rgba_pixels, + int texture_x, + int texture_y, + int pixels_width, + int pixels_height) = 0; + + virtual half4 *do_map_texture_buffer() = 0; + virtual void do_unmap_texture_buffer() = 0; + + /* Note that this might be called in parallel to do_update_begin() and do_update_end(), + * the subclass is responsible for appropriate mutex locks to avoid multiple threads + * editing and drawing the texture at the same time. */ + virtual void do_draw(const GPUDisplayParams ¶ms) = 0; + + virtual DeviceGraphicsInteropDestination do_graphics_interop_get() = 0; + + private: + thread_mutex mutex_; + GPUDisplayParams params_; + + /* Mark texture as its content has been updated. + * Used from places which knows that the texture content has been brough up-to-date, so that the + * drawing knows whether it can be performed, and whether drawing happenned with an up-to-date + * texture state. */ + void mark_texture_updated(); + + /* State of the update process. */ + struct { + /* True when update is in process, indicated by `update_begin()` / `update_end()`. */ + bool is_active = false; + } update_state_; + + /* State of the texture, which is needed for an integration with render session and interactive + * updates and navigation. 
*/ + struct { + /* Denotes whether possibly existing state of GPU side texture is still usable. + * It will not be usable in cases like render border did change (in this case we don't want + * previous texture to be rendered at all). + * + * However, if only navigation or object in scene did change, then the outdated state of the + * texture is still usable for draw, preventing display viewport flickering on navigation and + * object modifications. */ + bool is_usable = false; + + /* Texture is considered outdated after `reset()` until the next call of + * `copy_pixels_to_texture()`. */ + bool is_outdated = true; + + /* Texture size in pixels. */ + int2 size = make_int2(0, 0); + } texture_state_; + + /* State of the texture buffer. Is tracked to perform sanity checks. */ + struct { + /* True when the texture buffer is mapped with `map_texture_buffer()`. */ + bool is_mapped = false; + } texture_buffer_state_; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/render/graph.h b/intern/cycles/render/graph.h index 5102b182593..3584754fad1 100644 --- a/intern/cycles/render/graph.h +++ b/intern/cycles/render/graph.h @@ -224,10 +224,6 @@ class ShaderNode : public Node { { return false; } - virtual bool has_raytrace() - { - return false; - } vector<ShaderInput *> inputs; vector<ShaderOutput *> outputs; @@ -242,22 +238,13 @@ class ShaderNode : public Node { * that those functions are for selective compilation only? */ - /* Nodes are split into several groups, group of level 0 contains - * nodes which are most commonly used, further levels are extension - * of previous one and includes less commonly used nodes. - */ - virtual int get_group() - { - return NODE_GROUP_LEVEL_0; - } - /* Node feature are used to disable huge nodes inside the group, * so it's possible to disable huge nodes inside of the required * nodes group. */ virtual int get_feature() { - return bump == SHADER_BUMP_NONE ? 0 : NODE_FEATURE_BUMP; + return bump == SHADER_BUMP_NONE ? 
0 : KERNEL_FEATURE_NODE_BUMP; } /* Get closure ID to which the node compiles into. */ diff --git a/intern/cycles/render/integrator.cpp b/intern/cycles/render/integrator.cpp index d8749cec9fa..d74d14242bb 100644 --- a/intern/cycles/render/integrator.cpp +++ b/intern/cycles/render/integrator.cpp @@ -53,6 +53,8 @@ NODE_DEFINE(Integrator) SOCKET_INT(transparent_max_bounce, "Transparent Max Bounce", 7); SOCKET_INT(ao_bounces, "AO Bounces", 0); + SOCKET_FLOAT(ao_factor, "AO Factor", 0.0f); + SOCKET_FLOAT(ao_distance, "AO Distance", FLT_MAX); SOCKET_INT(volume_max_steps, "Volume Max Steps", 1024); SOCKET_FLOAT(volume_step_rate, "Volume Step Rate", 1.0f); @@ -66,33 +68,39 @@ NODE_DEFINE(Integrator) SOCKET_BOOLEAN(motion_blur, "Motion Blur", false); SOCKET_INT(aa_samples, "AA Samples", 0); - SOCKET_INT(diffuse_samples, "Diffuse Samples", 1); - SOCKET_INT(glossy_samples, "Glossy Samples", 1); - SOCKET_INT(transmission_samples, "Transmission Samples", 1); - SOCKET_INT(ao_samples, "AO Samples", 1); - SOCKET_INT(mesh_light_samples, "Mesh Light Samples", 1); - SOCKET_INT(subsurface_samples, "Subsurface Samples", 1); - SOCKET_INT(volume_samples, "Volume Samples", 1); SOCKET_INT(start_sample, "Start Sample", 0); + SOCKET_BOOLEAN(use_adaptive_sampling, "Use Adaptive Sampling", false); SOCKET_FLOAT(adaptive_threshold, "Adaptive Threshold", 0.0f); SOCKET_INT(adaptive_min_samples, "Adaptive Min Samples", 0); - SOCKET_BOOLEAN(sample_all_lights_direct, "Sample All Lights Direct", true); - SOCKET_BOOLEAN(sample_all_lights_indirect, "Sample All Lights Indirect", true); SOCKET_FLOAT(light_sampling_threshold, "Light Sampling Threshold", 0.05f); - static NodeEnum method_enum; - method_enum.insert("path", PATH); - method_enum.insert("branched_path", BRANCHED_PATH); - SOCKET_ENUM(method, "Method", method_enum, PATH); - static NodeEnum sampling_pattern_enum; sampling_pattern_enum.insert("sobol", SAMPLING_PATTERN_SOBOL); - sampling_pattern_enum.insert("cmj", SAMPLING_PATTERN_CMJ); 
sampling_pattern_enum.insert("pmj", SAMPLING_PATTERN_PMJ); SOCKET_ENUM(sampling_pattern, "Sampling Pattern", sampling_pattern_enum, SAMPLING_PATTERN_SOBOL); + static NodeEnum denoiser_type_enum; + denoiser_type_enum.insert("optix", DENOISER_OPTIX); + denoiser_type_enum.insert("openimagedenoise", DENOISER_OPENIMAGEDENOISE); + + static NodeEnum denoiser_prefilter_enum; + denoiser_prefilter_enum.insert("none", DENOISER_PREFILTER_NONE); + denoiser_prefilter_enum.insert("fast", DENOISER_PREFILTER_FAST); + denoiser_prefilter_enum.insert("accurate", DENOISER_PREFILTER_ACCURATE); + + /* Default to accurate denoising with OpenImageDenoise. For interactive viewport + * it's best use OptiX and disable the normal pass since it does not always have + * the desired effect for that denoiser. */ + SOCKET_BOOLEAN(use_denoise, "Use Denoiser", false); + SOCKET_ENUM(denoiser_type, "Denoiser Type", denoiser_type_enum, DENOISER_OPENIMAGEDENOISE); + SOCKET_INT(denoise_start_sample, "Start Sample to Denoise", 0); + SOCKET_BOOLEAN(use_denoise_pass_albedo, "Use Albedo Pass for Denoiser", true); + SOCKET_BOOLEAN(use_denoise_pass_normal, "Use Normal Pass for Denoiser", true); + SOCKET_ENUM( + denoiser_prefilter, "Denoiser Type", denoiser_prefilter_enum, DENOISER_PREFILTER_ACCURATE); + return type; } @@ -115,13 +123,20 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene } }); - const bool need_update_lut = ao_samples_is_modified() || diffuse_samples_is_modified() || - glossy_samples_is_modified() || max_bounce_is_modified() || - max_transmission_bounce_is_modified() || - mesh_light_samples_is_modified() || method_is_modified() || - sampling_pattern_is_modified() || - subsurface_samples_is_modified() || - transmission_samples_is_modified() || volume_samples_is_modified(); + KernelIntegrator *kintegrator = &dscene->data.integrator; + + /* Adaptive sampling requires PMJ samples. 
+ * + * This also makes detection of sampling pattern a bit more involved: can not rely on the changed + * state of socket, since its value might be different from the effective value used here. So + * instead compare with previous value in the KernelIntegrator. Only do it if the device was + * updated once (in which case the `sample_pattern_lut` will be allocated to a non-zero size). */ + const SamplingPattern new_sampling_pattern = (use_adaptive_sampling) ? SAMPLING_PATTERN_PMJ : + sampling_pattern; + + const bool need_update_lut = max_bounce_is_modified() || max_transmission_bounce_is_modified() || + dscene->sample_pattern_lut.size() == 0 || + kintegrator->sampling_pattern != new_sampling_pattern; if (need_update_lut) { dscene->sample_pattern_lut.tag_realloc(); @@ -129,8 +144,6 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene device_free(device, dscene); - KernelIntegrator *kintegrator = &dscene->data.integrator; - /* integrator parameters */ kintegrator->min_bounce = min_bounce + 1; kintegrator->max_bounce = max_bounce + 1; @@ -143,12 +156,9 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene kintegrator->transparent_min_bounce = transparent_min_bounce + 1; kintegrator->transparent_max_bounce = transparent_max_bounce + 1; - if (ao_bounces == 0) { - kintegrator->ao_bounces = INT_MAX; - } - else { - kintegrator->ao_bounces = ao_bounces - 1; - } + kintegrator->ao_bounces = ao_bounces; + kintegrator->ao_bounces_distance = ao_distance; + kintegrator->ao_bounces_factor = ao_factor; /* Transparent Shadows * We only need to enable transparent shadows, if we actually have @@ -171,10 +181,7 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene kintegrator->caustics_refractive = caustics_refractive; kintegrator->filter_glossy = (filter_glossy == 0.0f) ? 
FLT_MAX : 1.0f / filter_glossy; - kintegrator->seed = hash_uint2(seed, 0); - - kintegrator->use_ambient_occlusion = ((Pass::contains(scene->passes, PASS_AO)) || - dscene->data.background.ao_factor != 0.0f); + kintegrator->seed = seed; kintegrator->sample_clamp_direct = (sample_clamp_direct == 0.0f) ? FLT_MAX : sample_clamp_direct * 3.0f; @@ -182,51 +189,7 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene FLT_MAX : sample_clamp_indirect * 3.0f; - kintegrator->branched = (method == BRANCHED_PATH) && device->info.has_branched_path; - kintegrator->volume_decoupled = device->info.has_volume_decoupled; - kintegrator->diffuse_samples = diffuse_samples; - kintegrator->glossy_samples = glossy_samples; - kintegrator->transmission_samples = transmission_samples; - kintegrator->ao_samples = ao_samples; - kintegrator->mesh_light_samples = mesh_light_samples; - kintegrator->subsurface_samples = subsurface_samples; - kintegrator->volume_samples = volume_samples; - kintegrator->start_sample = start_sample; - - if (kintegrator->branched) { - kintegrator->sample_all_lights_direct = sample_all_lights_direct; - kintegrator->sample_all_lights_indirect = sample_all_lights_indirect; - } - else { - kintegrator->sample_all_lights_direct = false; - kintegrator->sample_all_lights_indirect = false; - } - - kintegrator->sampling_pattern = sampling_pattern; - kintegrator->aa_samples = aa_samples; - if (aa_samples > 0 && adaptive_min_samples == 0) { - kintegrator->adaptive_min_samples = max(4, (int)sqrtf(aa_samples)); - VLOG(1) << "Cycles adaptive sampling: automatic min samples = " - << kintegrator->adaptive_min_samples; - } - else { - kintegrator->adaptive_min_samples = max(4, adaptive_min_samples); - } - - kintegrator->adaptive_step = 4; - kintegrator->adaptive_stop_per_sample = device->info.has_adaptive_stop_per_sample; - - /* Adaptive step must be a power of two for bitwise operations to work. 
*/ - assert((kintegrator->adaptive_step & (kintegrator->adaptive_step - 1)) == 0); - - if (aa_samples > 0 && adaptive_threshold == 0.0f) { - kintegrator->adaptive_threshold = max(0.001f, 1.0f / (float)aa_samples); - VLOG(1) << "Cycles adaptive sampling: automatic threshold = " - << kintegrator->adaptive_threshold; - } - else { - kintegrator->adaptive_threshold = adaptive_threshold; - } + kintegrator->sampling_pattern = new_sampling_pattern; if (light_sampling_threshold > 0.0f) { kintegrator->light_inv_rr_threshold = 1.0f / light_sampling_threshold; @@ -236,29 +199,15 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene } /* sobol directions table */ - int max_samples = 1; - - if (kintegrator->branched) { - foreach (Light *light, scene->lights) - max_samples = max(max_samples, light->get_samples()); - - max_samples = max(max_samples, - max(diffuse_samples, max(glossy_samples, transmission_samples))); - max_samples = max(max_samples, max(ao_samples, max(mesh_light_samples, subsurface_samples))); - max_samples = max(max_samples, volume_samples); - } - - uint total_bounces = max_bounce + transparent_max_bounce + 3 + VOLUME_BOUNDS_MAX + - max(BSSRDF_MAX_HITS, BSSRDF_MAX_BOUNCES); - - max_samples *= total_bounces; + int max_samples = max_bounce + transparent_max_bounce + 3 + VOLUME_BOUNDS_MAX + + max(BSSRDF_MAX_HITS, BSSRDF_MAX_BOUNCES); int dimensions = PRNG_BASE_NUM + max_samples * PRNG_BOUNCE_NUM; dimensions = min(dimensions, SOBOL_MAX_DIMENSIONS); if (need_update_lut) { - if (sampling_pattern == SAMPLING_PATTERN_SOBOL) { - uint *directions = dscene->sample_pattern_lut.alloc(SOBOL_BITS * dimensions); + if (kintegrator->sampling_pattern == SAMPLING_PATTERN_SOBOL) { + uint *directions = (uint *)dscene->sample_pattern_lut.alloc(SOBOL_BITS * dimensions); sobol_generate_direction_vectors((uint(*)[SOBOL_BITS])directions, dimensions); @@ -276,10 +225,13 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene 
function_bind(&progressive_multi_jitter_02_generate_2D, sequence, sequence_size, j)); } pool.wait_work(); + dscene->sample_pattern_lut.copy_to_device(); } } + kintegrator->has_shadow_catcher = scene->has_shadow_catcher(); + dscene->sample_pattern_lut.clear_modified(); clear_modified(); } @@ -295,17 +247,12 @@ void Integrator::tag_update(Scene *scene, uint32_t flag) tag_modified(); } - if (flag & (AO_PASS_MODIFIED | BACKGROUND_AO_MODIFIED)) { + if (flag & AO_PASS_MODIFIED) { /* tag only the ao_bounces socket as modified so we avoid updating sample_pattern_lut * unnecessarily */ tag_ao_bounces_modified(); } - if ((flag & LIGHT_SAMPLES_MODIFIED) && (method == BRANCHED_PATH)) { - /* the number of light samples may affect the size of the sample_pattern_lut */ - tag_sampling_pattern_modified(); - } - if (filter_glossy_is_modified()) { foreach (Shader *shader, scene->shaders) { if (shader->has_integrator_dependency) { @@ -321,4 +268,65 @@ void Integrator::tag_update(Scene *scene, uint32_t flag) } } +AdaptiveSampling Integrator::get_adaptive_sampling() const +{ + AdaptiveSampling adaptive_sampling; + + adaptive_sampling.use = use_adaptive_sampling; + + if (!adaptive_sampling.use) { + return adaptive_sampling; + } + + if (aa_samples > 0 && adaptive_threshold == 0.0f) { + adaptive_sampling.threshold = max(0.001f, 1.0f / (float)aa_samples); + VLOG(1) << "Cycles adaptive sampling: automatic threshold = " << adaptive_sampling.threshold; + } + else { + adaptive_sampling.threshold = adaptive_threshold; + } + + if (adaptive_sampling.threshold > 0 && adaptive_min_samples == 0) { + /* Threshold 0.1 -> 32, 0.01 -> 64, 0.001 -> 128. + * This is highly scene dependent, we make a guess that seemed to work well + * in various test scenes. 
*/ + const int min_samples = (int)ceilf(16.0f / powf(adaptive_sampling.threshold, 0.3f)); + adaptive_sampling.min_samples = max(4, min_samples); + VLOG(1) << "Cycles adaptive sampling: automatic min samples = " + << adaptive_sampling.min_samples; + } + else { + adaptive_sampling.min_samples = max(4, adaptive_min_samples); + } + + /* Arbitrary factor that makes the threshold more similar to what is was before, + * and gives arguably more intuitive values. */ + adaptive_sampling.threshold *= 5.0f; + + adaptive_sampling.adaptive_step = 16; + + DCHECK(is_power_of_two(adaptive_sampling.adaptive_step)) + << "Adaptive step must be a power of two for bitwise operations to work"; + + return adaptive_sampling; +} + +DenoiseParams Integrator::get_denoise_params() const +{ + DenoiseParams denoise_params; + + denoise_params.use = use_denoise; + + denoise_params.type = denoiser_type; + + denoise_params.start_sample = denoise_start_sample; + + denoise_params.use_pass_albedo = use_denoise_pass_albedo; + denoise_params.use_pass_normal = use_denoise_pass_normal; + + denoise_params.prefilter = denoiser_prefilter; + + return denoise_params; +} + CCL_NAMESPACE_END diff --git a/intern/cycles/render/integrator.h b/intern/cycles/render/integrator.h index 4eeeda92d41..32e108d62ca 100644 --- a/intern/cycles/render/integrator.h +++ b/intern/cycles/render/integrator.h @@ -19,7 +19,9 @@ #include "kernel/kernel_types.h" +#include "device/device_denoise.h" /* For the paramaters and type enum. 
*/ #include "graph/node.h" +#include "integrator/adaptive_sampling.h" CCL_NAMESPACE_BEGIN @@ -43,6 +45,8 @@ class Integrator : public Node { NODE_SOCKET_API(int, transparent_max_bounce) NODE_SOCKET_API(int, ao_bounces) + NODE_SOCKET_API(float, ao_factor) + NODE_SOCKET_API(float, ao_distance) NODE_SOCKET_API(int, volume_max_steps) NODE_SOCKET_API(float, volume_step_rate) @@ -62,37 +66,26 @@ class Integrator : public Node { static const int MAX_SAMPLES = (1 << 24); NODE_SOCKET_API(int, aa_samples) - NODE_SOCKET_API(int, diffuse_samples) - NODE_SOCKET_API(int, glossy_samples) - NODE_SOCKET_API(int, transmission_samples) - NODE_SOCKET_API(int, ao_samples) - NODE_SOCKET_API(int, mesh_light_samples) - NODE_SOCKET_API(int, subsurface_samples) - NODE_SOCKET_API(int, volume_samples) NODE_SOCKET_API(int, start_sample) - NODE_SOCKET_API(bool, sample_all_lights_direct) - NODE_SOCKET_API(bool, sample_all_lights_indirect) NODE_SOCKET_API(float, light_sampling_threshold) + NODE_SOCKET_API(bool, use_adaptive_sampling) NODE_SOCKET_API(int, adaptive_min_samples) NODE_SOCKET_API(float, adaptive_threshold) - enum Method { - BRANCHED_PATH = 0, - PATH = 1, - - NUM_METHODS, - }; - - NODE_SOCKET_API(Method, method) - NODE_SOCKET_API(SamplingPattern, sampling_pattern) + NODE_SOCKET_API(bool, use_denoise); + NODE_SOCKET_API(DenoiserType, denoiser_type); + NODE_SOCKET_API(int, denoise_start_sample); + NODE_SOCKET_API(bool, use_denoise_pass_albedo); + NODE_SOCKET_API(bool, use_denoise_pass_normal); + NODE_SOCKET_API(DenoiserPrefilter, denoiser_prefilter); + enum : uint32_t { AO_PASS_MODIFIED = (1 << 0), - BACKGROUND_AO_MODIFIED = (1 << 1), - LIGHT_SAMPLES_MODIFIED = (1 << 2), + OBJECT_MANAGER = (1 << 1), /* tag everything in the manager for an update */ UPDATE_ALL = ~0u, @@ -107,6 +100,9 @@ class Integrator : public Node { void device_free(Device *device, DeviceScene *dscene, bool force_free = false); void tag_update(Scene *scene, uint32_t flag); + + AdaptiveSampling get_adaptive_sampling() 
const; + DenoiseParams get_denoise_params() const; }; CCL_NAMESPACE_END diff --git a/intern/cycles/render/jitter.cpp b/intern/cycles/render/jitter.cpp index fc47b0e8f0a..e31f8abd446 100644 --- a/intern/cycles/render/jitter.cpp +++ b/intern/cycles/render/jitter.cpp @@ -242,12 +242,6 @@ class PMJ02_Generator : public PMJ_Generator { static void shuffle(float2 points[], int size, int rng_seed) { - /* Offset samples by 1.0 for faster scrambling in kernel_random.h */ - for (int i = 0; i < size; ++i) { - points[i].x += 1.0f; - points[i].y += 1.0f; - } - if (rng_seed == 0) { return; } diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp index 15aa4e047b5..ae1150fc07b 100644 --- a/intern/cycles/render/light.cpp +++ b/intern/cycles/render/light.cpp @@ -14,12 +14,13 @@ * limitations under the License. */ -#include "render/light.h" #include "device/device.h" + #include "render/background.h" #include "render/film.h" #include "render/graph.h" #include "render/integrator.h" +#include "render/light.h" #include "render/mesh.h" #include "render/nodes.h" #include "render/object.h" @@ -27,6 +28,8 @@ #include "render/shader.h" #include "render/stats.h" +#include "integrator/shader_eval.h" + #include "util/util_foreach.h" #include "util/util_hash.h" #include "util/util_logging.h" @@ -43,63 +46,49 @@ static void shade_background_pixels(Device *device, vector<float3> &pixels, Progress &progress) { - /* create input */ - device_vector<uint4> d_input(device, "background_input", MEM_READ_ONLY); - device_vector<float4> d_output(device, "background_output", MEM_READ_WRITE); - - uint4 *d_input_data = d_input.alloc(width * height); - - for (int y = 0; y < height; y++) { - for (int x = 0; x < width; x++) { - float u = (x + 0.5f) / width; - float v = (y + 0.5f) / height; - - uint4 in = make_uint4(__float_as_int(u), __float_as_int(v), 0, 0); - d_input_data[x + y * width] = in; - } - } - - /* compute on device */ - d_output.alloc(width * height); - d_output.zero_to_device(); 
- d_input.copy_to_device(); - + /* Needs to be up to data for attribute access. */ device->const_copy_to("__data", &dscene->data, sizeof(dscene->data)); - DeviceTask main_task(DeviceTask::SHADER); - main_task.shader_input = d_input.device_pointer; - main_task.shader_output = d_output.device_pointer; - main_task.shader_eval_type = SHADER_EVAL_BACKGROUND; - main_task.shader_x = 0; - main_task.shader_w = width * height; - main_task.num_samples = 1; - main_task.get_cancel = function_bind(&Progress::get_cancel, &progress); - - /* disabled splitting for now, there's an issue with multi-GPU mem_copy_from */ - list<DeviceTask> split_tasks; - main_task.split(split_tasks, 1, 128 * 128); - - foreach (DeviceTask &task, split_tasks) { - device->task_add(task); - device->task_wait(); - d_output.copy_from_device(task.shader_x, 1, task.shader_w); - } - - d_input.free(); - - float4 *d_output_data = d_output.data(); - - pixels.resize(width * height); - - for (int y = 0; y < height; y++) { - for (int x = 0; x < width; x++) { - pixels[y * width + x].x = d_output_data[y * width + x].x; - pixels[y * width + x].y = d_output_data[y * width + x].y; - pixels[y * width + x].z = d_output_data[y * width + x].z; - } - } + const int size = width * height; + pixels.resize(size); + + /* Evaluate shader on device. */ + ShaderEval shader_eval(device, progress); + shader_eval.eval( + SHADER_EVAL_BACKGROUND, + size, + [&](device_vector<KernelShaderEvalInput> &d_input) { + /* Fill coordinates for shading. */ + KernelShaderEvalInput *d_input_data = d_input.data(); + + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + float u = (x + 0.5f) / width; + float v = (y + 0.5f) / height; + + KernelShaderEvalInput in; + in.object = OBJECT_NONE; + in.prim = PRIM_NONE; + in.u = u; + in.v = v; + d_input_data[x + y * width] = in; + } + } - d_output.free(); + return size; + }, + [&](device_vector<float4> &d_output) { + /* Copy output to pixel buffer. 
*/ + float4 *d_output_data = d_output.data(); + + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + pixels[y * width + x].x = d_output_data[y * width + x].x; + pixels[y * width + x].y = d_output_data[y * width + x].y; + pixels[y * width + x].z = d_output_data[y * width + x].z; + } + } + }); } /* Light */ @@ -140,15 +129,16 @@ NODE_DEFINE(Light) SOCKET_BOOLEAN(cast_shadow, "Cast Shadow", true); SOCKET_BOOLEAN(use_mis, "Use Mis", false); + SOCKET_BOOLEAN(use_camera, "Use Camera", true); SOCKET_BOOLEAN(use_diffuse, "Use Diffuse", true); SOCKET_BOOLEAN(use_glossy, "Use Glossy", true); SOCKET_BOOLEAN(use_transmission, "Use Transmission", true); SOCKET_BOOLEAN(use_scatter, "Use Scatter", true); - SOCKET_INT(samples, "Samples", 1); SOCKET_INT(max_bounces, "Max Bounces", 1024); SOCKET_UINT(random_id, "Random ID", 0); + SOCKET_BOOLEAN(is_shadow_catcher, "Shadow Catcher", true); SOCKET_BOOLEAN(is_portal, "Is Portal", false); SOCKET_BOOLEAN(is_enabled, "Is Enabled", true); @@ -166,10 +156,6 @@ void Light::tag_update(Scene *scene) { if (is_modified()) { scene->light_manager->tag_update(scene, LightManager::LIGHT_MODIFIED); - - if (samples_is_modified()) { - scene->integrator->tag_update(scene, Integrator::LIGHT_SAMPLES_MODIFIED); - } } } @@ -193,7 +179,6 @@ LightManager::LightManager() { update_flags = UPDATE_ALL; need_update_background = true; - use_light_visibility = false; last_background_enabled = false; last_background_resolution = 0; } @@ -357,21 +342,23 @@ void LightManager::device_update_distribution(Device *, int object_id = j; int shader_flag = 0; + if (!(object->get_visibility() & PATH_RAY_CAMERA)) { + shader_flag |= SHADER_EXCLUDE_CAMERA; + } if (!(object->get_visibility() & PATH_RAY_DIFFUSE)) { shader_flag |= SHADER_EXCLUDE_DIFFUSE; - use_light_visibility = true; } if (!(object->get_visibility() & PATH_RAY_GLOSSY)) { shader_flag |= SHADER_EXCLUDE_GLOSSY; - use_light_visibility = true; } if (!(object->get_visibility() & PATH_RAY_TRANSMIT)) { 
shader_flag |= SHADER_EXCLUDE_TRANSMIT; - use_light_visibility = true; } if (!(object->get_visibility() & PATH_RAY_VOLUME_SCATTER)) { shader_flag |= SHADER_EXCLUDE_SCATTER; - use_light_visibility = true; + } + if (!(object->get_is_shadow_catcher())) { + shader_flag |= SHADER_EXCLUDE_SHADOW_CATCHER; } size_t mesh_num_triangles = mesh->num_triangles(); @@ -496,10 +483,10 @@ void LightManager::device_update_distribution(Device *, kfilm->pass_shadow_scale = 1.0f; if (kintegrator->pdf_triangles != 0.0f) - kfilm->pass_shadow_scale *= 0.5f; + kfilm->pass_shadow_scale /= 0.5f; if (num_background_lights < num_lights) - kfilm->pass_shadow_scale *= (float)(num_lights - num_background_lights) / (float)num_lights; + kfilm->pass_shadow_scale /= (float)(num_lights - num_background_lights) / (float)num_lights; /* CDF */ dscene->light_distribution.copy_to_device(); @@ -766,25 +753,26 @@ void LightManager::device_update_points(Device *, DeviceScene *dscene, Scene *sc if (!light->cast_shadow) shader_id &= ~SHADER_CAST_SHADOW; + if (!light->use_camera) { + shader_id |= SHADER_EXCLUDE_CAMERA; + } if (!light->use_diffuse) { shader_id |= SHADER_EXCLUDE_DIFFUSE; - use_light_visibility = true; } if (!light->use_glossy) { shader_id |= SHADER_EXCLUDE_GLOSSY; - use_light_visibility = true; } if (!light->use_transmission) { shader_id |= SHADER_EXCLUDE_TRANSMIT; - use_light_visibility = true; } if (!light->use_scatter) { shader_id |= SHADER_EXCLUDE_SCATTER; - use_light_visibility = true; + } + if (!light->is_shadow_catcher) { + shader_id |= SHADER_EXCLUDE_SHADOW_CATCHER; } klights[light_index].type = light->light_type; - klights[light_index].samples = light->samples; klights[light_index].strength[0] = light->strength.x; klights[light_index].strength[1] = light->strength.y; klights[light_index].strength[2] = light->strength.z; @@ -836,19 +824,15 @@ void LightManager::device_update_points(Device *, DeviceScene *dscene, Scene *sc if (!(visibility & PATH_RAY_DIFFUSE)) { shader_id |= 
SHADER_EXCLUDE_DIFFUSE; - use_light_visibility = true; } if (!(visibility & PATH_RAY_GLOSSY)) { shader_id |= SHADER_EXCLUDE_GLOSSY; - use_light_visibility = true; } if (!(visibility & PATH_RAY_TRANSMIT)) { shader_id |= SHADER_EXCLUDE_TRANSMIT; - use_light_visibility = true; } if (!(visibility & PATH_RAY_VOLUME_SCATTER)) { shader_id |= SHADER_EXCLUDE_SCATTER; - use_light_visibility = true; } } else if (light->light_type == LIGHT_AREA) { @@ -998,8 +982,6 @@ void LightManager::device_update(Device *device, device_free(device, dscene, need_update_background); - use_light_visibility = false; - device_update_points(device, dscene, scene); if (progress.get_cancel()) return; @@ -1018,8 +1000,6 @@ void LightManager::device_update(Device *device, if (progress.get_cancel()) return; - scene->film->set_use_light_visibility(use_light_visibility); - update_flags = UPDATE_NONE; need_update_background = false; } diff --git a/intern/cycles/render/light.h b/intern/cycles/render/light.h index fbd709125ff..7f86237c8b3 100644 --- a/intern/cycles/render/light.h +++ b/intern/cycles/render/light.h @@ -69,16 +69,17 @@ class Light : public Node { NODE_SOCKET_API(bool, cast_shadow) NODE_SOCKET_API(bool, use_mis) + NODE_SOCKET_API(bool, use_camera) NODE_SOCKET_API(bool, use_diffuse) NODE_SOCKET_API(bool, use_glossy) NODE_SOCKET_API(bool, use_transmission) NODE_SOCKET_API(bool, use_scatter) + NODE_SOCKET_API(bool, is_shadow_catcher) NODE_SOCKET_API(bool, is_portal) NODE_SOCKET_API(bool, is_enabled) NODE_SOCKET_API(Shader *, shader) - NODE_SOCKET_API(int, samples) NODE_SOCKET_API(int, max_bounces) NODE_SOCKET_API(uint, random_id) @@ -108,8 +109,6 @@ class LightManager { UPDATE_NONE = 0u, }; - bool use_light_visibility; - /* Need to update background (including multiple importance map) */ bool need_update_background; diff --git a/intern/cycles/render/mesh_displace.cpp b/intern/cycles/render/mesh_displace.cpp index b39d81023d9..c00c4c24211 100644 --- a/intern/cycles/render/mesh_displace.cpp +++ 
b/intern/cycles/render/mesh_displace.cpp @@ -16,6 +16,8 @@ #include "device/device.h" +#include "integrator/shader_eval.h" + #include "render/mesh.h" #include "render/object.h" #include "render/scene.h" @@ -43,40 +45,28 @@ static float3 compute_face_normal(const Mesh::Triangle &t, float3 *verts) return norm / normlen; } -bool GeometryManager::displace( - Device *device, DeviceScene *dscene, Scene *scene, Mesh *mesh, Progress &progress) +/* Fill in coordinates for mesh displacement shader evaluation on device. */ +static int fill_shader_input(const Scene *scene, + const Mesh *mesh, + const int object_index, + device_vector<KernelShaderEvalInput> &d_input) { - /* verify if we have a displacement shader */ - if (!mesh->has_true_displacement()) { - return false; - } - - string msg = string_printf("Computing Displacement %s", mesh->name.c_str()); - progress.set_status("Updating Mesh", msg); + int d_input_size = 0; + KernelShaderEvalInput *d_input_data = d_input.data(); - /* find object index. 
todo: is arbitrary */ - size_t object_index = OBJECT_NONE; + const array<int> &mesh_shaders = mesh->get_shader(); + const array<Node *> &mesh_used_shaders = mesh->get_used_shaders(); + const array<float3> &mesh_verts = mesh->get_verts(); - for (size_t i = 0; i < scene->objects.size(); i++) { - if (scene->objects[i]->get_geometry() == mesh) { - object_index = i; - break; - } - } - - /* setup input for device task */ - const size_t num_verts = mesh->verts.size(); + const int num_verts = mesh_verts.size(); vector<bool> done(num_verts, false); - device_vector<uint4> d_input(device, "displace_input", MEM_READ_ONLY); - uint4 *d_input_data = d_input.alloc(num_verts); - size_t d_input_size = 0; - size_t num_triangles = mesh->num_triangles(); - for (size_t i = 0; i < num_triangles; i++) { + int num_triangles = mesh->num_triangles(); + for (int i = 0; i < num_triangles; i++) { Mesh::Triangle t = mesh->get_triangle(i); - int shader_index = mesh->shader[i]; - Shader *shader = (shader_index < mesh->used_shaders.size()) ? - static_cast<Shader *>(mesh->used_shaders[shader_index]) : + int shader_index = mesh_shaders[i]; + Shader *shader = (shader_index < mesh_used_shaders.size()) ? 
+ static_cast<Shader *>(mesh_used_shaders[shader_index]) : scene->default_surface; if (!shader->has_displacement || shader->get_displacement_method() == DISPLACE_BUMP) { @@ -110,57 +100,41 @@ bool GeometryManager::displace( } /* back */ - uint4 in = make_uint4(object, prim, __float_as_int(u), __float_as_int(v)); + KernelShaderEvalInput in; + in.object = object; + in.prim = prim; + in.u = u; + in.v = v; d_input_data[d_input_size++] = in; } } - if (d_input_size == 0) - return false; - - /* run device task */ - device_vector<float4> d_output(device, "displace_output", MEM_READ_WRITE); - d_output.alloc(d_input_size); - d_output.zero_to_device(); - d_input.copy_to_device(); - - /* needs to be up to data for attribute access */ - device->const_copy_to("__data", &dscene->data, sizeof(dscene->data)); - - DeviceTask task(DeviceTask::SHADER); - task.shader_input = d_input.device_pointer; - task.shader_output = d_output.device_pointer; - task.shader_eval_type = SHADER_EVAL_DISPLACE; - task.shader_x = 0; - task.shader_w = d_output.size(); - task.num_samples = 1; - task.get_cancel = function_bind(&Progress::get_cancel, &progress); - - device->task_add(task); - device->task_wait(); - - if (progress.get_cancel()) { - d_input.free(); - d_output.free(); - return false; - } + return d_input_size; +} - d_output.copy_from_device(0, 1, d_output.size()); - d_input.free(); +/* Read back mesh displacement shader output. 
*/ +static void read_shader_output(const Scene *scene, + Mesh *mesh, + const device_vector<float4> &d_output) +{ + const array<int> &mesh_shaders = mesh->get_shader(); + const array<Node *> &mesh_used_shaders = mesh->get_used_shaders(); + array<float3> &mesh_verts = mesh->get_verts(); - /* read result */ - done.clear(); - done.resize(num_verts, false); - int k = 0; + const int num_verts = mesh_verts.size(); + const int num_motion_steps = mesh->get_motion_steps(); + vector<bool> done(num_verts, false); - float4 *offset = d_output.data(); + const float4 *d_output_data = d_output.data(); + int d_output_index = 0; Attribute *attr_mP = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); - for (size_t i = 0; i < num_triangles; i++) { + int num_triangles = mesh->num_triangles(); + for (int i = 0; i < num_triangles; i++) { Mesh::Triangle t = mesh->get_triangle(i); - int shader_index = mesh->shader[i]; - Shader *shader = (shader_index < mesh->used_shaders.size()) ? - static_cast<Shader *>(mesh->used_shaders[shader_index]) : + int shader_index = mesh_shaders[i]; + Shader *shader = (shader_index < mesh_used_shaders.size()) ? + static_cast<Shader *>(mesh_used_shaders[shader_index]) : scene->default_surface; if (!shader->has_displacement || shader->get_displacement_method() == DISPLACE_BUMP) { @@ -170,12 +144,12 @@ bool GeometryManager::displace( for (int j = 0; j < 3; j++) { if (!done[t.v[j]]) { done[t.v[j]] = true; - float3 off = float4_to_float3(offset[k++]); + float3 off = float4_to_float3(d_output_data[d_output_index++]); /* Avoid illegal vertex coordinates. 
*/ off = ensure_finite3(off); - mesh->verts[t.v[j]] += off; + mesh_verts[t.v[j]] += off; if (attr_mP != NULL) { - for (int step = 0; step < mesh->motion_steps - 1; step++) { + for (int step = 0; step < num_motion_steps - 1; step++) { float3 *mP = attr_mP->data_float3() + step * num_verts; mP[t.v[j]] += off; } @@ -183,8 +157,47 @@ bool GeometryManager::displace( } } } +} - d_output.free(); +bool GeometryManager::displace( + Device *device, DeviceScene *dscene, Scene *scene, Mesh *mesh, Progress &progress) +{ + /* verify if we have a displacement shader */ + if (!mesh->has_true_displacement()) { + return false; + } + + const size_t num_verts = mesh->verts.size(); + const size_t num_triangles = mesh->num_triangles(); + + if (num_triangles == 0) { + return false; + } + + string msg = string_printf("Computing Displacement %s", mesh->name.c_str()); + progress.set_status("Updating Mesh", msg); + + /* find object index. todo: is arbitrary */ + size_t object_index = OBJECT_NONE; + + for (size_t i = 0; i < scene->objects.size(); i++) { + if (scene->objects[i]->get_geometry() == mesh) { + object_index = i; + break; + } + } + + /* Needs to be up to data for attribute access. */ + device->const_copy_to("__data", &dscene->data, sizeof(dscene->data)); + + /* Evaluate shader on device. 
*/ + ShaderEval shader_eval(device, progress); + if (!shader_eval.eval(SHADER_EVAL_DISPLACE, + num_verts, + function_bind(&fill_shader_input, scene, mesh, object_index, _1), + function_bind(&read_shader_output, scene, mesh, _1))) { + return false; + } /* stitch */ unordered_set<int> stitch_keys; @@ -297,8 +310,7 @@ bool GeometryManager::displace( } /* normalize vertex normals */ - done.clear(); - done.resize(num_verts, false); + vector<bool> done(num_verts, false); for (size_t i = 0; i < num_triangles; i++) { if (tri_has_true_disp[i]) { @@ -368,8 +380,7 @@ bool GeometryManager::displace( } /* normalize vertex normals */ - done.clear(); - done.resize(num_verts, false); + vector<bool> done(num_verts, false); for (size_t i = 0; i < num_triangles; i++) { if (tri_has_true_disp[i]) { diff --git a/intern/cycles/render/nodes.cpp b/intern/cycles/render/nodes.cpp index 795166bcf4c..5303d55242e 100644 --- a/intern/cycles/render/nodes.cpp +++ b/intern/cycles/render/nodes.cpp @@ -2736,18 +2736,21 @@ NODE_DEFINE(PrincipledBsdfNode) distribution, "Distribution", distribution_enum, CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID); static NodeEnum subsurface_method_enum; - subsurface_method_enum.insert("burley", CLOSURE_BSSRDF_PRINCIPLED_ID); - subsurface_method_enum.insert("random_walk", CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID); + subsurface_method_enum.insert("random_walk_fixed_radius", + CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID); + subsurface_method_enum.insert("random_walk", CLOSURE_BSSRDF_RANDOM_WALK_ID); SOCKET_ENUM(subsurface_method, "Subsurface Method", subsurface_method_enum, - CLOSURE_BSSRDF_PRINCIPLED_ID); + CLOSURE_BSSRDF_RANDOM_WALK_ID); SOCKET_IN_COLOR(base_color, "Base Color", make_float3(0.8f, 0.8f, 0.8f)); SOCKET_IN_COLOR(subsurface_color, "Subsurface Color", make_float3(0.8f, 0.8f, 0.8f)); SOCKET_IN_FLOAT(metallic, "Metallic", 0.0f); SOCKET_IN_FLOAT(subsurface, "Subsurface", 0.0f); SOCKET_IN_VECTOR(subsurface_radius, "Subsurface Radius", make_float3(0.1f, 0.1f, 
0.1f)); + SOCKET_IN_FLOAT(subsurface_ior, "Subsurface IOR", 1.4f); + SOCKET_IN_FLOAT(subsurface_anisotropy, "Subsurface Anisotropy", 0.0f); SOCKET_IN_FLOAT(specular, "Specular", 0.0f); SOCKET_IN_FLOAT(roughness, "Roughness", 0.5f); SOCKET_IN_FLOAT(specular_tint, "Specular Tint", 0.0f); @@ -2857,6 +2860,8 @@ void PrincipledBsdfNode::compile(SVMCompiler &compiler, ShaderInput *p_metallic, ShaderInput *p_subsurface, ShaderInput *p_subsurface_radius, + ShaderInput *p_subsurface_ior, + ShaderInput *p_subsurface_anisotropy, ShaderInput *p_specular, ShaderInput *p_roughness, ShaderInput *p_specular_tint, @@ -2896,6 +2901,8 @@ void PrincipledBsdfNode::compile(SVMCompiler &compiler, int transmission_roughness_offset = compiler.stack_assign(p_transmission_roughness); int anisotropic_rotation_offset = compiler.stack_assign(p_anisotropic_rotation); int subsurface_radius_offset = compiler.stack_assign(p_subsurface_radius); + int subsurface_ior_offset = compiler.stack_assign(p_subsurface_ior); + int subsurface_anisotropy_offset = compiler.stack_assign(p_subsurface_anisotropy); compiler.add_node(NODE_CLOSURE_BSDF, compiler.encode_uchar4(closure, @@ -2929,8 +2936,10 @@ void PrincipledBsdfNode::compile(SVMCompiler &compiler, __float_as_int(bc_default.y), __float_as_int(bc_default.z)); - compiler.add_node( - clearcoat_normal_offset, subsurface_radius_offset, SVM_STACK_INVALID, SVM_STACK_INVALID); + compiler.add_node(clearcoat_normal_offset, + subsurface_radius_offset, + subsurface_ior_offset, + subsurface_anisotropy_offset); float3 ss_default = get_float3(subsurface_color_in->socket_type); @@ -2953,6 +2962,8 @@ void PrincipledBsdfNode::compile(SVMCompiler &compiler) input("Metallic"), input("Subsurface"), input("Subsurface Radius"), + input("Subsurface IOR"), + input("Subsurface Anisotropy"), input("Specular"), input("Roughness"), input("Specular Tint"), @@ -3048,16 +3059,16 @@ NODE_DEFINE(SubsurfaceScatteringNode) SOCKET_IN_NORMAL(normal, "Normal", zero_float3(), 
SocketType::LINK_NORMAL); SOCKET_IN_FLOAT(surface_mix_weight, "SurfaceMixWeight", 0.0f, SocketType::SVM_INTERNAL); - static NodeEnum falloff_enum; - falloff_enum.insert("cubic", CLOSURE_BSSRDF_CUBIC_ID); - falloff_enum.insert("gaussian", CLOSURE_BSSRDF_GAUSSIAN_ID); - falloff_enum.insert("burley", CLOSURE_BSSRDF_BURLEY_ID); - falloff_enum.insert("random_walk", CLOSURE_BSSRDF_RANDOM_WALK_ID); - SOCKET_ENUM(falloff, "Falloff", falloff_enum, CLOSURE_BSSRDF_BURLEY_ID); + static NodeEnum method_enum; + method_enum.insert("random_walk_fixed_radius", CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID); + method_enum.insert("random_walk", CLOSURE_BSSRDF_RANDOM_WALK_ID); + SOCKET_ENUM(method, "Method", method_enum, CLOSURE_BSSRDF_RANDOM_WALK_ID); + SOCKET_IN_FLOAT(scale, "Scale", 0.01f); SOCKET_IN_VECTOR(radius, "Radius", make_float3(0.1f, 0.1f, 0.1f)); - SOCKET_IN_FLOAT(sharpness, "Sharpness", 0.0f); - SOCKET_IN_FLOAT(texture_blur, "Texture Blur", 1.0f); + + SOCKET_IN_FLOAT(subsurface_ior, "IOR", 1.4f); + SOCKET_IN_FLOAT(subsurface_anisotropy, "Anisotropy", 0.0f); SOCKET_OUT_CLOSURE(BSSRDF, "BSSRDF"); @@ -3066,20 +3077,19 @@ NODE_DEFINE(SubsurfaceScatteringNode) SubsurfaceScatteringNode::SubsurfaceScatteringNode() : BsdfNode(get_node_type()) { - closure = falloff; + closure = method; } void SubsurfaceScatteringNode::compile(SVMCompiler &compiler) { - closure = falloff; - BsdfNode::compile( - compiler, input("Scale"), input("Texture Blur"), input("Radius"), input("Sharpness")); + closure = method; + BsdfNode::compile(compiler, input("Scale"), input("IOR"), input("Radius"), input("Anisotropy")); } void SubsurfaceScatteringNode::compile(OSLCompiler &compiler) { - closure = falloff; - compiler.parameter(this, "falloff"); + closure = method; + compiler.parameter(this, "method"); compiler.add(this, "node_subsurface_scattering"); } @@ -3786,20 +3796,6 @@ void GeometryNode::compile(OSLCompiler &compiler) compiler.add(this, "node_geometry"); } -int GeometryNode::get_group() -{ - 
ShaderOutput *out; - int result = ShaderNode::get_group(); - - /* Backfacing uses NODE_LIGHT_PATH */ - out = output("Backfacing"); - if (!out->links.empty()) { - result = max(result, NODE_GROUP_LEVEL_1); - } - - return result; -} - /* TextureCoordinate */ NODE_DEFINE(TextureCoordinateNode) @@ -5926,33 +5922,33 @@ NODE_DEFINE(OutputAOVNode) OutputAOVNode::OutputAOVNode() : ShaderNode(get_node_type()) { special_type = SHADER_SPECIAL_TYPE_OUTPUT_AOV; - slot = -1; + offset = -1; } void OutputAOVNode::simplify_settings(Scene *scene) { - slot = scene->film->get_aov_offset(scene, name.string(), is_color); - if (slot == -1) { - slot = scene->film->get_aov_offset(scene, name.string(), is_color); + offset = scene->film->get_aov_offset(scene, name.string(), is_color); + if (offset == -1) { + offset = scene->film->get_aov_offset(scene, name.string(), is_color); } - if (slot == -1 || is_color) { + if (offset == -1 || is_color) { input("Value")->disconnect(); } - if (slot == -1 || !is_color) { + if (offset == -1 || !is_color) { input("Color")->disconnect(); } } void OutputAOVNode::compile(SVMCompiler &compiler) { - assert(slot >= 0); + assert(offset >= 0); if (is_color) { - compiler.add_node(NODE_AOV_COLOR, compiler.stack_assign(input("Color")), slot); + compiler.add_node(NODE_AOV_COLOR, compiler.stack_assign(input("Color")), offset); } else { - compiler.add_node(NODE_AOV_VALUE, compiler.stack_assign(input("Value")), slot); + compiler.add_node(NODE_AOV_VALUE, compiler.stack_assign(input("Value")), offset); } } diff --git a/intern/cycles/render/nodes.h b/intern/cycles/render/nodes.h index 3013e9b1866..22bdb06b059 100644 --- a/intern/cycles/render/nodes.h +++ b/intern/cycles/render/nodes.h @@ -143,10 +143,6 @@ class EnvironmentTextureNode : public ImageSlotTextureNode { { return true; } - virtual int get_group() - { - return NODE_GROUP_LEVEL_2; - } virtual bool equals(const ShaderNode &other) { @@ -170,11 +166,6 @@ class SkyTextureNode : public TextureNode { public: 
SHADER_NODE_CLASS(SkyTextureNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_2; - } - NODE_SOCKET_API(NodeSkyType, sky_type) NODE_SOCKET_API(float3, sun_direction) NODE_SOCKET_API(float, turbidity) @@ -224,18 +215,13 @@ class OutputAOVNode : public ShaderNode { NODE_SOCKET_API(ustring, name) - virtual int get_group() - { - return NODE_GROUP_LEVEL_4; - } - /* Don't allow output node de-duplication. */ virtual bool equals(const ShaderNode & /*other*/) { return false; } - int slot; + int offset; bool is_color; }; @@ -243,11 +229,6 @@ class GradientTextureNode : public TextureNode { public: SHADER_NODE_CLASS(GradientTextureNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_2; - } - NODE_SOCKET_API(NodeGradientType, gradient_type) NODE_SOCKET_API(float3, vector) }; @@ -269,19 +250,14 @@ class VoronoiTextureNode : public TextureNode { public: SHADER_NODE_CLASS(VoronoiTextureNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_2; - } - virtual int get_feature() { int result = ShaderNode::get_feature(); if (dimensions == 4) { - result |= NODE_FEATURE_VORONOI_EXTRA; + result |= KERNEL_FEATURE_NODE_VORONOI_EXTRA; } else if (dimensions >= 2 && feature == NODE_VORONOI_SMOOTH_F1) { - result |= NODE_FEATURE_VORONOI_EXTRA; + result |= KERNEL_FEATURE_NODE_VORONOI_EXTRA; } return result; } @@ -301,11 +277,6 @@ class MusgraveTextureNode : public TextureNode { public: SHADER_NODE_CLASS(MusgraveTextureNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_2; - } - NODE_SOCKET_API(int, dimensions) NODE_SOCKET_API(NodeMusgraveType, musgrave_type) NODE_SOCKET_API(float, w) @@ -322,11 +293,6 @@ class WaveTextureNode : public TextureNode { public: SHADER_NODE_CLASS(WaveTextureNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_2; - } - NODE_SOCKET_API(NodeWaveType, wave_type) NODE_SOCKET_API(NodeWaveBandsDirection, bands_direction) NODE_SOCKET_API(NodeWaveRingsDirection, rings_direction) @@ -345,11 +311,6 @@ class MagicTextureNode : public 
TextureNode { public: SHADER_NODE_CLASS(MagicTextureNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_2; - } - NODE_SOCKET_API(int, depth) NODE_SOCKET_API(float3, vector) NODE_SOCKET_API(float, scale) @@ -364,11 +325,6 @@ class CheckerTextureNode : public TextureNode { NODE_SOCKET_API(float3, color1) NODE_SOCKET_API(float3, color2) NODE_SOCKET_API(float, scale) - - virtual int get_group() - { - return NODE_GROUP_LEVEL_2; - } }; class BrickTextureNode : public TextureNode { @@ -390,20 +346,11 @@ class BrickTextureNode : public TextureNode { NODE_SOCKET_API(float, brick_width) NODE_SOCKET_API(float, row_height) NODE_SOCKET_API(float3, vector) - - virtual int get_group() - { - return NODE_GROUP_LEVEL_2; - } }; class PointDensityTextureNode : public ShaderNode { public: SHADER_NODE_NO_CLONE_CLASS(PointDensityTextureNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_4; - } ~PointDensityTextureNode(); ShaderNode *clone(ShaderGraph *graph) const; @@ -443,10 +390,6 @@ class IESLightNode : public TextureNode { ~IESLightNode(); ShaderNode *clone(ShaderGraph *graph) const; - virtual int get_group() - { - return NODE_GROUP_LEVEL_2; - } NODE_SOCKET_API(ustring, filename) NODE_SOCKET_API(ustring, ies) @@ -464,10 +407,6 @@ class IESLightNode : public TextureNode { class WhiteNoiseTextureNode : public ShaderNode { public: SHADER_NODE_CLASS(WhiteNoiseTextureNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_2; - } NODE_SOCKET_API(int, dimensions) NODE_SOCKET_API(float3, vector) @@ -477,10 +416,6 @@ class WhiteNoiseTextureNode : public ShaderNode { class MappingNode : public ShaderNode { public: SHADER_NODE_CLASS(MappingNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_2; - } void constant_fold(const ConstantFolder &folder); NODE_SOCKET_API(float3, vector) @@ -546,6 +481,11 @@ class BsdfBaseNode : public ShaderNode { return false; } + virtual int get_feature() + { + return ShaderNode::get_feature() | KERNEL_FEATURE_NODE_BSDF; + } + 
protected: ClosureType closure; }; @@ -606,6 +546,8 @@ class PrincipledBsdfNode : public BsdfBaseNode { ShaderInput *metallic, ShaderInput *subsurface, ShaderInput *subsurface_radius, + ShaderInput *subsurface_ior, + ShaderInput *subsurface_anisotropy, ShaderInput *specular, ShaderInput *roughness, ShaderInput *specular_tint, @@ -622,6 +564,8 @@ class PrincipledBsdfNode : public BsdfBaseNode { NODE_SOCKET_API(float3, base_color) NODE_SOCKET_API(float3, subsurface_color) NODE_SOCKET_API(float3, subsurface_radius) + NODE_SOCKET_API(float, subsurface_ior) + NODE_SOCKET_API(float, subsurface_anisotropy) NODE_SOCKET_API(float, metallic) NODE_SOCKET_API(float, subsurface) NODE_SOCKET_API(float, specular) @@ -758,14 +702,14 @@ class SubsurfaceScatteringNode : public BsdfNode { bool has_bssrdf_bump(); ClosureType get_closure_type() { - return falloff; + return method; } NODE_SOCKET_API(float, scale) NODE_SOCKET_API(float3, radius) - NODE_SOCKET_API(float, sharpness) - NODE_SOCKET_API(float, texture_blur) - NODE_SOCKET_API(ClosureType, falloff) + NODE_SOCKET_API(float, subsurface_ior) + NODE_SOCKET_API(float, subsurface_anisotropy) + NODE_SOCKET_API(ClosureType, method) }; class EmissionNode : public ShaderNode { @@ -782,6 +726,11 @@ class EmissionNode : public ShaderNode { return true; } + virtual int get_feature() + { + return ShaderNode::get_feature() | KERNEL_FEATURE_NODE_EMISSION; + } + NODE_SOCKET_API(float3, color) NODE_SOCKET_API(float, strength) NODE_SOCKET_API(float, surface_mix_weight) @@ -792,6 +741,11 @@ class BackgroundNode : public ShaderNode { SHADER_NODE_CLASS(BackgroundNode) void constant_fold(const ConstantFolder &folder); + virtual int get_feature() + { + return ShaderNode::get_feature() | KERNEL_FEATURE_NODE_EMISSION; + } + NODE_SOCKET_API(float3, color) NODE_SOCKET_API(float, strength) NODE_SOCKET_API(float, surface_mix_weight) @@ -800,10 +754,6 @@ class BackgroundNode : public ShaderNode { class HoldoutNode : public ShaderNode { public: 
SHADER_NODE_CLASS(HoldoutNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_1; - } virtual ClosureType get_closure_type() { return CLOSURE_HOLDOUT_ID; @@ -821,13 +771,9 @@ class AmbientOcclusionNode : public ShaderNode { { return true; } - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } - virtual bool has_raytrace() + virtual int get_feature() { - return true; + return KERNEL_FEATURE_NODE_RAYTRACE; } NODE_SOCKET_API(float3, color) @@ -845,13 +791,9 @@ class VolumeNode : public ShaderNode { SHADER_NODE_BASE_CLASS(VolumeNode) void compile(SVMCompiler &compiler, ShaderInput *param1, ShaderInput *param2); - virtual int get_group() - { - return NODE_GROUP_LEVEL_1; - } virtual int get_feature() { - return ShaderNode::get_feature() | NODE_FEATURE_VOLUME; + return ShaderNode::get_feature() | KERNEL_FEATURE_NODE_VOLUME; } virtual ClosureType get_closure_type() { @@ -1013,10 +955,6 @@ class UVMapNode : public ShaderNode { { return true; } - virtual int get_group() - { - return NODE_GROUP_LEVEL_1; - } NODE_SOCKET_API(ustring, attribute) NODE_SOCKET_API(bool, from_dupli) @@ -1025,10 +963,6 @@ class UVMapNode : public ShaderNode { class LightPathNode : public ShaderNode { public: SHADER_NODE_CLASS(LightPathNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_1; - } }; class LightFalloffNode : public ShaderNode { @@ -1038,10 +972,6 @@ class LightFalloffNode : public ShaderNode { { return true; } - virtual int get_group() - { - return NODE_GROUP_LEVEL_2; - } NODE_SOCKET_API(float, strength) NODE_SOCKET_API(float, smooth) @@ -1050,10 +980,6 @@ class LightFalloffNode : public ShaderNode { class ObjectInfoNode : public ShaderNode { public: SHADER_NODE_CLASS(ObjectInfoNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_1; - } }; class ParticleInfoNode : public ShaderNode { @@ -1064,10 +990,6 @@ class ParticleInfoNode : public ShaderNode { { return true; } - virtual int get_group() - { - return NODE_GROUP_LEVEL_1; - } }; class HairInfoNode : 
public ShaderNode { @@ -1083,13 +1005,9 @@ class HairInfoNode : public ShaderNode { { return true; } - virtual int get_group() - { - return NODE_GROUP_LEVEL_1; - } virtual int get_feature() { - return ShaderNode::get_feature() | NODE_FEATURE_HAIR; + return ShaderNode::get_feature() | KERNEL_FEATURE_NODE_HAIR; } }; @@ -1168,10 +1086,6 @@ class InvertNode : public ShaderNode { public: SHADER_NODE_CLASS(InvertNode) void constant_fold(const ConstantFolder &folder); - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } NODE_SOCKET_API(float, fac) NODE_SOCKET_API(float3, color) @@ -1182,11 +1096,6 @@ class MixNode : public ShaderNode { SHADER_NODE_CLASS(MixNode) void constant_fold(const ConstantFolder &folder); - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } - NODE_SOCKET_API(NodeMix, mix_type) NODE_SOCKET_API(bool, use_clamp) NODE_SOCKET_API(float3, color1) @@ -1198,10 +1107,6 @@ class CombineRGBNode : public ShaderNode { public: SHADER_NODE_CLASS(CombineRGBNode) void constant_fold(const ConstantFolder &folder); - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } NODE_SOCKET_API(float, r) NODE_SOCKET_API(float, g) @@ -1212,10 +1117,6 @@ class CombineHSVNode : public ShaderNode { public: SHADER_NODE_CLASS(CombineHSVNode) void constant_fold(const ConstantFolder &folder); - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } NODE_SOCKET_API(float, h) NODE_SOCKET_API(float, s) @@ -1226,10 +1127,6 @@ class CombineXYZNode : public ShaderNode { public: SHADER_NODE_CLASS(CombineXYZNode) void constant_fold(const ConstantFolder &folder); - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } NODE_SOCKET_API(float, x) NODE_SOCKET_API(float, y) @@ -1240,10 +1137,6 @@ class GammaNode : public ShaderNode { public: SHADER_NODE_CLASS(GammaNode) void constant_fold(const ConstantFolder &folder); - virtual int get_group() - { - return NODE_GROUP_LEVEL_1; - } NODE_SOCKET_API(float3, color) NODE_SOCKET_API(float, gamma) @@ -1253,10 +1146,6 
@@ class BrightContrastNode : public ShaderNode { public: SHADER_NODE_CLASS(BrightContrastNode) void constant_fold(const ConstantFolder &folder); - virtual int get_group() - { - return NODE_GROUP_LEVEL_1; - } NODE_SOCKET_API(float3, color) NODE_SOCKET_API(float, bright) @@ -1267,10 +1156,6 @@ class SeparateRGBNode : public ShaderNode { public: SHADER_NODE_CLASS(SeparateRGBNode) void constant_fold(const ConstantFolder &folder); - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } NODE_SOCKET_API(float3, color) }; @@ -1279,10 +1164,6 @@ class SeparateHSVNode : public ShaderNode { public: SHADER_NODE_CLASS(SeparateHSVNode) void constant_fold(const ConstantFolder &folder); - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } NODE_SOCKET_API(float3, color) }; @@ -1291,10 +1172,6 @@ class SeparateXYZNode : public ShaderNode { public: SHADER_NODE_CLASS(SeparateXYZNode) void constant_fold(const ConstantFolder &folder); - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } NODE_SOCKET_API(float3, vector) }; @@ -1333,10 +1210,6 @@ class CameraNode : public ShaderNode { { return true; } - virtual int get_group() - { - return NODE_GROUP_LEVEL_2; - } }; class FresnelNode : public ShaderNode { @@ -1346,10 +1219,6 @@ class FresnelNode : public ShaderNode { { return true; } - virtual int get_group() - { - return NODE_GROUP_LEVEL_1; - } NODE_SOCKET_API(float3, normal) NODE_SOCKET_API(float, IOR) @@ -1362,10 +1231,6 @@ class LayerWeightNode : public ShaderNode { { return true; } - virtual int get_group() - { - return NODE_GROUP_LEVEL_1; - } NODE_SOCKET_API(float3, normal) NODE_SOCKET_API(float, blend) @@ -1378,10 +1243,6 @@ class WireframeNode : public ShaderNode { { return true; } - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } NODE_SOCKET_API(float, size) NODE_SOCKET_API(bool, use_pixel_size) @@ -1390,10 +1251,6 @@ class WireframeNode : public ShaderNode { class WavelengthNode : public ShaderNode { public: 
SHADER_NODE_CLASS(WavelengthNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } NODE_SOCKET_API(float, wavelength) }; @@ -1402,10 +1259,6 @@ class BlackbodyNode : public ShaderNode { public: SHADER_NODE_CLASS(BlackbodyNode) void constant_fold(const ConstantFolder &folder); - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } NODE_SOCKET_API(float, temperature) }; @@ -1413,10 +1266,6 @@ class BlackbodyNode : public ShaderNode { class MapRangeNode : public ShaderNode { public: SHADER_NODE_CLASS(MapRangeNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } void expand(ShaderGraph *graph); NODE_SOCKET_API(float, value) @@ -1433,10 +1282,6 @@ class ClampNode : public ShaderNode { public: SHADER_NODE_CLASS(ClampNode) void constant_fold(const ConstantFolder &folder); - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } NODE_SOCKET_API(float, value) NODE_SOCKET_API(float, min) NODE_SOCKET_API(float, max) @@ -1446,10 +1291,6 @@ class ClampNode : public ShaderNode { class MathNode : public ShaderNode { public: SHADER_NODE_CLASS(MathNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_1; - } void expand(ShaderGraph *graph); void constant_fold(const ConstantFolder &folder); @@ -1463,10 +1304,6 @@ class MathNode : public ShaderNode { class NormalNode : public ShaderNode { public: SHADER_NODE_CLASS(NormalNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_2; - } NODE_SOCKET_API(float3, direction) NODE_SOCKET_API(float3, normal) @@ -1475,10 +1312,6 @@ class NormalNode : public ShaderNode { class VectorMathNode : public ShaderNode { public: SHADER_NODE_CLASS(VectorMathNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_1; - } void constant_fold(const ConstantFolder &folder); NODE_SOCKET_API(float3, vector1) @@ -1492,10 +1325,6 @@ class VectorRotateNode : public ShaderNode { public: SHADER_NODE_CLASS(VectorRotateNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } 
NODE_SOCKET_API(NodeVectorRotateType, rotate_type) NODE_SOCKET_API(bool, invert) NODE_SOCKET_API(float3, vector) @@ -1509,11 +1338,6 @@ class VectorTransformNode : public ShaderNode { public: SHADER_NODE_CLASS(VectorTransformNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } - NODE_SOCKET_API(NodeVectorTransformType, transform_type) NODE_SOCKET_API(NodeVectorTransformConvertSpace, convert_from) NODE_SOCKET_API(NodeVectorTransformConvertSpace, convert_to) @@ -1530,7 +1354,7 @@ class BumpNode : public ShaderNode { } virtual int get_feature() { - return NODE_FEATURE_BUMP; + return KERNEL_FEATURE_NODE_BUMP; } NODE_SOCKET_API(bool, invert) @@ -1549,11 +1373,6 @@ class CurvesNode : public ShaderNode { explicit CurvesNode(const NodeType *node_type); SHADER_NODE_BASE_CLASS(CurvesNode) - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } - NODE_SOCKET_API_ARRAY(array<float3>, curves) NODE_SOCKET_API(float, min_x) NODE_SOCKET_API(float, max_x) @@ -1583,10 +1402,6 @@ class RGBRampNode : public ShaderNode { public: SHADER_NODE_CLASS(RGBRampNode) void constant_fold(const ConstantFolder &folder); - virtual int get_group() - { - return NODE_GROUP_LEVEL_1; - } NODE_SOCKET_API_ARRAY(array<float3>, ramp) NODE_SOCKET_API_ARRAY(array<float>, ramp_alpha) @@ -1656,10 +1471,6 @@ class NormalMapNode : public ShaderNode { { return true; } - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } NODE_SOCKET_API(NodeNormalMapSpace, space) NODE_SOCKET_API(ustring, attribute) @@ -1680,10 +1491,6 @@ class TangentNode : public ShaderNode { { return true; } - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } NODE_SOCKET_API(NodeTangentDirectionType, direction_type) NODE_SOCKET_API(NodeTangentAxis, axis) @@ -1698,13 +1505,9 @@ class BevelNode : public ShaderNode { { return true; } - virtual int get_group() - { - return NODE_GROUP_LEVEL_3; - } - virtual bool has_raytrace() + virtual int get_feature() { - return true; + return 
KERNEL_FEATURE_NODE_RAYTRACE; } NODE_SOCKET_API(float, radius) @@ -1718,7 +1521,7 @@ class DisplacementNode : public ShaderNode { void constant_fold(const ConstantFolder &folder); virtual int get_feature() { - return NODE_FEATURE_BUMP; + return KERNEL_FEATURE_NODE_BUMP; } NODE_SOCKET_API(NodeNormalMapSpace, space) @@ -1739,7 +1542,7 @@ class VectorDisplacementNode : public ShaderNode { void constant_fold(const ConstantFolder &folder); virtual int get_feature() { - return NODE_FEATURE_BUMP; + return KERNEL_FEATURE_NODE_BUMP; } NODE_SOCKET_API(NodeNormalMapSpace, space) diff --git a/intern/cycles/render/object.cpp b/intern/cycles/render/object.cpp index c88d94fe4c2..4637f8fe989 100644 --- a/intern/cycles/render/object.cpp +++ b/intern/cycles/render/object.cpp @@ -216,6 +216,10 @@ void Object::tag_update(Scene *scene) if (use_holdout_is_modified()) { flag |= ObjectManager::HOLDOUT_MODIFIED; } + + if (is_shadow_catcher_is_modified()) { + scene->tag_shadow_catcher_modified(); + } } if (geometry) { @@ -273,14 +277,7 @@ bool Object::is_traceable() const uint Object::visibility_for_tracing() const { - uint trace_visibility = visibility; - if (is_shadow_catcher) { - trace_visibility &= ~PATH_RAY_SHADOW_NON_CATCHER; - } - else { - trace_visibility &= ~PATH_RAY_SHADOW_CATCHER; - } - return trace_visibility; + return SHADOW_CATCHER_OBJECT_VISIBILITY(is_shadow_catcher, visibility & PATH_RAY_ALL_VISIBILITY); } float Object::compute_volume_step_size() const @@ -680,7 +677,7 @@ void ObjectManager::device_update(Device *device, /* prepare for static BVH building */ /* todo: do before to support getting object level coords? 
*/ - if (scene->params.bvh_type == SceneParams::BVH_STATIC) { + if (scene->params.bvh_type == BVH_TYPE_STATIC) { scoped_callback_timer timer([scene](double time) { if (scene->update_stats) { scene->update_stats->object.times.add_entry( @@ -932,6 +929,11 @@ void ObjectManager::tag_update(Scene *scene, uint32_t flag) } scene->light_manager->tag_update(scene, LightManager::OBJECT_MANAGER); + + /* Integrator's shadow catcher settings depends on object visibility settings. */ + if (flag & (OBJECT_ADDED | OBJECT_REMOVED | OBJECT_MODIFIED)) { + scene->integrator->tag_update(scene, Integrator::OBJECT_MANAGER); + } } bool ObjectManager::need_update() const diff --git a/intern/cycles/render/osl.cpp b/intern/cycles/render/osl.cpp index 7dc79f48145..d28b222c10e 100644 --- a/intern/cycles/render/osl.cpp +++ b/intern/cycles/render/osl.cpp @@ -113,7 +113,7 @@ void OSLShaderManager::device_update_specific(Device *device, scene->image_manager->set_osl_texture_system((void *)ts); /* create shaders */ - OSLGlobals *og = (OSLGlobals *)device->osl_memory(); + OSLGlobals *og = (OSLGlobals *)device->get_cpu_osl_memory(); Shader *background_shader = scene->background->get_shader(scene); foreach (Shader *shader, scene->shaders) { @@ -174,7 +174,7 @@ void OSLShaderManager::device_update_specific(Device *device, void OSLShaderManager::device_free(Device *device, DeviceScene *dscene, Scene *scene) { - OSLGlobals *og = (OSLGlobals *)device->osl_memory(); + OSLGlobals *og = (OSLGlobals *)device->get_cpu_osl_memory(); device_free_common(device, dscene, scene); @@ -257,25 +257,36 @@ void OSLShaderManager::shading_system_init() /* our own ray types */ static const char *raytypes[] = { - "camera", /* PATH_RAY_CAMERA */ - "reflection", /* PATH_RAY_REFLECT */ - "refraction", /* PATH_RAY_TRANSMIT */ - "diffuse", /* PATH_RAY_DIFFUSE */ - "glossy", /* PATH_RAY_GLOSSY */ - "singular", /* PATH_RAY_SINGULAR */ - "transparent", /* PATH_RAY_TRANSPARENT */ - - "shadow", /* PATH_RAY_SHADOW_OPAQUE_NON_CATCHER 
*/ - "shadow", /* PATH_RAY_SHADOW_OPAQUE_CATCHER */ - "shadow", /* PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER */ - "shadow", /* PATH_RAY_SHADOW_TRANSPARENT_CATCHER */ - - "__unused__", "volume_scatter", /* PATH_RAY_VOLUME_SCATTER */ - "__unused__", - - "__unused__", "diffuse_ancestor", /* PATH_RAY_DIFFUSE_ANCESTOR */ - "__unused__", "__unused__", "__unused__", "__unused__", - "__unused__", "__unused__", "__unused__", + "camera", /* PATH_RAY_CAMERA */ + "reflection", /* PATH_RAY_REFLECT */ + "refraction", /* PATH_RAY_TRANSMIT */ + "diffuse", /* PATH_RAY_DIFFUSE */ + "glossy", /* PATH_RAY_GLOSSY */ + "singular", /* PATH_RAY_SINGULAR */ + "transparent", /* PATH_RAY_TRANSPARENT */ + "volume_scatter", /* PATH_RAY_VOLUME_SCATTER */ + + "shadow", /* PATH_RAY_SHADOW_OPAQUE */ + "shadow", /* PATH_RAY_SHADOW_TRANSPARENT */ + + "__unused__", /* PATH_RAY_NODE_UNALIGNED */ + "__unused__", /* PATH_RAY_MIS_SKIP */ + + "diffuse_ancestor", /* PATH_RAY_DIFFUSE_ANCESTOR */ + + "__unused__", /* PATH_RAY_SINGLE_PASS_DONE */ + "__unused__", /* PATH_RAY_TRANSPARENT_BACKGROUND */ + "__unused__", /* PATH_RAY_TERMINATE_IMMEDIATE */ + "__unused__", /* PATH_RAY_TERMINATE_AFTER_TRANSPARENT */ + "__unused__", /* PATH_RAY_EMISSION */ + "__unused__", /* PATH_RAY_SUBSURFACE */ + "__unused__", /* PATH_RAY_DENOISING_FEATURES */ + "__unused__", /* PATH_RAY_REFLECT_PASS */ + "__unused__", /* PATH_RAY_TRANSMISSION_PASS */ + "__unused__", /* PATH_RAY_VOLUME_PASS */ + "__unused__", /* PATH_RAY_SHADOW_FOR_LIGHT */ + "__unused__", /* PATH_RAY_SHADOW_CATCHER_HIT */ + "__unused__", /* PATH_RAY_SHADOW_CATCHER_PASS */ }; const int nraytypes = sizeof(raytypes) / sizeof(raytypes[0]); @@ -758,7 +769,8 @@ void OSLCompiler::add(ShaderNode *node, const char *name, bool isfilepath) current_shader->has_surface_bssrdf = true; current_shader->has_bssrdf_bump = true; /* can't detect yet */ } - current_shader->has_bump = true; /* can't detect yet */ + current_shader->has_bump = true; /* can't detect yet */ + 
current_shader->has_surface_raytrace = true; /* can't detect yet */ } if (node->has_spatial_varying()) { @@ -1054,6 +1066,8 @@ void OSLCompiler::generate_nodes(const ShaderNodeSet &nodes) current_shader->has_surface_emission = true; if (node->has_surface_transparent()) current_shader->has_surface_transparent = true; + if (node->get_feature() & KERNEL_FEATURE_NODE_RAYTRACE) + current_shader->has_surface_raytrace = true; if (node->has_spatial_varying()) current_shader->has_surface_spatial_varying = true; if (node->has_surface_bssrdf()) { diff --git a/intern/cycles/render/pass.cpp b/intern/cycles/render/pass.cpp new file mode 100644 index 00000000000..27ad7c0db97 --- /dev/null +++ b/intern/cycles/render/pass.cpp @@ -0,0 +1,427 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "render/pass.h" + +#include "util/util_algorithm.h" +#include "util/util_logging.h" + +CCL_NAMESPACE_BEGIN + +const char *pass_type_as_string(const PassType type) +{ + const int type_int = static_cast<int>(type); + + const NodeEnum *type_enum = Pass::get_type_enum(); + + if (!type_enum->exists(type_int)) { + LOG(DFATAL) << "Unhandled pass type " << static_cast<int>(type) << ", not supposed to happen."; + return "UNKNOWN"; + } + + return (*type_enum)[type_int].c_str(); +} + +const char *pass_mode_as_string(PassMode mode) +{ + switch (mode) { + case PassMode::NOISY: + return "NOISY"; + case PassMode::DENOISED: + return "DENOISED"; + } + + LOG(DFATAL) << "Unhandled pass mode " << static_cast<int>(mode) << ", should never happen."; + return "UNKNOWN"; +} + +std::ostream &operator<<(std::ostream &os, PassMode mode) +{ + os << pass_mode_as_string(mode); + return os; +} + +const NodeEnum *Pass::get_type_enum() +{ + static NodeEnum pass_type_enum; + + if (pass_type_enum.empty()) { + + /* Light Passes. 
*/ + pass_type_enum.insert("combined", PASS_COMBINED); + pass_type_enum.insert("emission", PASS_EMISSION); + pass_type_enum.insert("background", PASS_BACKGROUND); + pass_type_enum.insert("ao", PASS_AO); + pass_type_enum.insert("shadow", PASS_SHADOW); + pass_type_enum.insert("diffuse", PASS_DIFFUSE); + pass_type_enum.insert("diffuse_direct", PASS_DIFFUSE_DIRECT); + pass_type_enum.insert("diffuse_indirect", PASS_DIFFUSE_INDIRECT); + pass_type_enum.insert("glossy", PASS_GLOSSY); + pass_type_enum.insert("glossy_direct", PASS_GLOSSY_DIRECT); + pass_type_enum.insert("glossy_indirect", PASS_GLOSSY_INDIRECT); + pass_type_enum.insert("transmission", PASS_TRANSMISSION); + pass_type_enum.insert("transmission_direct", PASS_TRANSMISSION_DIRECT); + pass_type_enum.insert("transmission_indirect", PASS_TRANSMISSION_INDIRECT); + pass_type_enum.insert("volume", PASS_VOLUME); + pass_type_enum.insert("volume_direct", PASS_VOLUME_DIRECT); + pass_type_enum.insert("volume_indirect", PASS_VOLUME_INDIRECT); + + /* Data passes. 
*/ + pass_type_enum.insert("depth", PASS_DEPTH); + pass_type_enum.insert("position", PASS_POSITION); + pass_type_enum.insert("normal", PASS_NORMAL); + pass_type_enum.insert("roughness", PASS_ROUGHNESS); + pass_type_enum.insert("uv", PASS_UV); + pass_type_enum.insert("object_id", PASS_OBJECT_ID); + pass_type_enum.insert("material_id", PASS_MATERIAL_ID); + pass_type_enum.insert("motion", PASS_MOTION); + pass_type_enum.insert("motion_weight", PASS_MOTION_WEIGHT); + pass_type_enum.insert("render_time", PASS_RENDER_TIME); + pass_type_enum.insert("cryptomatte", PASS_CRYPTOMATTE); + pass_type_enum.insert("aov_color", PASS_AOV_COLOR); + pass_type_enum.insert("aov_value", PASS_AOV_VALUE); + pass_type_enum.insert("adaptive_aux_buffer", PASS_ADAPTIVE_AUX_BUFFER); + pass_type_enum.insert("sample_count", PASS_SAMPLE_COUNT); + pass_type_enum.insert("diffuse_color", PASS_DIFFUSE_COLOR); + pass_type_enum.insert("glossy_color", PASS_GLOSSY_COLOR); + pass_type_enum.insert("transmission_color", PASS_TRANSMISSION_COLOR); + pass_type_enum.insert("mist", PASS_MIST); + pass_type_enum.insert("denoising_normal", PASS_DENOISING_NORMAL); + pass_type_enum.insert("denoising_albedo", PASS_DENOISING_ALBEDO); + + pass_type_enum.insert("shadow_catcher", PASS_SHADOW_CATCHER); + pass_type_enum.insert("shadow_catcher_sample_count", PASS_SHADOW_CATCHER_SAMPLE_COUNT); + pass_type_enum.insert("shadow_catcher_matte", PASS_SHADOW_CATCHER_MATTE); + + pass_type_enum.insert("bake_primitive", PASS_BAKE_PRIMITIVE); + pass_type_enum.insert("bake_differential", PASS_BAKE_DIFFERENTIAL); + } + + return &pass_type_enum; +} + +const NodeEnum *Pass::get_mode_enum() +{ + static NodeEnum pass_mode_enum; + + if (pass_mode_enum.empty()) { + pass_mode_enum.insert("noisy", static_cast<int>(PassMode::NOISY)); + pass_mode_enum.insert("denoised", static_cast<int>(PassMode::DENOISED)); + } + + return &pass_mode_enum; +} + +NODE_DEFINE(Pass) +{ + NodeType *type = NodeType::add("pass", create); + + const NodeEnum *pass_type_enum 
= get_type_enum(); + const NodeEnum *pass_mode_enum = get_mode_enum(); + + SOCKET_ENUM(type, "Type", *pass_type_enum, PASS_COMBINED); + SOCKET_ENUM(mode, "Mode", *pass_mode_enum, static_cast<int>(PassMode::DENOISED)); + SOCKET_STRING(name, "Name", ustring()); + SOCKET_BOOLEAN(include_albedo, "Include Albedo", false); + + return type; +} + +Pass::Pass() : Node(get_node_type()), is_auto_(false) +{ +} + +PassInfo Pass::get_info() const +{ + return get_info(type, include_albedo); +} + +bool Pass::is_written() const +{ + return get_info().is_written; +} + +PassInfo Pass::get_info(const PassType type, const bool include_albedo) +{ + PassInfo pass_info; + + pass_info.use_filter = true; + pass_info.use_exposure = false; + pass_info.divide_type = PASS_NONE; + pass_info.use_compositing = false; + pass_info.use_denoising_albedo = true; + + switch (type) { + case PASS_NONE: + pass_info.num_components = 0; + break; + case PASS_COMBINED: + pass_info.num_components = 4; + pass_info.use_exposure = true; + pass_info.support_denoise = true; + break; + case PASS_DEPTH: + pass_info.num_components = 1; + pass_info.use_filter = false; + break; + case PASS_MIST: + pass_info.num_components = 1; + break; + case PASS_POSITION: + pass_info.num_components = 3; + break; + case PASS_NORMAL: + pass_info.num_components = 3; + break; + case PASS_ROUGHNESS: + pass_info.num_components = 1; + break; + case PASS_UV: + pass_info.num_components = 3; + break; + case PASS_MOTION: + pass_info.num_components = 4; + pass_info.divide_type = PASS_MOTION_WEIGHT; + break; + case PASS_MOTION_WEIGHT: + pass_info.num_components = 1; + break; + case PASS_OBJECT_ID: + case PASS_MATERIAL_ID: + pass_info.num_components = 1; + pass_info.use_filter = false; + break; + + case PASS_EMISSION: + case PASS_BACKGROUND: + pass_info.num_components = 3; + pass_info.use_exposure = true; + break; + case PASS_AO: + pass_info.num_components = 3; + break; + case PASS_SHADOW: + pass_info.num_components = 3; + pass_info.use_exposure = 
false; + break; + case PASS_RENDER_TIME: + /* This pass is handled entirely on the host side. */ + pass_info.num_components = 0; + break; + + case PASS_DIFFUSE_COLOR: + case PASS_GLOSSY_COLOR: + case PASS_TRANSMISSION_COLOR: + pass_info.num_components = 3; + break; + case PASS_DIFFUSE: + pass_info.num_components = 3; + pass_info.use_exposure = true; + pass_info.direct_type = PASS_DIFFUSE_DIRECT; + pass_info.indirect_type = PASS_DIFFUSE_INDIRECT; + pass_info.divide_type = (!include_albedo) ? PASS_DIFFUSE_COLOR : PASS_NONE; + pass_info.use_compositing = true; + pass_info.is_written = false; + break; + case PASS_DIFFUSE_DIRECT: + case PASS_DIFFUSE_INDIRECT: + pass_info.num_components = 3; + pass_info.use_exposure = true; + pass_info.divide_type = (!include_albedo) ? PASS_DIFFUSE_COLOR : PASS_NONE; + pass_info.use_compositing = true; + break; + case PASS_GLOSSY: + pass_info.num_components = 3; + pass_info.use_exposure = true; + pass_info.direct_type = PASS_GLOSSY_DIRECT; + pass_info.indirect_type = PASS_GLOSSY_INDIRECT; + pass_info.divide_type = (!include_albedo) ? PASS_GLOSSY_COLOR : PASS_NONE; + pass_info.use_compositing = true; + pass_info.is_written = false; + break; + case PASS_GLOSSY_DIRECT: + case PASS_GLOSSY_INDIRECT: + pass_info.num_components = 3; + pass_info.use_exposure = true; + pass_info.divide_type = (!include_albedo) ? PASS_GLOSSY_COLOR : PASS_NONE; + pass_info.use_compositing = true; + break; + case PASS_TRANSMISSION: + pass_info.num_components = 3; + pass_info.use_exposure = true; + pass_info.direct_type = PASS_TRANSMISSION_DIRECT; + pass_info.indirect_type = PASS_TRANSMISSION_INDIRECT; + pass_info.divide_type = (!include_albedo) ? PASS_TRANSMISSION_COLOR : PASS_NONE; + pass_info.use_compositing = true; + pass_info.is_written = false; + break; + case PASS_TRANSMISSION_DIRECT: + case PASS_TRANSMISSION_INDIRECT: + pass_info.num_components = 3; + pass_info.use_exposure = true; + pass_info.divide_type = (!include_albedo) ? 
PASS_TRANSMISSION_COLOR : PASS_NONE; + pass_info.use_compositing = true; + break; + case PASS_VOLUME: + pass_info.num_components = 3; + pass_info.use_exposure = true; + pass_info.direct_type = PASS_VOLUME_DIRECT; + pass_info.indirect_type = PASS_VOLUME_INDIRECT; + pass_info.use_compositing = true; + pass_info.is_written = false; + break; + case PASS_VOLUME_DIRECT: + case PASS_VOLUME_INDIRECT: + pass_info.num_components = 3; + pass_info.use_exposure = true; + break; + + case PASS_CRYPTOMATTE: + pass_info.num_components = 4; + break; + + case PASS_DENOISING_NORMAL: + pass_info.num_components = 3; + break; + case PASS_DENOISING_ALBEDO: + pass_info.num_components = 3; + break; + + case PASS_SHADOW_CATCHER: + pass_info.num_components = 3; + pass_info.use_exposure = true; + pass_info.use_compositing = true; + pass_info.use_denoising_albedo = false; + pass_info.support_denoise = true; + break; + case PASS_SHADOW_CATCHER_SAMPLE_COUNT: + pass_info.num_components = 1; + break; + case PASS_SHADOW_CATCHER_MATTE: + pass_info.num_components = 4; + pass_info.use_exposure = true; + pass_info.support_denoise = true; + /* Without shadow catcher approximation compositing is not needed. + * Since we don't know here whether approximation is used or not, leave the decision up to + * the caller which will know that. 
*/ + break; + + case PASS_ADAPTIVE_AUX_BUFFER: + pass_info.num_components = 4; + break; + case PASS_SAMPLE_COUNT: + pass_info.num_components = 1; + pass_info.use_exposure = false; + break; + + case PASS_AOV_COLOR: + pass_info.num_components = 3; + break; + case PASS_AOV_VALUE: + pass_info.num_components = 1; + break; + + case PASS_BAKE_PRIMITIVE: + case PASS_BAKE_DIFFERENTIAL: + pass_info.num_components = 4; + pass_info.use_exposure = false; + pass_info.use_filter = false; + break; + + case PASS_CATEGORY_LIGHT_END: + case PASS_CATEGORY_DATA_END: + case PASS_CATEGORY_BAKE_END: + case PASS_NUM: + LOG(DFATAL) << "Unexpected pass type is used " << type; + pass_info.num_components = 0; + break; + } + + return pass_info; +} + +bool Pass::contains(const vector<Pass *> &passes, PassType type) +{ + for (const Pass *pass : passes) { + if (pass->get_type() != type) { + continue; + } + + return true; + } + + return false; +} + +const Pass *Pass::find(const vector<Pass *> &passes, const string &name) +{ + for (const Pass *pass : passes) { + if (pass->get_name() == name) { + return pass; + } + } + + return nullptr; +} + +const Pass *Pass::find(const vector<Pass *> &passes, PassType type, PassMode mode) +{ + for (const Pass *pass : passes) { + if (pass->get_type() != type || pass->get_mode() != mode) { + continue; + } + + return pass; + } + + return nullptr; +} + +int Pass::get_offset(const vector<Pass *> &passes, const Pass *pass) +{ + int pass_offset = 0; + + for (const Pass *current_pass : passes) { + /* Note that pass name is allowed to be empty. This is why we check for type and mode. 
*/ + if (current_pass->get_type() == pass->get_type() && + current_pass->get_mode() == pass->get_mode() && + current_pass->get_name() == pass->get_name()) { + if (current_pass->is_written()) { + return pass_offset; + } + else { + return PASS_UNUSED; + } + } + if (current_pass->is_written()) { + pass_offset += current_pass->get_info().num_components; + } + } + + return PASS_UNUSED; +} + +std::ostream &operator<<(std::ostream &os, const Pass &pass) +{ + os << "type: " << pass_type_as_string(pass.get_type()); + os << ", name: \"" << pass.get_name() << "\""; + os << ", mode: " << pass.get_mode(); + os << ", is_written: " << string_from_bool(pass.is_written()); + + return os; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/render/pass.h b/intern/cycles/render/pass.h new file mode 100644 index 00000000000..82230c62cb0 --- /dev/null +++ b/intern/cycles/render/pass.h @@ -0,0 +1,106 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include <ostream> // NOLINT + +#include "util/util_string.h" +#include "util/util_vector.h" + +#include "kernel/kernel_types.h" + +#include "graph/node.h" + +CCL_NAMESPACE_BEGIN + +const char *pass_type_as_string(const PassType type); + +enum class PassMode { + NOISY, + DENOISED, +}; +const char *pass_mode_as_string(PassMode mode); +std::ostream &operator<<(std::ostream &os, PassMode mode); + +struct PassInfo { + int num_components = -1; + bool use_filter = false; + bool use_exposure = false; + bool is_written = true; + PassType divide_type = PASS_NONE; + PassType direct_type = PASS_NONE; + PassType indirect_type = PASS_NONE; + + /* Pass access for read can not happen directly and needs some sort of compositing (for example, + * light passes due to divide_type, or shadow catcher pass. */ + bool use_compositing = false; + + /* Used to disable albedo pass for denoising. + * Light and shadow catcher passes should not have discontinuity in the denoised result based on + * the underlying albedo. */ + bool use_denoising_albedo = true; + + /* Pass supports denoising. */ + bool support_denoise = false; +}; + +class Pass : public Node { + public: + NODE_DECLARE + + NODE_SOCKET_API(PassType, type) + NODE_SOCKET_API(PassMode, mode) + NODE_SOCKET_API(ustring, name) + NODE_SOCKET_API(bool, include_albedo) + + Pass(); + + PassInfo get_info() const; + + /* The pass is written by the render pipeline (kernel or denoiser). If the pass is written it + * will have pixels allocated in a RenderBuffer. Passes which are not written do not have their + * pixels allocated to save memory. */ + bool is_written() const; + + protected: + /* The has been created automatically as a requirement to various rendering functionality (such + * as adaptive sampling). 
*/ + bool is_auto_; + + public: + static const NodeEnum *get_type_enum(); + static const NodeEnum *get_mode_enum(); + + static PassInfo get_info(PassType type, const bool include_albedo = false); + + static bool contains(const vector<Pass *> &passes, PassType type); + + /* Returns nullptr if there is no pass with the given name or type+mode. */ + static const Pass *find(const vector<Pass *> &passes, const string &name); + static const Pass *find(const vector<Pass *> &passes, + PassType type, + PassMode mode = PassMode::NOISY); + + /* Returns PASS_UNUSED if there is no corresponding pass. */ + static int get_offset(const vector<Pass *> &passes, const Pass *pass); + + friend class Film; +}; + +std::ostream &operator<<(std::ostream &os, const Pass &pass); + +CCL_NAMESPACE_END diff --git a/intern/cycles/render/scene.cpp b/intern/cycles/render/scene.cpp index c4e7d2c79d6..a4b030190dc 100644 --- a/intern/cycles/render/scene.cpp +++ b/intern/cycles/render/scene.cpp @@ -163,12 +163,15 @@ void Scene::free_memory(bool final) delete p; foreach (Light *l, lights) delete l; + foreach (Pass *p, passes) + delete p; geometry.clear(); objects.clear(); lights.clear(); particle_systems.clear(); procedurals.clear(); + passes.clear(); if (device) { camera->device_free(device, &dscene, this); @@ -253,7 +256,6 @@ void Scene::device_update(Device *device_, Progress &progress) * - Camera may be used for adaptive subdivision. * - Displacement shader must have all shader data available. * - Light manager needs lookup tables and final mesh data to compute emission CDF. 
- * - Film needs light manager to run for use_light_visibility * - Lookup tables are done a second time to handle film tables */ @@ -469,88 +471,110 @@ void Scene::enable_update_stats() } } -DeviceRequestedFeatures Scene::get_requested_device_features() +void Scene::update_kernel_features() { - DeviceRequestedFeatures requested_features; + if (!need_update()) { + return; + } - shader_manager->get_requested_features(this, &requested_features); + /* These features are not being tweaked as often as shaders, + * so could be done selective magic for the viewport as well. */ + uint kernel_features = shader_manager->get_kernel_features(this); - /* This features are not being tweaked as often as shaders, - * so could be done selective magic for the viewport as well. - */ bool use_motion = need_motion() == Scene::MotionType::MOTION_BLUR; - requested_features.use_hair = false; - requested_features.use_hair_thick = (params.hair_shape == CURVE_THICK); - requested_features.use_object_motion = false; - requested_features.use_camera_motion = use_motion && camera->use_motion(); + kernel_features |= KERNEL_FEATURE_PATH_TRACING; + if (params.hair_shape == CURVE_THICK) { + kernel_features |= KERNEL_FEATURE_HAIR_THICK; + } + if (use_motion && camera->use_motion()) { + kernel_features |= KERNEL_FEATURE_CAMERA_MOTION; + } foreach (Object *object, objects) { Geometry *geom = object->get_geometry(); if (use_motion) { - requested_features.use_object_motion |= object->use_motion() | geom->get_use_motion_blur(); - requested_features.use_camera_motion |= geom->get_use_motion_blur(); + if (object->use_motion() || geom->get_use_motion_blur()) { + kernel_features |= KERNEL_FEATURE_OBJECT_MOTION; + } + if (geom->get_use_motion_blur()) { + kernel_features |= KERNEL_FEATURE_CAMERA_MOTION; + } } if (object->get_is_shadow_catcher()) { - requested_features.use_shadow_tricks = true; + kernel_features |= KERNEL_FEATURE_SHADOW_CATCHER; } if (geom->is_mesh()) { Mesh *mesh = static_cast<Mesh *>(geom); 
#ifdef WITH_OPENSUBDIV if (mesh->get_subdivision_type() != Mesh::SUBDIVISION_NONE) { - requested_features.use_patch_evaluation = true; + kernel_features |= KERNEL_FEATURE_PATCH_EVALUATION; } #endif - requested_features.use_true_displacement |= mesh->has_true_displacement(); } else if (geom->is_hair()) { - requested_features.use_hair = true; + kernel_features |= KERNEL_FEATURE_HAIR; } } - requested_features.use_background_light = light_manager->has_background_light(this); - - requested_features.use_baking = bake_manager->get_baking(); - requested_features.use_integrator_branched = (integrator->get_method() == - Integrator::BRANCHED_PATH); - if (film->get_denoising_data_pass()) { - requested_features.use_denoising = true; - requested_features.use_shadow_tricks = true; + if (bake_manager->get_baking()) { + kernel_features |= KERNEL_FEATURE_BAKING; } - return requested_features; -} + kernel_features |= film->get_kernel_features(this); -bool Scene::update(Progress &progress, bool &kernel_switch_needed) -{ - /* update scene */ - if (need_update()) { - /* Update max_closures. */ - KernelIntegrator *kintegrator = &dscene.data.integrator; - if (params.background) { - kintegrator->max_closures = get_max_closure_count(); - } - else { - /* Currently viewport render is faster with higher max_closures, needs investigating. */ - kintegrator->max_closures = MAX_CLOSURE; - } - - /* Load render kernels, before device update where we upload data to the GPU. 
*/ - bool new_kernels_needed = load_kernels(progress, false); - - progress.set_status("Updating Scene"); - MEM_GUARDED_CALL(&progress, device_update, device, progress); + dscene.data.kernel_features = kernel_features; - DeviceKernelStatus kernel_switch_status = device->get_active_kernel_switch_state(); - kernel_switch_needed = kernel_switch_status == DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE || - kernel_switch_status == DEVICE_KERNEL_FEATURE_KERNEL_INVALID; - if (new_kernels_needed || kernel_switch_needed) { - progress.set_kernel_status("Compiling render kernels"); - device->wait_for_availability(loaded_kernel_features); - progress.set_kernel_status(""); - } + /* Currently viewport render is faster with higher max_closures, needs investigating. */ + const uint max_closures = (params.background) ? get_max_closure_count() : MAX_CLOSURE; + dscene.data.max_closures = max_closures; + dscene.data.max_shaders = shaders.size(); +} - return true; +bool Scene::update(Progress &progress) +{ + if (!need_update()) { + return false; } - return false; + + /* Load render kernels, before device update where we upload data to the GPU. */ + load_kernels(progress, false); + + /* Upload scene data to the GPU. 
*/ + progress.set_status("Updating Scene"); + MEM_GUARDED_CALL(&progress, device_update, device, progress); + + return true; +} + +static void log_kernel_features(const uint features) +{ + VLOG(2) << "Requested features:\n"; + VLOG(2) << "Use BSDF " << string_from_bool(features & KERNEL_FEATURE_NODE_BSDF) << "\n"; + VLOG(2) << "Use Principled BSDF " << string_from_bool(features & KERNEL_FEATURE_PRINCIPLED) + << "\n"; + VLOG(2) << "Use Emission " << string_from_bool(features & KERNEL_FEATURE_NODE_EMISSION) << "\n"; + VLOG(2) << "Use Volume " << string_from_bool(features & KERNEL_FEATURE_NODE_VOLUME) << "\n"; + VLOG(2) << "Use Hair " << string_from_bool(features & KERNEL_FEATURE_NODE_HAIR) << "\n"; + VLOG(2) << "Use Bump " << string_from_bool(features & KERNEL_FEATURE_NODE_BUMP) << "\n"; + VLOG(2) << "Use Voronoi " << string_from_bool(features & KERNEL_FEATURE_NODE_VORONOI_EXTRA) + << "\n"; + VLOG(2) << "Use Shader Raytrace " << string_from_bool(features & KERNEL_FEATURE_NODE_RAYTRACE) + << "\n"; + VLOG(2) << "Use Transparent " << string_from_bool(features & KERNEL_FEATURE_TRANSPARENT) << "\n"; + VLOG(2) << "Use Denoising " << string_from_bool(features & KERNEL_FEATURE_DENOISING) << "\n"; + VLOG(2) << "Use Path Tracing " << string_from_bool(features & KERNEL_FEATURE_PATH_TRACING) + << "\n"; + VLOG(2) << "Use Hair " << string_from_bool(features & KERNEL_FEATURE_HAIR) << "\n"; + VLOG(2) << "Use Object Motion " << string_from_bool(features & KERNEL_FEATURE_OBJECT_MOTION) + << "\n"; + VLOG(2) << "Use Camera Motion " << string_from_bool(features & KERNEL_FEATURE_CAMERA_MOTION) + << "\n"; + VLOG(2) << "Use Baking " << string_from_bool(features & KERNEL_FEATURE_BAKING) << "\n"; + VLOG(2) << "Use Subsurface " << string_from_bool(features & KERNEL_FEATURE_SUBSURFACE) << "\n"; + VLOG(2) << "Use Volume " << string_from_bool(features & KERNEL_FEATURE_VOLUME) << "\n"; + VLOG(2) << "Use Patch Evaluation " + << string_from_bool(features & KERNEL_FEATURE_PATCH_EVALUATION) << "\n"; + 
VLOG(2) << "Use Shadow Catcher " << string_from_bool(features & KERNEL_FEATURE_SHADOW_CATCHER) + << "\n"; } bool Scene::load_kernels(Progress &progress, bool lock_scene) @@ -560,15 +584,15 @@ bool Scene::load_kernels(Progress &progress, bool lock_scene) scene_lock = thread_scoped_lock(mutex); } - DeviceRequestedFeatures requested_features = get_requested_device_features(); + const uint kernel_features = dscene.data.kernel_features; - if (!kernels_loaded || loaded_kernel_features.modified(requested_features)) { + if (!kernels_loaded || loaded_kernel_features != kernel_features) { progress.set_status("Loading render kernels (may take a few minutes the first time)"); scoped_timer timer; - VLOG(2) << "Requested features:\n" << requested_features; - if (!device->load_kernels(requested_features)) { + log_kernel_features(kernel_features); + if (!device->load_kernels(kernel_features)) { string message = device->error_message(); if (message.empty()) message = "Failed loading render kernel, see console for errors"; @@ -580,7 +604,7 @@ bool Scene::load_kernels(Progress &progress, bool lock_scene) } kernels_loaded = true; - loaded_kernel_features = requested_features; + loaded_kernel_features = kernel_features; return true; } return false; @@ -618,6 +642,28 @@ int Scene::get_max_closure_count() return max_closure_global; } +bool Scene::has_shadow_catcher() +{ + if (shadow_catcher_modified_) { + has_shadow_catcher_ = false; + for (Object *object : objects) { + if (object->get_is_shadow_catcher()) { + has_shadow_catcher_ = true; + break; + } + } + + shadow_catcher_modified_ = false; + } + + return has_shadow_catcher_; +} + +void Scene::tag_shadow_catcher_modified() +{ + shadow_catcher_modified_ = true; +} + template<> Light *Scene::create_node<Light>() { Light *node = new Light(); @@ -694,6 +740,15 @@ template<> AlembicProcedural *Scene::create_node<AlembicProcedural>() #endif } +template<> Pass *Scene::create_node<Pass>() +{ + Pass *node = new Pass(); + node->set_owner(this); + 
passes.push_back(node); + film->tag_modified(); + return node; +} + template<typename T> void delete_node_from_array(vector<T> &nodes, T node) { for (size_t i = 0; i < nodes.size(); ++i) { @@ -779,6 +834,12 @@ template<> void Scene::delete_node_impl(AlembicProcedural *node) #endif } +template<> void Scene::delete_node_impl(Pass *node) +{ + delete_node_from_array(passes, node); + film->tag_modified(); +} + template<typename T> static void remove_nodes_in_set(const set<T *> &nodes_set, vector<T *> &nodes_array, @@ -842,4 +903,10 @@ template<> void Scene::delete_nodes(const set<Procedural *> &nodes, const NodeOw procedural_manager->tag_update(); } +template<> void Scene::delete_nodes(const set<Pass *> &nodes, const NodeOwner *owner) +{ + remove_nodes_in_set(nodes, passes, owner); + film->tag_modified(); +} + CCL_NAMESPACE_END diff --git a/intern/cycles/render/scene.h b/intern/cycles/render/scene.h index 7d8a6774381..cf4a3ba6b12 100644 --- a/intern/cycles/render/scene.h +++ b/intern/cycles/render/scene.h @@ -128,7 +128,7 @@ class DeviceScene { device_vector<float> lookup_table; /* integrator */ - device_vector<uint> sample_pattern_lut; + device_vector<float> sample_pattern_lut; /* ies lights */ device_vector<float> ies_lights; @@ -142,27 +142,6 @@ class DeviceScene { class SceneParams { public: - /* Type of BVH, in terms whether it is supported dynamic updates of meshes - * or whether modifying geometry requires full BVH rebuild. - */ - enum BVHType { - /* BVH supports dynamic updates of geometry. - * - * Faster for updating BVH tree when doing modifications in viewport, - * but slower for rendering. - */ - BVH_DYNAMIC = 0, - /* BVH tree is calculated for specific scene, updates in geometry - * requires full tree rebuild. - * - * Slower to update BVH tree when modifying objects in viewport, also - * slower to build final BVH tree but gives best possible render speed. - */ - BVH_STATIC = 1, - - BVH_NUM_TYPES, - }; - ShadingSystem shadingsystem; /* Requested BVH layout. 
@@ -186,7 +165,7 @@ class SceneParams { { shadingsystem = SHADINGSYSTEM_SVM; bvh_layout = BVH_LAYOUT_BVH2; - bvh_type = BVH_DYNAMIC; + bvh_type = BVH_TYPE_DYNAMIC; use_bvh_spatial_split = false; use_bvh_unaligned_nodes = true; num_bvh_time_steps = 0; @@ -196,7 +175,7 @@ class SceneParams { background = true; } - bool modified(const SceneParams ¶ms) + bool modified(const SceneParams ¶ms) const { return !(shadingsystem == params.shadingsystem && bvh_layout == params.bvh_layout && bvh_type == params.bvh_type && @@ -236,7 +215,7 @@ class Scene : public NodeOwner { vector<Shader *> shaders; vector<Light *> lights; vector<ParticleSystem *> particle_systems; - vector<Pass> passes; + vector<Pass *> passes; vector<Procedural *> procedurals; /* data managers */ @@ -291,7 +270,11 @@ class Scene : public NodeOwner { void enable_update_stats(); - bool update(Progress &progress, bool &kernel_switch_needed); + void update_kernel_features(); + bool update(Progress &progress); + + bool has_shadow_catcher(); + void tag_shadow_catcher_modified(); /* This function is used to create a node of a specified type instead of * calling 'new', and sets the scene as the owner of the node. @@ -348,13 +331,12 @@ class Scene : public NodeOwner { void free_memory(bool final); bool kernels_loaded; - DeviceRequestedFeatures loaded_kernel_features; + uint loaded_kernel_features; bool load_kernels(Progress &progress, bool lock_scene = true); - /* ** Split kernel routines ** */ - - DeviceRequestedFeatures get_requested_device_features(); + bool has_shadow_catcher_ = false; + bool shadow_catcher_modified_ = true; /* Maximum number of closure during session lifetime. 
*/ int max_closure_global; @@ -384,6 +366,8 @@ template<> Shader *Scene::create_node<Shader>(); template<> AlembicProcedural *Scene::create_node<AlembicProcedural>(); +template<> Pass *Scene::create_node<Pass>(); + template<> void Scene::delete_node_impl(Light *node); template<> void Scene::delete_node_impl(Mesh *node); @@ -404,6 +388,8 @@ template<> void Scene::delete_node_impl(Procedural *node); template<> void Scene::delete_node_impl(AlembicProcedural *node); +template<> void Scene::delete_node_impl(Pass *node); + template<> void Scene::delete_nodes(const set<Light *> &nodes, const NodeOwner *owner); template<> void Scene::delete_nodes(const set<Geometry *> &nodes, const NodeOwner *owner); @@ -416,6 +402,8 @@ template<> void Scene::delete_nodes(const set<Shader *> &nodes, const NodeOwner template<> void Scene::delete_nodes(const set<Procedural *> &nodes, const NodeOwner *owner); +template<> void Scene::delete_nodes(const set<Pass *> &nodes, const NodeOwner *owner); + CCL_NAMESPACE_END #endif /* __SCENE_H__ */ diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp index 1b91c49f0ea..84407f8e6dd 100644 --- a/intern/cycles/render/session.cpp +++ b/intern/cycles/render/session.cpp @@ -17,10 +17,15 @@ #include <limits.h> #include <string.h> +#include "device/cpu/device.h" #include "device/device.h" +#include "integrator/pass_accessor_cpu.h" +#include "integrator/path_trace.h" +#include "render/background.h" #include "render/bake.h" #include "render/buffers.h" #include "render/camera.h" +#include "render/gpu_display.h" #include "render/graph.h" #include "render/integrator.h" #include "render/light.h" @@ -39,70 +44,63 @@ CCL_NAMESPACE_BEGIN -/* Note about preserve_tile_device option for tile manager: - * progressive refine and viewport rendering does requires tiles to - * always be allocated for the same device - */ -Session::Session(const SessionParams ¶ms_) - : params(params_), - tile_manager(params.progressive, - params.samples, - 
params.tile_size, - params.start_resolution, - params.background == false || params.progressive_refine, - params.background, - params.tile_order, - max(params.device.multi_devices.size(), 1), - params.pixel_size), - stats(), - profiler() +Session::Session(const SessionParams ¶ms_, const SceneParams &scene_params) + : params(params_), render_scheduler_(tile_manager_, params) { - device_use_gl_ = ((params.device.type != DEVICE_CPU) && !params.background); - TaskScheduler::init(params.threads); - session_thread_ = NULL; - scene = NULL; - - reset_time_ = 0.0; - last_update_time_ = 0.0; + session_thread_ = nullptr; delayed_reset_.do_reset = false; - delayed_reset_.samples = 0; - - display_outdated_ = false; - gpu_draw_ready_ = false; - gpu_need_display_buffer_update_ = false; pause_ = false; cancel_ = false; new_work_added_ = false; - buffers = NULL; - display = NULL; + device = Device::create(params.device, stats, profiler); - /* Validate denoising parameters. */ - set_denoising(params.denoising); + scene = new Scene(scene_params, device); - /* Create CPU/GPU devices. */ - device = Device::create(params.device, stats, profiler, params.background); - - if (!device->error_message().empty()) { - progress.set_error(device->error_message()); - return; - } + /* Configure path tracer. */ + path_trace_ = make_unique<PathTrace>( + device, scene->film, &scene->dscene, render_scheduler_, tile_manager_); + path_trace_->set_progress(&progress); + path_trace_->tile_buffer_update_cb = [&]() { + if (!update_render_tile_cb) { + return; + } + update_render_tile_cb(); + }; + path_trace_->tile_buffer_write_cb = [&]() { + if (!write_render_tile_cb) { + return; + } + write_render_tile_cb(); + }; + path_trace_->tile_buffer_read_cb = [&]() -> bool { + if (!read_render_tile_cb) { + return false; + } + read_render_tile_cb(); + return true; + }; + path_trace_->progress_update_cb = [&]() { update_status_time(); }; - /* Create buffers for interactive rendering. 
*/ - if (!(params.background && !params.write_render_cb)) { - buffers = new RenderBuffers(device); - display = new DisplayBuffer(device, params.display_buffer_linear); - } + tile_manager_.full_buffer_written_cb = [&](string_view filename) { + if (!full_buffer_written_cb) { + return; + } + full_buffer_written_cb(filename); + }; } Session::~Session() { cancel(); + /* TODO(sergey): Bring the passes in viewport back. + * It is unclear why there is such an exception needed though. */ +#if 0 if (buffers && params.write_render_cb) { /* Copy to display buffer and write out image if requested */ delete display; @@ -116,12 +114,14 @@ Session::~Session() uchar4 *pixels = display->rgba_byte.copy_from_device(0, w, h); params.write_render_cb((uchar *)pixels, w, h, 4); } +#endif - /* clean up */ - tile_manager.device_free(); + /* Make sure path tracer is destroyed before the deviec. This is needed because destruction might + * need to access device for device memory free. */ + /* TODO(sergey): Convert device to be unique_ptr, and rely on C++ to destruct objects in the + * pre-defined order. 
*/ + path_trace_.reset(); - delete buffers; - delete display; delete scene; delete device; @@ -135,15 +135,16 @@ void Session::start() } } -void Session::cancel() +void Session::cancel(bool quick) { + if (quick && path_trace_) { + path_trace_->cancel(); + } + if (session_thread_) { /* wait for session thread to end */ progress.set_cancel("Exiting"); - gpu_need_display_buffer_update_ = false; - gpu_need_display_buffer_update_cond_.notify_all(); - { thread_scoped_lock pause_lock(pause_mutex_); pause_ = false; @@ -157,570 +158,43 @@ void Session::cancel() bool Session::ready_to_reset() { - double dt = time_dt() - reset_time_; - - if (!display_outdated_) - return (dt > params.reset_timeout); - else - return (dt > params.cancel_timeout); + return path_trace_->ready_to_reset(); } -/* GPU Session */ - -void Session::reset_gpu(BufferParams &buffer_params, int samples) +void Session::run_main_render_loop() { - thread_scoped_lock pause_lock(pause_mutex_); - - /* block for buffer access and reset immediately. 
we can't do this - * in the thread, because we need to allocate an OpenGL buffer, and - * that only works in the main thread */ - thread_scoped_lock display_lock(display_mutex_); - thread_scoped_lock buffers_lock(buffers_mutex_); + path_trace_->clear_gpu_display(); - display_outdated_ = true; - reset_time_ = time_dt(); + while (true) { + RenderWork render_work = run_update_for_next_iteration(); - reset_(buffer_params, samples); - - gpu_need_display_buffer_update_ = false; - gpu_need_display_buffer_update_cond_.notify_all(); - - new_work_added_ = true; - - pause_cond_.notify_all(); -} - -bool Session::draw_gpu(BufferParams &buffer_params, DeviceDrawParams &draw_params) -{ - /* block for buffer access */ - thread_scoped_lock display_lock(display_mutex_); - - /* first check we already rendered something */ - if (gpu_draw_ready_) { - /* then verify the buffers have the expected size, so we don't - * draw previous results in a resized window */ - if (buffer_params.width == display->params.width && - buffer_params.height == display->params.height) { - /* for CUDA we need to do tone-mapping still, since we can - * only access GL buffers from the main thread. 
*/ - if (gpu_need_display_buffer_update_) { - thread_scoped_lock buffers_lock(buffers_mutex_); - copy_to_display_buffer(tile_manager.state.sample); - gpu_need_display_buffer_update_ = false; - gpu_need_display_buffer_update_cond_.notify_all(); + if (!render_work) { + if (VLOG_IS_ON(2)) { + double total_time, render_time; + progress.get_time(total_time, render_time); + VLOG(2) << "Rendering in main loop is done in " << render_time << " seconds."; + VLOG(2) << path_trace_->full_report(); } - display->draw(device, draw_params); - - if (display_outdated_ && (time_dt() - reset_time_) > params.text_timeout) - return false; - - return true; - } - } - - return false; -} - -void Session::run_gpu() -{ - bool tiles_written = false; - - reset_time_ = time_dt(); - last_update_time_ = time_dt(); - last_display_time_ = last_update_time_; - - progress.set_render_start_time(); - - while (!progress.get_cancel()) { - const bool no_tiles = !run_update_for_next_iteration(); - - if (no_tiles) { if (params.background) { - /* if no work left and in background mode, we can stop immediately */ + /* if no work left and in background mode, we can stop immediately. 
*/ progress.set_status("Finished"); break; } } - if (run_wait_for_work(no_tiles)) { - continue; - } - - if (progress.get_cancel()) { - break; - } - - if (!no_tiles) { - if (!device->error_message().empty()) - progress.set_error(device->error_message()); - - if (progress.get_cancel()) - break; - - /* buffers mutex is locked entirely while rendering each - * sample, and released/reacquired on each iteration to allow - * reset and draw in between */ - thread_scoped_lock buffers_lock(buffers_mutex_); - - /* update status and timing */ - update_status_time(); - - /* render */ - bool delayed_denoise = false; - const bool need_denoise = render_need_denoise(delayed_denoise); - render(need_denoise); - - device->task_wait(); - - if (!device->error_message().empty()) - progress.set_cancel(device->error_message()); - - /* update status and timing */ - update_status_time(); - - gpu_need_display_buffer_update_ = !delayed_denoise; - gpu_draw_ready_ = true; - progress.set_update(); - - /* wait for until display buffer is updated */ - if (!params.background) { - while (gpu_need_display_buffer_update_) { - if (progress.get_cancel()) - break; - - gpu_need_display_buffer_update_cond_.wait(buffers_lock); - } - } - - if (!device->error_message().empty()) - progress.set_error(device->error_message()); - - tiles_written = update_progressive_refine(progress.get_cancel()); - - if (progress.get_cancel()) - break; - } - } - - if (!tiles_written) - update_progressive_refine(true); -} - -/* CPU Session */ - -void Session::reset_cpu(BufferParams &buffer_params, int samples) -{ - thread_scoped_lock reset_lock(delayed_reset_.mutex); - thread_scoped_lock pause_lock(pause_mutex_); - - display_outdated_ = true; - reset_time_ = time_dt(); - - delayed_reset_.params = buffer_params; - delayed_reset_.samples = samples; - delayed_reset_.do_reset = true; - device->task_cancel(); - - pause_cond_.notify_all(); -} - -bool Session::draw_cpu(BufferParams &buffer_params, DeviceDrawParams &draw_params) -{ - 
thread_scoped_lock display_lock(display_mutex_); - - /* first check we already rendered something */ - if (display->draw_ready()) { - /* then verify the buffers have the expected size, so we don't - * draw previous results in a resized window */ - if (buffer_params.width == display->params.width && - buffer_params.height == display->params.height) { - display->draw(device, draw_params); - - if (display_outdated_ && (time_dt() - reset_time_) > params.text_timeout) - return false; - - return true; - } - } - - return false; -} - -bool Session::steal_tile(RenderTile &rtile, Device *tile_device, thread_scoped_lock &tile_lock) -{ - /* Devices that can get their tiles stolen don't steal tiles themselves. - * Additionally, if there are no stealable tiles in flight, give up here. */ - if (tile_device->info.type == DEVICE_CPU || stealable_tiles_ == 0) { - return false; - } - - /* Wait until no other thread is trying to steal a tile. */ - while (tile_stealing_state_ != NOT_STEALING && stealable_tiles_ > 0) { - /* Someone else is currently trying to get a tile. - * Wait on the condition variable and try later. */ - tile_steal_cond_.wait(tile_lock); - } - /* If another thread stole the last stealable tile in the meantime, give up. */ - if (stealable_tiles_ == 0) { - return false; - } - - /* There are stealable tiles in flight, so signal that one should be released. */ - tile_stealing_state_ = WAITING_FOR_TILE; - - /* Wait until a device notices the signal and releases its tile. */ - while (tile_stealing_state_ != GOT_TILE && stealable_tiles_ > 0) { - tile_steal_cond_.wait(tile_lock); - } - /* If the last stealable tile finished on its own, give up. */ - if (tile_stealing_state_ != GOT_TILE) { - tile_stealing_state_ = NOT_STEALING; - return false; - } - - /* Successfully stole a tile, now move it to the new device. 
*/ - rtile = stolen_tile_; - rtile.buffers->buffer.move_device(tile_device); - rtile.buffer = rtile.buffers->buffer.device_pointer; - rtile.stealing_state = RenderTile::NO_STEALING; - rtile.num_samples -= (rtile.sample - rtile.start_sample); - rtile.start_sample = rtile.sample; - - tile_stealing_state_ = NOT_STEALING; - - /* Poke any threads which might be waiting for NOT_STEALING above. */ - tile_steal_cond_.notify_one(); - - return true; -} - -bool Session::get_tile_stolen() -{ - /* If tile_stealing_state is WAITING_FOR_TILE, atomically set it to RELEASING_TILE - * and return true. */ - TileStealingState expected = WAITING_FOR_TILE; - return tile_stealing_state_.compare_exchange_weak(expected, RELEASING_TILE); -} - -bool Session::acquire_tile(RenderTile &rtile, Device *tile_device, uint tile_types) -{ - if (progress.get_cancel()) { - if (params.progressive_refine == false) { - /* for progressive refine current sample should be finished for all tiles */ - return false; - } - } - - thread_scoped_lock tile_lock(tile_mutex_); - - /* get next tile from manager */ - Tile *tile; - int device_num = device->device_number(tile_device); - - while (!tile_manager.next_tile(tile, device_num, tile_types)) { - /* Can only steal tiles on devices that support rendering - * This is because denoising tiles cannot be stolen (see below) - */ - if ((tile_types & (RenderTile::PATH_TRACE | RenderTile::BAKE)) && - steal_tile(rtile, tile_device, tile_lock)) { - return true; - } - - /* Wait for denoising tiles to become available */ - if ((tile_types & RenderTile::DENOISE) && !progress.get_cancel() && tile_manager.has_tiles()) { - denoising_cond_.wait(tile_lock); - continue; - } - - return false; - } - - /* fill render tile */ - rtile.x = tile_manager.state.buffer.full_x + tile->x; - rtile.y = tile_manager.state.buffer.full_y + tile->y; - rtile.w = tile->w; - rtile.h = tile->h; - rtile.start_sample = tile_manager.state.sample; - rtile.num_samples = tile_manager.state.num_samples; - 
rtile.resolution = tile_manager.state.resolution_divider; - rtile.tile_index = tile->index; - rtile.stealing_state = RenderTile::NO_STEALING; - - if (tile->state == Tile::DENOISE) { - rtile.task = RenderTile::DENOISE; - } - else { - if (tile_device->info.type == DEVICE_CPU) { - stealable_tiles_++; - rtile.stealing_state = RenderTile::CAN_BE_STOLEN; - } - - if (read_bake_tile_cb) { - rtile.task = RenderTile::BAKE; - } - else { - rtile.task = RenderTile::PATH_TRACE; - } - } - - tile_lock.unlock(); - - /* in case of a permanent buffer, return it, otherwise we will allocate - * a new temporary buffer */ - if (buffers) { - tile_manager.state.buffer.get_offset_stride(rtile.offset, rtile.stride); - - rtile.buffer = buffers->buffer.device_pointer; - rtile.buffers = buffers; - - device->map_tile(tile_device, rtile); - - /* Reset copy state, since buffer contents change after the tile was acquired */ - buffers->map_neighbor_copied = false; - - /* This hack ensures that the copy in 'MultiDevice::map_neighbor_tiles' accounts - * for the buffer resolution divider. */ - buffers->buffer.data_width = (buffers->params.width * buffers->params.get_passes_size()) / - tile_manager.state.resolution_divider; - buffers->buffer.data_height = buffers->params.height / tile_manager.state.resolution_divider; - - return true; - } - - if (tile->buffers == NULL) { - /* fill buffer parameters */ - BufferParams buffer_params = tile_manager.params; - buffer_params.full_x = rtile.x; - buffer_params.full_y = rtile.y; - buffer_params.width = rtile.w; - buffer_params.height = rtile.h; - - /* allocate buffers */ - tile->buffers = new RenderBuffers(tile_device); - tile->buffers->reset(buffer_params); - } - else if (tile->buffers->buffer.device != tile_device) { - /* Move buffer to current tile device again in case it was stolen before. - * Not needed for denoising since that already handles mapping of tiles and - * neighbors to its own device. 
*/ - if (rtile.task != RenderTile::DENOISE) { - tile->buffers->buffer.move_device(tile_device); - } - } - - tile->buffers->map_neighbor_copied = false; - - tile->buffers->params.get_offset_stride(rtile.offset, rtile.stride); - - rtile.buffer = tile->buffers->buffer.device_pointer; - rtile.buffers = tile->buffers; - rtile.sample = tile_manager.state.sample; - - if (read_bake_tile_cb) { - /* This will read any passes needed as input for baking. */ - if (tile_manager.state.sample == tile_manager.range_start_sample) { - { - thread_scoped_lock tile_lock(tile_mutex_); - read_bake_tile_cb(rtile); - } - rtile.buffers->buffer.copy_to_device(); - } - } - else { - /* This will tag tile as IN PROGRESS in blender-side render pipeline, - * which is needed to highlight currently rendering tile before first - * sample was processed for it. */ - update_tile_sample(rtile); - } - - return true; -} - -void Session::update_tile_sample(RenderTile &rtile) -{ - thread_scoped_lock tile_lock(tile_mutex_); - - if (update_render_tile_cb) { - if (params.progressive_refine == false) { - /* todo: optimize this by making it thread safe and removing lock */ - - update_render_tile_cb(rtile, true); - } - } - - update_status_time(); -} - -void Session::release_tile(RenderTile &rtile, const bool need_denoise) -{ - thread_scoped_lock tile_lock(tile_mutex_); - - if (rtile.stealing_state != RenderTile::NO_STEALING) { - stealable_tiles_--; - if (rtile.stealing_state == RenderTile::WAS_STOLEN) { - /* If the tile is being stolen, don't release it here - the new device will pick up where - * the old one left off. */ - - assert(tile_stealing_state_ == RELEASING_TILE); - assert(rtile.sample < rtile.start_sample + rtile.num_samples); - - tile_stealing_state_ = GOT_TILE; - stolen_tile_ = rtile; - tile_steal_cond_.notify_all(); - return; - } - else if (stealable_tiles_ == 0) { - /* If this was the last stealable tile, wake up any threads still waiting for one. 
*/ - tile_steal_cond_.notify_all(); - } - } - - progress.add_finished_tile(rtile.task == RenderTile::DENOISE); - - bool delete_tile; - - if (tile_manager.finish_tile(rtile.tile_index, need_denoise, delete_tile)) { - /* Finished tile pixels write. */ - if (write_render_tile_cb && params.progressive_refine == false) { - write_render_tile_cb(rtile); - } - - if (delete_tile) { - delete rtile.buffers; - tile_manager.state.tiles[rtile.tile_index].buffers = NULL; - } - } - else { - /* In progress tile pixels update. */ - if (update_render_tile_cb && params.progressive_refine == false) { - update_render_tile_cb(rtile, false); - } - } - - update_status_time(); - - /* Notify denoising thread that a tile was finished. */ - denoising_cond_.notify_all(); -} - -void Session::map_neighbor_tiles(RenderTileNeighbors &neighbors, Device *tile_device) -{ - thread_scoped_lock tile_lock(tile_mutex_); - - const int4 image_region = make_int4( - tile_manager.state.buffer.full_x, - tile_manager.state.buffer.full_y, - tile_manager.state.buffer.full_x + tile_manager.state.buffer.width, - tile_manager.state.buffer.full_y + tile_manager.state.buffer.height); - - RenderTile ¢er_tile = neighbors.tiles[RenderTileNeighbors::CENTER]; - - if (!tile_manager.schedule_denoising) { - /* Fix up tile slices with overlap. */ - if (tile_manager.slice_overlap != 0) { - int y = max(center_tile.y - tile_manager.slice_overlap, image_region.y); - center_tile.h = min(center_tile.y + center_tile.h + tile_manager.slice_overlap, - image_region.w) - - y; - center_tile.y = y; - } - - /* Tiles are not being denoised individually, which means the entire image is processed. 
*/ - neighbors.set_bounds_from_center(); - } - else { - int center_idx = center_tile.tile_index; - assert(tile_manager.state.tiles[center_idx].state == Tile::DENOISE); - - for (int dy = -1, i = 0; dy <= 1; dy++) { - for (int dx = -1; dx <= 1; dx++, i++) { - RenderTile &rtile = neighbors.tiles[i]; - int nindex = tile_manager.get_neighbor_index(center_idx, i); - if (nindex >= 0) { - Tile *tile = &tile_manager.state.tiles[nindex]; - - rtile.x = image_region.x + tile->x; - rtile.y = image_region.y + tile->y; - rtile.w = tile->w; - rtile.h = tile->h; - - if (buffers) { - tile_manager.state.buffer.get_offset_stride(rtile.offset, rtile.stride); - - rtile.buffer = buffers->buffer.device_pointer; - rtile.buffers = buffers; - } - else { - assert(tile->buffers); - tile->buffers->params.get_offset_stride(rtile.offset, rtile.stride); - - rtile.buffer = tile->buffers->buffer.device_pointer; - rtile.buffers = tile->buffers; - } - } - else { - int px = center_tile.x + dx * params.tile_size.x; - int py = center_tile.y + dy * params.tile_size.y; - - rtile.x = clamp(px, image_region.x, image_region.z); - rtile.y = clamp(py, image_region.y, image_region.w); - rtile.w = rtile.h = 0; - - rtile.buffer = (device_ptr)NULL; - rtile.buffers = NULL; - } - } - } - } - - assert(center_tile.buffers); - device->map_neighbor_tiles(tile_device, neighbors); - - /* The denoised result is written back to the original tile. 
*/ - neighbors.target = center_tile; -} - -void Session::unmap_neighbor_tiles(RenderTileNeighbors &neighbors, Device *tile_device) -{ - thread_scoped_lock tile_lock(tile_mutex_); - device->unmap_neighbor_tiles(tile_device, neighbors); -} - -void Session::run_cpu() -{ - bool tiles_written = false; - - last_update_time_ = time_dt(); - last_display_time_ = last_update_time_; - - while (!progress.get_cancel()) { - const bool no_tiles = !run_update_for_next_iteration(); - bool need_copy_to_display_buffer = false; - - if (no_tiles) { - if (params.background) { - /* if no work left and in background mode, we can stop immediately */ - progress.set_status("Finished"); + const bool did_cancel = progress.get_cancel(); + if (did_cancel) { + render_scheduler_.render_work_reschedule_on_cancel(render_work); + if (!render_work) { break; } } - - if (run_wait_for_work(no_tiles)) { + else if (run_wait_for_work(render_work)) { continue; } - if (progress.get_cancel()) { - break; - } - - if (!no_tiles) { - if (!device->error_message().empty()) - progress.set_error(device->error_message()); - - if (progress.get_cancel()) - break; - + { /* buffers mutex is locked entirely while rendering each * sample, and released/reacquired on each iteration to allow * reset and draw in between */ @@ -730,49 +204,25 @@ void Session::run_cpu() update_status_time(); /* render */ - bool delayed_denoise = false; - const bool need_denoise = render_need_denoise(delayed_denoise); - render(need_denoise); + path_trace_->render(render_work); /* update status and timing */ update_status_time(); - if (!params.background) - need_copy_to_display_buffer = !delayed_denoise; - - if (!device->error_message().empty()) - progress.set_error(device->error_message()); - } - - device->task_wait(); - - { - thread_scoped_lock reset_lock(delayed_reset_.mutex); - thread_scoped_lock buffers_lock(buffers_mutex_); - thread_scoped_lock display_lock(display_mutex_); - - if (delayed_reset_.do_reset) { - /* reset rendering if request 
from main thread */ - delayed_reset_.do_reset = false; - reset_(delayed_reset_.params, delayed_reset_.samples); - } - else if (need_copy_to_display_buffer) { - /* Only copy to display_buffer if we do not reset, we don't - * want to show the result of an incomplete sample */ - copy_to_display_buffer(tile_manager.state.sample); + if (device->have_error()) { + const string &error_message = device->error_message(); + progress.set_error(error_message); + progress.set_cancel(error_message); + break; } - - if (!device->error_message().empty()) - progress.set_error(device->error_message()); - - tiles_written = update_progressive_refine(progress.get_cancel()); } progress.set_update(); - } - if (!tiles_written) - update_progressive_refine(true); + if (did_cancel) { + break; + } + } } void Session::run() @@ -789,10 +239,7 @@ void Session::run() /* reset number of rendered samples */ progress.reset_sample(); - if (device_use_gl_) - run_gpu(); - else - run_cpu(); + run_main_render_loop(); } profiler.stop(); @@ -804,31 +251,92 @@ void Session::run() progress.set_update(); } -bool Session::run_update_for_next_iteration() +RenderWork Session::run_update_for_next_iteration() { + RenderWork render_work; + thread_scoped_lock scene_lock(scene->mutex); thread_scoped_lock reset_lock(delayed_reset_.mutex); + bool have_tiles = true; + bool switched_to_new_tile = false; + if (delayed_reset_.do_reset) { thread_scoped_lock buffers_lock(buffers_mutex_); - reset_(delayed_reset_.params, delayed_reset_.samples); - delayed_reset_.do_reset = false; + do_delayed_reset(); + + /* After reset make sure the tile manager is at the first big tile. */ + have_tiles = tile_manager_.next(); + switched_to_new_tile = true; + } + + /* Update number of samples in the integrator. + * Ideally this would need to happen once in `Session::set_samples()`, but the issue there is + * the initial configuration when Session is created where the `set_samples()` is not used. 
*/ + scene->integrator->set_aa_samples(params.samples); + + /* Update denoiser settings. */ + { + const DenoiseParams denoise_params = scene->integrator->get_denoise_params(); + path_trace_->set_denoiser_params(denoise_params); + } + + /* Update adaptive sampling. */ + { + const AdaptiveSampling adaptive_sampling = scene->integrator->get_adaptive_sampling(); + path_trace_->set_adaptive_sampling(adaptive_sampling); } - const bool have_tiles = tile_manager.next(); + render_scheduler_.set_num_samples(params.samples); + render_scheduler_.set_time_limit(params.time_limit); + + while (have_tiles) { + render_work = render_scheduler_.get_render_work(); + if (render_work) { + break; + } - if (have_tiles) { + progress.add_finished_tile(false); + + have_tiles = tile_manager_.next(); + if (have_tiles) { + render_scheduler_.reset_for_next_tile(); + switched_to_new_tile = true; + } + } + + if (render_work) { scoped_timer update_timer; - if (update_scene()) { + + if (switched_to_new_tile) { + BufferParams tile_params = buffer_params_; + + const Tile &tile = tile_manager_.get_current_tile(); + tile_params.width = tile.width; + tile_params.height = tile.height; + tile_params.full_x = tile.x + buffer_params_.full_x; + tile_params.full_y = tile.y + buffer_params_.full_y; + tile_params.full_width = buffer_params_.full_width; + tile_params.full_height = buffer_params_.full_height; + tile_params.update_offset_stride(); + + path_trace_->reset(buffer_params_, tile_params); + } + + const int resolution = render_work.resolution_divider; + const int width = max(1, buffer_params_.full_width / resolution); + const int height = max(1, buffer_params_.full_height / resolution); + + if (update_scene(width, height)) { profiler.reset(scene->shaders.size(), scene->objects.size()); } progress.add_skip_time(update_timer, params.background); } - return have_tiles; + return render_work; } -bool Session::run_wait_for_work(bool no_tiles) +bool Session::run_wait_for_work(const RenderWork &render_work) { /* 
In an offline rendering there is no pause, and no tiles will mean the job is fully done. */ if (params.background) { @@ -837,19 +345,20 @@ bool Session::run_wait_for_work(bool no_tiles) thread_scoped_lock pause_lock(pause_mutex_); - if (!pause_ && !no_tiles) { + if (!pause_ && render_work) { /* Rendering is not paused and there is work to be done. No need to wait for anything. */ return false; } - update_status_time(pause_, no_tiles); + const bool no_work = !render_work; + update_status_time(pause_, no_work); /* Only leave the loop when rendering is not paused. But even if the current render is un-paused * but there is nothing to render keep waiting until new work is added. */ while (!cancel_) { scoped_timer pause_timer; - if (!pause_ && (!no_tiles || new_work_added_ || delayed_reset_.do_reset)) { + if (!pause_ && (render_work || new_work_added_ || delayed_reset_.do_reset)) { break; } @@ -860,52 +369,88 @@ bool Session::run_wait_for_work(bool no_tiles) progress.add_skip_time(pause_timer, params.background); } - update_status_time(pause_, no_tiles); + update_status_time(pause_, no_work); progress.set_update(); } new_work_added_ = false; - return no_tiles; + return no_work; } -bool Session::draw(BufferParams &buffer_params, DeviceDrawParams &draw_params) +void Session::draw() { - if (device_use_gl_) - return draw_gpu(buffer_params, draw_params); - else - return draw_cpu(buffer_params, draw_params); + path_trace_->draw(); } -void Session::reset_(BufferParams &buffer_params, int samples) +int2 Session::get_effective_tile_size() const { - if (buffers && buffer_params.modified(tile_manager.params)) { - gpu_draw_ready_ = false; - buffers->reset(buffer_params); - if (display) { - display->reset(buffer_params); - } + /* No support yet for baking with tiles. 
*/ + if (!params.use_auto_tile || scene->bake_manager->get_baking()) { + return make_int2(buffer_params_.width, buffer_params_.height); } - tile_manager.reset(buffer_params, samples); - stealable_tiles_ = 0; - tile_stealing_state_ = NOT_STEALING; - progress.reset_sample(); + /* TODO(sergey): Take available memory into account, and if there is enough memory do not tile + * and prefer optimal performance. */ + + return make_int2(params.tile_size, params.tile_size); +} + +void Session::do_delayed_reset() +{ + if (!delayed_reset_.do_reset) { + return; + } + delayed_reset_.do_reset = false; + + params = delayed_reset_.session_params; + buffer_params_ = delayed_reset_.buffer_params; + + /* Store parameters used for buffers access outside of scene graph. */ + buffer_params_.exposure = scene->film->get_exposure(); + buffer_params_.use_approximate_shadow_catcher = + scene->film->get_use_approximate_shadow_catcher(); + buffer_params_.use_transparent_background = scene->background->get_transparent(); - bool show_progress = params.background || tile_manager.get_num_effective_samples() != INT_MAX; - progress.set_total_pixel_samples(show_progress ? tile_manager.state.total_pixel_samples : 0); + /* Tile and work scheduling. */ + tile_manager_.reset_scheduling(buffer_params_, get_effective_tile_size()); + render_scheduler_.reset(buffer_params_, params.samples); - if (!params.background) + /* Passes. */ + /* When multiple tiles are used SAMPLE_COUNT pass is used to keep track of possible partial + * tile results. It is safe to use generic update function here which checks for changes since + * changes in tile settings re-creates session, which ensures film is fully updated on tile + * changes. */ + scene->film->update_passes(scene, tile_manager_.has_multiple_tiles()); + + /* Update for new state of scene and passes. */ + buffer_params_.update_passes(scene->passes); + tile_manager_.update(buffer_params_, scene); + + /* Progress. 
*/ + progress.reset_sample(); + progress.set_total_pixel_samples(buffer_params_.width * buffer_params_.height * params.samples); + + if (!params.background) { progress.set_start_time(); + } progress.set_render_start_time(); } -void Session::reset(BufferParams &buffer_params, int samples) +void Session::reset(const SessionParams &session_params, const BufferParams &buffer_params) { - if (device_use_gl_) - reset_gpu(buffer_params, samples); - else - reset_cpu(buffer_params, samples); + { + thread_scoped_lock reset_lock(delayed_reset_.mutex); + thread_scoped_lock pause_lock(pause_mutex_); + + delayed_reset_.do_reset = true; + delayed_reset_.session_params = session_params; + delayed_reset_.buffer_params = buffer_params; + + path_trace_->cancel(); + } + + pause_cond_.notify_all(); } void Session::set_samples(int samples) @@ -915,7 +460,22 @@ void Session::set_samples(int samples) } params.samples = samples; - tile_manager.set_samples(samples); + + { + thread_scoped_lock pause_lock(pause_mutex_); + new_work_added_ = true; + } + + pause_cond_.notify_all(); +} + +void Session::set_time_limit(double time_limit) +{ + if (time_limit == params.time_limit) { + return; + } + + params.time_limit = time_limit; { thread_scoped_lock pause_lock(pause_mutex_); @@ -948,38 +508,9 @@ void Session::set_pause(bool pause) } } -void Session::set_denoising(const DenoiseParams &denoising) +void Session::set_gpu_display(unique_ptr<GPUDisplay> gpu_display) { - bool need_denoise = denoising.need_denoising_task(); - - /* Lock buffers so no denoising operation is triggered while the settings are changed here. */ - thread_scoped_lock buffers_lock(buffers_mutex_); - params.denoising = denoising; - - if (!(params.device.denoisers & denoising.type)) { - if (need_denoise) { - progress.set_error("Denoiser type not supported by compute device"); - } - - params.denoising.use = false; - need_denoise = false; - } - - // TODO(pmours): Query the required overlap value for denoising from the device? 
- tile_manager.slice_overlap = need_denoise && !params.background ? 64 : 0; - - /* Schedule per tile denoising for final renders if we are either denoising or - * need prefiltered passes for the native denoiser. */ - tile_manager.schedule_denoising = need_denoise && !buffers; -} - -void Session::set_denoising_start_sample(int sample) -{ - if (sample != params.denoising.start_sample) { - params.denoising.start_sample = sample; - - pause_cond_.notify_all(); - } + path_trace_->set_gpu_display(move(gpu_display)); } void Session::wait() @@ -989,81 +520,67 @@ void Session::wait() delete session_thread_; } - session_thread_ = NULL; + session_thread_ = nullptr; } -bool Session::update_scene() +bool Session::update_scene(int width, int height) { - /* update camera if dimensions changed for progressive render. the camera + /* Update camera if dimensions changed for progressive render. the camera * knows nothing about progressive or cropped rendering, it just gets the - * image dimensions passed in */ + * image dimensions passed in. */ Camera *cam = scene->camera; - int width = tile_manager.state.buffer.full_width; - int height = tile_manager.state.buffer.full_height; - int resolution = tile_manager.state.resolution_divider; - - cam->set_screen_size_and_resolution(width, height, resolution); + cam->set_screen_size(width, height); - /* number of samples is needed by multi jittered - * sampling pattern and by baking */ - Integrator *integrator = scene->integrator; - BakeManager *bake_manager = scene->bake_manager; + /* First detect which kernel features are used and allocate working memory. + * This helps estimate how may device memory is available for the scene and + * how much we need to allocate on the host instead. 
*/ + scene->update_kernel_features(); - if (integrator->get_sampling_pattern() != SAMPLING_PATTERN_SOBOL || bake_manager->get_baking()) { - integrator->set_aa_samples(tile_manager.num_samples); - } + path_trace_->load_kernels(); + path_trace_->alloc_work_memory(); - bool kernel_switch_needed = false; - if (scene->update(progress, kernel_switch_needed)) { - if (kernel_switch_needed) { - reset(tile_manager.params, params.samples); - } + if (scene->update(progress)) { return true; } + return false; } +static string status_append(const string &status, const string &suffix) +{ + string prefix = status; + if (!prefix.empty()) { + prefix += ", "; + } + return prefix + suffix; +} + void Session::update_status_time(bool show_pause, bool show_done) { - int progressive_sample = tile_manager.state.sample; - int num_samples = tile_manager.get_num_effective_samples(); + string status, substatus; - int tile = progress.get_rendered_tiles(); - int num_tiles = tile_manager.state.num_tiles; + const int current_tile = progress.get_rendered_tiles(); + const int num_tiles = tile_manager_.get_num_tiles(); - /* update status */ - string status, substatus; + const int current_sample = progress.get_current_sample(); + const int num_samples = render_scheduler_.get_num_samples(); - if (!params.progressive) { - const bool is_cpu = params.device.type == DEVICE_CPU; - const bool rendering_finished = (tile == num_tiles); - const bool is_last_tile = (tile + 1) == num_tiles; - - substatus = string_printf("Rendered %d/%d Tiles", tile, num_tiles); - - if (!rendering_finished && (device->show_samples() || (is_cpu && is_last_tile))) { - /* Some devices automatically support showing the sample number: - * - CUDADevice - * - OpenCLDevice when using the megakernel (the split kernel renders multiple - * samples at the same time, so the current sample isn't really defined) - * - CPUDevice when using one thread - * For these devices, the current sample is always shown. 
- * - * The other option is when the last tile is currently being rendered by the CPU. - */ - substatus += string_printf(", Sample %d/%d", progress.get_current_sample(), num_samples); - } - if (params.denoising.use && params.denoising.type != DENOISER_OPENIMAGEDENOISE) { - substatus += string_printf(", Denoised %d tiles", progress.get_denoised_tiles()); - } - else if (params.denoising.store_passes && params.denoising.type == DENOISER_NLM) { - substatus += string_printf(", Prefiltered %d tiles", progress.get_denoised_tiles()); - } + /* TIle. */ + if (tile_manager_.has_multiple_tiles()) { + substatus = status_append(substatus, + string_printf("Rendered %d/%d Tiles", current_tile, num_tiles)); } - else if (tile_manager.num_samples == Integrator::MAX_SAMPLES) - substatus = string_printf("Path Tracing Sample %d", progressive_sample + 1); - else - substatus = string_printf("Path Tracing Sample %d/%d", progressive_sample + 1, num_samples); + + /* Sample. */ + if (num_samples == Integrator::MAX_SAMPLES) { + substatus = status_append(substatus, string_printf("Sample %d", current_sample)); + } + else { + substatus = status_append(substatus, + string_printf("Sample %d/%d", current_sample, num_samples)); + } + + /* TODO(sergey): Denoising status from the path trace. */ if (show_pause) { status = "Rendering Paused"; @@ -1080,210 +597,122 @@ void Session::update_status_time(bool show_pause, bool show_done) progress.set_status(status, substatus); } -bool Session::render_need_denoise(bool &delayed) +void Session::device_free() { - delayed = false; - - /* Not supported yet for baking. */ - if (read_bake_tile_cb) { - return false; - } - - /* Denoising enabled? */ - if (!params.denoising.need_denoising_task()) { - return false; - } - - if (params.background) { - /* Background render, only denoise when rendering the last sample. */ - return tile_manager.done(); - } - - /* Viewport render. */ - - /* It can happen that denoising was already enabled, but the scene still needs an update. 
*/ - if (scene->film->is_modified() || !scene->film->get_denoising_data_offset()) { - return false; - } + scene->device_free(); + path_trace_->device_free(); +} - /* Immediately denoise when we reach the start sample or last sample. */ - const int num_samples_finished = tile_manager.state.sample + 1; - if (num_samples_finished == params.denoising.start_sample || - num_samples_finished == params.samples) { - return true; +void Session::collect_statistics(RenderStats *render_stats) +{ + scene->collect_statistics(render_stats); + if (params.use_profiling && (params.device.type == DEVICE_CPU)) { + render_stats->collect_profiling(scene, profiler); } +} - /* Do not denoise until the sample at which denoising should start is reached. */ - if (num_samples_finished < params.denoising.start_sample) { - return false; - } +/* -------------------------------------------------------------------- + * Tile and tile pixels aceess. + */ - /* Avoid excessive denoising in viewport after reaching a certain amount of samples. */ - delayed = (tile_manager.state.sample >= 20 && - (time_dt() - last_display_time_) < params.progressive_update_timeout); - return !delayed; +bool Session::has_multiple_render_tiles() const +{ + return tile_manager_.has_multiple_tiles(); } -void Session::render(bool need_denoise) +int2 Session::get_render_tile_size() const { - if (buffers && tile_manager.state.sample == tile_manager.range_start_sample) { - /* Clear buffers. */ - buffers->zero(); - } - - if (tile_manager.state.buffer.width == 0 || tile_manager.state.buffer.height == 0) { - return; /* Avoid empty launches. */ - } + return path_trace_->get_render_tile_size(); +} - /* Add path trace task. 
*/ - DeviceTask task(DeviceTask::RENDER); - - task.acquire_tile = function_bind(&Session::acquire_tile, this, _2, _1, _3); - task.release_tile = function_bind(&Session::release_tile, this, _1, need_denoise); - task.map_neighbor_tiles = function_bind(&Session::map_neighbor_tiles, this, _1, _2); - task.unmap_neighbor_tiles = function_bind(&Session::unmap_neighbor_tiles, this, _1, _2); - task.get_cancel = function_bind(&Progress::get_cancel, &this->progress); - task.update_tile_sample = function_bind(&Session::update_tile_sample, this, _1); - task.update_progress_sample = function_bind(&Progress::add_samples, &this->progress, _1, _2); - task.get_tile_stolen = function_bind(&Session::get_tile_stolen, this); - task.need_finish_queue = params.progressive_refine; - task.integrator_branched = scene->integrator->get_method() == Integrator::BRANCHED_PATH; - - task.adaptive_sampling.use = (scene->integrator->get_sampling_pattern() == - SAMPLING_PATTERN_PMJ) && - scene->dscene.data.film.pass_adaptive_aux_buffer; - task.adaptive_sampling.min_samples = scene->dscene.data.integrator.adaptive_min_samples; - task.adaptive_sampling.adaptive_step = scene->dscene.data.integrator.adaptive_step; - - /* Acquire render tiles by default. */ - task.tile_types = RenderTile::PATH_TRACE; - - if (need_denoise) { - task.denoising = params.denoising; - - task.pass_stride = scene->film->get_pass_stride(); - task.target_pass_stride = task.pass_stride; - task.pass_denoising_data = scene->film->get_denoising_data_offset(); - task.pass_denoising_clean = scene->film->get_denoising_clean_offset(); - - task.denoising_from_render = true; - - if (tile_manager.schedule_denoising) { - /* Acquire denoising tiles during rendering. */ - task.tile_types |= RenderTile::DENOISE; - } - else { - assert(buffers); - - /* Schedule rendering and wait for it to finish. */ - device->task_add(task); - device->task_wait(); - - /* Then run denoising on the whole image at once. 
*/ - task.type = DeviceTask::DENOISE_BUFFER; - task.x = tile_manager.state.buffer.full_x; - task.y = tile_manager.state.buffer.full_y; - task.w = tile_manager.state.buffer.width; - task.h = tile_manager.state.buffer.height; - task.buffer = buffers->buffer.device_pointer; - task.sample = tile_manager.state.sample; - task.num_samples = tile_manager.state.num_samples; - tile_manager.state.buffer.get_offset_stride(task.offset, task.stride); - task.buffers = buffers; - } - } +int2 Session::get_render_tile_offset() const +{ + return path_trace_->get_render_tile_offset(); +} - device->task_add(task); +string_view Session::get_render_tile_layer() const +{ + const BufferParams &buffer_params = path_trace_->get_render_tile_params(); + return buffer_params.layer; } -void Session::copy_to_display_buffer(int sample) +string_view Session::get_render_tile_view() const { - /* add film conversion task */ - DeviceTask task(DeviceTask::FILM_CONVERT); - - task.x = tile_manager.state.buffer.full_x; - task.y = tile_manager.state.buffer.full_y; - task.w = tile_manager.state.buffer.width; - task.h = tile_manager.state.buffer.height; - task.rgba_byte = display->rgba_byte.device_pointer; - task.rgba_half = display->rgba_half.device_pointer; - task.buffer = buffers->buffer.device_pointer; - task.sample = sample; - tile_manager.state.buffer.get_offset_stride(task.offset, task.stride); - - if (task.w > 0 && task.h > 0) { - device->task_add(task); - device->task_wait(); - - /* set display to new size */ - display->draw_set(task.w, task.h); - - last_display_time_ = time_dt(); - } + const BufferParams &buffer_params = path_trace_->get_render_tile_params(); + return buffer_params.view; +} - display_outdated_ = false; +bool Session::copy_render_tile_from_device() +{ + return path_trace_->copy_render_tile_from_device(); } -bool Session::update_progressive_refine(bool cancel) +bool Session::get_render_tile_pixels(const string &pass_name, int num_components, float *pixels) { - int sample = 
tile_manager.state.sample + 1; - bool write = sample == tile_manager.num_samples || cancel; + /* NOTE: The code relies on a fact that session is fully update and no scene/buffer modification + * is happenning while this function runs. */ - double current_time = time_dt(); + const BufferParams &buffer_params = path_trace_->get_render_tile_params(); - if (current_time - last_update_time_ < params.progressive_update_timeout) { - /* If last sample was processed, we need to write buffers anyway. */ - if (!write && sample != 1) - return false; + const BufferPass *pass = buffer_params.find_pass(pass_name); + if (pass == nullptr) { + return false; } - if (params.progressive_refine) { - foreach (Tile &tile, tile_manager.state.tiles) { - if (!tile.buffers) { - continue; - } - - RenderTile rtile; - rtile.x = tile_manager.state.buffer.full_x + tile.x; - rtile.y = tile_manager.state.buffer.full_y + tile.y; - rtile.w = tile.w; - rtile.h = tile.h; - rtile.sample = sample; - rtile.buffers = tile.buffers; - - if (write) { - if (write_render_tile_cb) - write_render_tile_cb(rtile); - } - else { - if (update_render_tile_cb) - update_render_tile_cb(rtile, true); - } + const bool has_denoised_result = path_trace_->has_denoised_result(); + if (pass->mode == PassMode::DENOISED && !has_denoised_result) { + pass = buffer_params.find_pass(pass->type); + if (pass == nullptr) { + /* Happens when denoised result pass is requested but is never written by the kernel. 
*/ + return false; } } - last_update_time_ = current_time; + pass = buffer_params.get_actual_display_pass(pass); + + const float exposure = buffer_params.exposure; + const int num_samples = path_trace_->get_num_render_tile_samples(); - return write; + PassAccessor::PassAccessInfo pass_access_info(*pass); + pass_access_info.use_approximate_shadow_catcher = buffer_params.use_approximate_shadow_catcher; + pass_access_info.use_approximate_shadow_catcher_background = + pass_access_info.use_approximate_shadow_catcher && !buffer_params.use_transparent_background; + + const PassAccessorCPU pass_accessor(pass_access_info, exposure, num_samples); + const PassAccessor::Destination destination(pixels, num_components); + + return path_trace_->get_render_tile_pixels(pass_accessor, destination); } -void Session::device_free() +bool Session::set_render_tile_pixels(const string &pass_name, + int num_components, + const float *pixels) { - scene->device_free(); + /* NOTE: The code relies on a fact that session is fully update and no scene/buffer modification + * is happenning while this function runs. */ + + const BufferPass *pass = buffer_params_.find_pass(pass_name); + if (!pass) { + return false; + } + + const float exposure = scene->film->get_exposure(); + const int num_samples = render_scheduler_.get_num_rendered_samples(); - tile_manager.device_free(); + const PassAccessor::PassAccessInfo pass_access_info(*pass); + PassAccessorCPU pass_accessor(pass_access_info, exposure, num_samples); + PassAccessor::Source source(pixels, num_components); - /* used from background render only, so no need to - * re-create render/display buffers here - */ + return path_trace_->set_render_tile_pixels(pass_accessor, source); } -void Session::collect_statistics(RenderStats *render_stats) +/* -------------------------------------------------------------------- + * Full-frame on-disk storage. 
+ */ + +void Session::process_full_buffer_from_disk(string_view filename) { - scene->collect_statistics(render_stats); - if (params.use_profiling && (params.device.type == DEVICE_CPU)) { - render_stats->collect_profiling(scene, profiler); - } + path_trace_->process_full_buffer_from_disk(filename); } CCL_NAMESPACE_END diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h index 05025c10f9c..492cfdd1c09 100644 --- a/intern/cycles/render/session.h +++ b/intern/cycles/render/session.h @@ -18,6 +18,7 @@ #define __SESSION_H__ #include "device/device.h" +#include "integrator/render_scheduler.h" #include "render/buffers.h" #include "render/shader.h" #include "render/stats.h" @@ -26,6 +27,7 @@ #include "util/util_progress.h" #include "util/util_stats.h" #include "util/util_thread.h" +#include "util/util_unique_ptr.h" #include "util/util_vector.h" CCL_NAMESPACE_BEGIN @@ -33,41 +35,35 @@ CCL_NAMESPACE_BEGIN class BufferParams; class Device; class DeviceScene; -class DeviceRequestedFeatures; -class DisplayBuffer; +class PathTrace; class Progress; +class GPUDisplay; class RenderBuffers; class Scene; +class SceneParams; /* Session Parameters */ class SessionParams { public: DeviceInfo device; + + bool headless; bool background; - bool progressive_refine; - bool progressive; bool experimental; int samples; - int2 tile_size; - TileOrder tile_order; - int start_resolution; - int denoising_start_sample; int pixel_size; int threads; - bool adaptive_sampling; - - bool use_profiling; - bool display_buffer_linear; + /* Limit in seconds for how long path tracing is allowed to happen. + * Zero means no limit is applied. 
*/ + double time_limit; - DenoiseParams denoising; + bool use_profiling; - double cancel_timeout; - double reset_timeout; - double text_timeout; - double progressive_update_timeout; + bool use_auto_tile; + int tile_size; ShadingSystem shadingsystem; @@ -75,50 +71,32 @@ class SessionParams { SessionParams() { + headless = false; background = false; - progressive_refine = false; - progressive = false; experimental = false; samples = 1024; - tile_size = make_int2(64, 64); - start_resolution = INT_MAX; - denoising_start_sample = 0; pixel_size = 1; threads = 0; - adaptive_sampling = false; + time_limit = 0.0; use_profiling = false; - display_buffer_linear = false; - - cancel_timeout = 0.1; - reset_timeout = 0.1; - text_timeout = 1.0; - progressive_update_timeout = 1.0; + use_auto_tile = true; + tile_size = 2048; shadingsystem = SHADINGSYSTEM_SVM; - tile_order = TILE_CENTER; } - bool modified(const SessionParams ¶ms) + bool modified(const SessionParams ¶ms) const { /* Modified means we have to recreate the session, any parameter changes * that can be handled by an existing Session are omitted. 
*/ - return !(device == params.device && background == params.background && - progressive_refine == params.progressive_refine && - progressive == params.progressive && experimental == params.experimental && - tile_size == params.tile_size && start_resolution == params.start_resolution && + return !(device == params.device && headless == params.headless && + background == params.background && experimental == params.experimental && pixel_size == params.pixel_size && threads == params.threads && - adaptive_sampling == params.adaptive_sampling && - use_profiling == params.use_profiling && - display_buffer_linear == params.display_buffer_linear && - cancel_timeout == params.cancel_timeout && reset_timeout == params.reset_timeout && - text_timeout == params.text_timeout && - progressive_update_timeout == params.progressive_update_timeout && - tile_order == params.tile_order && shadingsystem == params.shadingsystem && - denoising.type == params.denoising.type && - (denoising.use == params.denoising.use || (device.denoisers & denoising.type))); + use_profiling == params.use_profiling && shadingsystem == params.shadingsystem && + use_auto_tile == params.use_auto_tile && tile_size == params.tile_size); } }; @@ -131,34 +109,41 @@ class Session { public: Device *device; Scene *scene; - RenderBuffers *buffers; - DisplayBuffer *display; Progress progress; SessionParams params; - TileManager tile_manager; Stats stats; Profiler profiler; - function<void(RenderTile &)> write_render_tile_cb; - function<void(RenderTile &, bool)> update_render_tile_cb; - function<void(RenderTile &)> read_bake_tile_cb; + function<void(void)> write_render_tile_cb; + function<void(void)> update_render_tile_cb; + function<void(void)> read_render_tile_cb; + + /* Callback is invoked by tile manager whenever on-dist tiles storage file is closed after + * writing. 
Allows an engine integration to keep track of those files without worry about + * transfering the information when it needs to re-create session during rendering. */ + function<void(string_view)> full_buffer_written_cb; - explicit Session(const SessionParams ¶ms); + explicit Session(const SessionParams ¶ms, const SceneParams &scene_params); ~Session(); void start(); - void cancel(); - bool draw(BufferParams ¶ms, DeviceDrawParams &draw_params); + + /* When quick cancel is requested path tracing is cancelles as soon as possible, without waiting + * for the buffer to be uniformly sampled. */ + void cancel(bool quick = false); + + void draw(); void wait(); bool ready_to_reset(); - void reset(BufferParams ¶ms, int samples); + void reset(const SessionParams &session_params, const BufferParams &buffer_params); + void set_pause(bool pause); + void set_samples(int samples); - void set_denoising(const DenoiseParams &denoising); - void set_denoising_start_sample(int sample); + void set_time_limit(double time_limit); - bool update_scene(); + void set_gpu_display(unique_ptr<GPUDisplay> gpu_display); void device_free(); @@ -168,83 +153,95 @@ class Session { void collect_statistics(RenderStats *stats); - protected: - struct DelayedReset { - thread_mutex mutex; - bool do_reset; - BufferParams params; - int samples; - } delayed_reset_; + /* -------------------------------------------------------------------- + * Tile and tile pixels aceess. + */ - void run(); + bool has_multiple_render_tiles() const; - bool run_update_for_next_iteration(); - bool run_wait_for_work(bool no_tiles); + /* Get size and offset (relative to the buffer's full x/y) of the currently rendering tile. 
*/ + int2 get_render_tile_size() const; + int2 get_render_tile_offset() const; - void update_status_time(bool show_pause = false, bool show_done = false); + string_view get_render_tile_layer() const; + string_view get_render_tile_view() const; - void render(bool use_denoise); - void copy_to_display_buffer(int sample); + bool copy_render_tile_from_device(); - void reset_(BufferParams ¶ms, int samples); + bool get_render_tile_pixels(const string &pass_name, int num_components, float *pixels); + bool set_render_tile_pixels(const string &pass_name, int num_components, const float *pixels); - void run_cpu(); - bool draw_cpu(BufferParams ¶ms, DeviceDrawParams &draw_params); - void reset_cpu(BufferParams ¶ms, int samples); + /* -------------------------------------------------------------------- + * Full-frame on-disk storage. + */ - void run_gpu(); - bool draw_gpu(BufferParams ¶ms, DeviceDrawParams &draw_params); - void reset_gpu(BufferParams ¶ms, int samples); + /* Read given full-frame file from disk, perform needed processing and write it to the software + * via the write callback. */ + void process_full_buffer_from_disk(string_view filename); - bool render_need_denoise(bool &delayed); + protected: + struct DelayedReset { + thread_mutex mutex; + bool do_reset; + SessionParams session_params; + BufferParams buffer_params; + } delayed_reset_; - bool steal_tile(RenderTile &tile, Device *tile_device, thread_scoped_lock &tile_lock); - bool get_tile_stolen(); - bool acquire_tile(RenderTile &tile, Device *tile_device, uint tile_types); - void update_tile_sample(RenderTile &tile); - void release_tile(RenderTile &tile, const bool need_denoise); + void run(); - void map_neighbor_tiles(RenderTileNeighbors &neighbors, Device *tile_device); - void unmap_neighbor_tiles(RenderTileNeighbors &neighbors, Device *tile_device); + /* Update for the new iteration of the main loop in run implementation (run_cpu and run_gpu). 
+ * + * Will take care of the following things: + * - Delayed reset + * - Scene update + * - Tile manager advance + * - Render scheduler work request + * + * The updates are done in a proper order with proper locking around them, which guarantees + * that the device side of scene and render buffers are always in a consistent state. + * + * Returns render work which is to be rendered next. */ + RenderWork run_update_for_next_iteration(); + + /* Wait for rendering to be unpaused, or for new tiles for render to arrive. + * Returns true if new main render loop iteration is required after this function call. + * + * The `render_work` is the work which was scheduled by the render scheduler right before + * checking the pause. */ + bool run_wait_for_work(const RenderWork &render_work); + + void run_main_render_loop(); + + bool update_scene(int width, int height); - bool device_use_gl_; + void update_status_time(bool show_pause = false, bool show_done = false); - thread *session_thread_; + void do_delayed_reset(); - volatile bool display_outdated_; + int2 get_effective_tile_size() const; - volatile bool gpu_draw_ready_; - volatile bool gpu_need_display_buffer_update_; - thread_condition_variable gpu_need_display_buffer_update_cond_; + thread *session_thread_; - bool pause_; - bool cancel_; - bool new_work_added_; + bool pause_ = false; + bool cancel_ = false; + bool new_work_added_ = false; thread_condition_variable pause_cond_; thread_mutex pause_mutex_; thread_mutex tile_mutex_; thread_mutex buffers_mutex_; - thread_mutex display_mutex_; - thread_condition_variable denoising_cond_; - thread_condition_variable tile_steal_cond_; - - double reset_time_; - double last_update_time_; - double last_display_time_; - - RenderTile stolen_tile_; - typedef enum { - NOT_STEALING, /* There currently is no tile stealing in progress. */ - WAITING_FOR_TILE, /* A device is waiting for another device to release a tile. */ - RELEASING_TILE, /* A device has releasing a stealable tile. 
*/ - GOT_TILE /* A device has released a stealable tile, which is now stored in stolen_tile. */ - } TileStealingState; - std::atomic<TileStealingState> tile_stealing_state_; - int stealable_tiles_; - - /* progressive refine */ - bool update_progressive_refine(bool cancel); + + TileManager tile_manager_; + BufferParams buffer_params_; + + /* Render scheduler is used to get work to be rendered with the current big tile. */ + RenderScheduler render_scheduler_; + + /* Path tracer object. + * + * Is a single full-frame path tracer for interactive viewport rendering. + * A path tracer for the current big-tile for an offline rendering. */ + unique_ptr<PathTrace> path_trace_; }; CCL_NAMESPACE_END diff --git a/intern/cycles/render/shader.cpp b/intern/cycles/render/shader.cpp index 59b60904746..f6b23606e58 100644 --- a/intern/cycles/render/shader.cpp +++ b/intern/cycles/render/shader.cpp @@ -203,6 +203,7 @@ Shader::Shader() : Node(get_node_type()) has_surface = false; has_surface_transparent = false; has_surface_emission = false; + has_surface_raytrace = false; has_surface_bssrdf = false; has_volume = false; has_displacement = false; @@ -485,7 +486,7 @@ void ShaderManager::device_update(Device *device, device_update_specific(device, dscene, scene, progress); } -void ShaderManager::device_update_common(Device *device, +void ShaderManager::device_update_common(Device * /*device*/, DeviceScene *dscene, Scene *scene, Progress & /*progress*/) @@ -508,6 +509,8 @@ void ShaderManager::device_update_common(Device *device, flag |= SD_HAS_EMISSION; if (shader->has_surface_transparent && shader->get_use_transparent_shadow()) flag |= SD_HAS_TRANSPARENT_SHADOW; + if (shader->has_surface_raytrace) + flag |= SD_HAS_RAYTRACE; if (shader->has_volume) { flag |= SD_HAS_VOLUME; has_volumes = true; @@ -528,12 +531,10 @@ void ShaderManager::device_update_common(Device *device, flag |= SD_NEED_VOLUME_ATTRIBUTES; if (shader->has_bssrdf_bump) flag |= SD_HAS_BSSRDF_BUMP; - if 
(device->info.has_volume_decoupled) { - if (shader->get_volume_sampling_method() == VOLUME_SAMPLING_EQUIANGULAR) - flag |= SD_VOLUME_EQUIANGULAR; - if (shader->get_volume_sampling_method() == VOLUME_SAMPLING_MULTIPLE_IMPORTANCE) - flag |= SD_VOLUME_MIS; - } + if (shader->get_volume_sampling_method() == VOLUME_SAMPLING_EQUIANGULAR) + flag |= SD_VOLUME_EQUIANGULAR; + if (shader->get_volume_sampling_method() == VOLUME_SAMPLING_MULTIPLE_IMPORTANCE) + flag |= SD_VOLUME_MIS; if (shader->get_volume_interpolation_method() == VOLUME_INTERPOLATION_CUBIC) flag |= SD_VOLUME_CUBIC; if (shader->has_bump) @@ -682,39 +683,35 @@ void ShaderManager::add_default(Scene *scene) } } -void ShaderManager::get_requested_graph_features(ShaderGraph *graph, - DeviceRequestedFeatures *requested_features) +uint ShaderManager::get_graph_kernel_features(ShaderGraph *graph) { + uint kernel_features = 0; + foreach (ShaderNode *node, graph->nodes) { - requested_features->max_nodes_group = max(requested_features->max_nodes_group, - node->get_group()); - requested_features->nodes_features |= node->get_feature(); + kernel_features |= node->get_feature(); if (node->special_type == SHADER_SPECIAL_TYPE_CLOSURE) { BsdfBaseNode *bsdf_node = static_cast<BsdfBaseNode *>(node); if (CLOSURE_IS_VOLUME(bsdf_node->get_closure_type())) { - requested_features->nodes_features |= NODE_FEATURE_VOLUME; + kernel_features |= KERNEL_FEATURE_NODE_VOLUME; } else if (CLOSURE_IS_PRINCIPLED(bsdf_node->get_closure_type())) { - requested_features->use_principled = true; + kernel_features |= KERNEL_FEATURE_PRINCIPLED; } } if (node->has_surface_bssrdf()) { - requested_features->use_subsurface = true; + kernel_features |= KERNEL_FEATURE_SUBSURFACE; } if (node->has_surface_transparent()) { - requested_features->use_transparent = true; - } - if (node->has_raytrace()) { - requested_features->use_shader_raytrace = true; + kernel_features |= KERNEL_FEATURE_TRANSPARENT; } } + + return kernel_features; } -void 
ShaderManager::get_requested_features(Scene *scene, - DeviceRequestedFeatures *requested_features) +uint ShaderManager::get_kernel_features(Scene *scene) { - requested_features->max_nodes_group = NODE_GROUP_LEVEL_0; - requested_features->nodes_features = 0; + uint kernel_features = KERNEL_FEATURE_NODE_BSDF | KERNEL_FEATURE_NODE_EMISSION; for (int i = 0; i < scene->shaders.size(); i++) { Shader *shader = scene->shaders[i]; if (!shader->reference_count()) { @@ -722,21 +719,22 @@ void ShaderManager::get_requested_features(Scene *scene, } /* Gather requested features from all the nodes from the graph nodes. */ - get_requested_graph_features(shader->graph, requested_features); + kernel_features |= get_graph_kernel_features(shader->graph); ShaderNode *output_node = shader->graph->output(); if (output_node->input("Displacement")->link != NULL) { - requested_features->nodes_features |= NODE_FEATURE_BUMP; + kernel_features |= KERNEL_FEATURE_NODE_BUMP; if (shader->get_displacement_method() == DISPLACE_BOTH) { - requested_features->nodes_features |= NODE_FEATURE_BUMP_STATE; - requested_features->max_nodes_group = max(requested_features->max_nodes_group, - NODE_GROUP_LEVEL_1); + kernel_features |= KERNEL_FEATURE_NODE_BUMP_STATE; } } /* On top of volume nodes, also check if we need volume sampling because - * e.g. an Emission node would slip through the NODE_FEATURE_VOLUME check */ - if (shader->has_volume) - requested_features->use_volume |= true; + * e.g. 
an Emission node would slip through the KERNEL_FEATURE_NODE_VOLUME check */ + if (shader->has_volume) { + kernel_features |= KERNEL_FEATURE_VOLUME; + } } + + return kernel_features; } void ShaderManager::free_memory() diff --git a/intern/cycles/render/shader.h b/intern/cycles/render/shader.h index c65cac351a4..5f9adea3949 100644 --- a/intern/cycles/render/shader.h +++ b/intern/cycles/render/shader.h @@ -38,7 +38,6 @@ CCL_NAMESPACE_BEGIN class Device; class DeviceScene; -class DeviceRequestedFeatures; class Mesh; class Progress; class Scene; @@ -117,6 +116,7 @@ class Shader : public Node { bool has_surface; bool has_surface_emission; bool has_surface_transparent; + bool has_surface_raytrace; bool has_volume; bool has_displacement; bool has_surface_bssrdf; @@ -216,7 +216,7 @@ class ShaderManager { static void add_default(Scene *scene); /* Selective nodes compilation. */ - void get_requested_features(Scene *scene, DeviceRequestedFeatures *requested_features); + uint get_kernel_features(Scene *scene); static void free_memory(); @@ -244,8 +244,7 @@ class ShaderManager { size_t beckmann_table_offset; - void get_requested_graph_features(ShaderGraph *graph, - DeviceRequestedFeatures *requested_features); + uint get_graph_kernel_features(ShaderGraph *graph); thread_spin_lock attribute_lock_; diff --git a/intern/cycles/render/stats.cpp b/intern/cycles/render/stats.cpp index 2c6273842e2..73eb7e21ff9 100644 --- a/intern/cycles/render/stats.cpp +++ b/intern/cycles/render/stats.cpp @@ -264,53 +264,34 @@ void RenderStats::collect_profiling(Scene *scene, Profiler &prof) has_profiling = true; kernel = NamedNestedSampleStats("Total render time", prof.get_event(PROFILING_UNKNOWN)); - kernel.add_entry("Ray setup", prof.get_event(PROFILING_RAY_SETUP)); - kernel.add_entry("Result writing", prof.get_event(PROFILING_WRITE_RESULT)); - - NamedNestedSampleStats &integrator = kernel.add_entry("Path integration", - prof.get_event(PROFILING_PATH_INTEGRATE)); - integrator.add_entry("Scene 
intersection", prof.get_event(PROFILING_SCENE_INTERSECT)); - integrator.add_entry("Indirect emission", prof.get_event(PROFILING_INDIRECT_EMISSION)); - integrator.add_entry("Volumes", prof.get_event(PROFILING_VOLUME)); - - NamedNestedSampleStats &shading = integrator.add_entry("Shading", 0); - shading.add_entry("Shader Setup", prof.get_event(PROFILING_SHADER_SETUP)); - shading.add_entry("Shader Eval", prof.get_event(PROFILING_SHADER_EVAL)); - shading.add_entry("Shader Apply", prof.get_event(PROFILING_SHADER_APPLY)); - shading.add_entry("Ambient Occlusion", prof.get_event(PROFILING_AO)); - shading.add_entry("Subsurface", prof.get_event(PROFILING_SUBSURFACE)); - - integrator.add_entry("Connect Light", prof.get_event(PROFILING_CONNECT_LIGHT)); - integrator.add_entry("Surface Bounce", prof.get_event(PROFILING_SURFACE_BOUNCE)); - - NamedNestedSampleStats &intersection = kernel.add_entry("Intersection", 0); - intersection.add_entry("Full Intersection", prof.get_event(PROFILING_INTERSECT)); - intersection.add_entry("Local Intersection", prof.get_event(PROFILING_INTERSECT_LOCAL)); - intersection.add_entry("Shadow All Intersection", - prof.get_event(PROFILING_INTERSECT_SHADOW_ALL)); - intersection.add_entry("Volume Intersection", prof.get_event(PROFILING_INTERSECT_VOLUME)); - intersection.add_entry("Volume All Intersection", - prof.get_event(PROFILING_INTERSECT_VOLUME_ALL)); - - NamedNestedSampleStats &closure = kernel.add_entry("Closures", 0); - closure.add_entry("Surface Closure Evaluation", prof.get_event(PROFILING_CLOSURE_EVAL)); - closure.add_entry("Surface Closure Sampling", prof.get_event(PROFILING_CLOSURE_SAMPLE)); - closure.add_entry("Volume Closure Evaluation", prof.get_event(PROFILING_CLOSURE_VOLUME_EVAL)); - closure.add_entry("Volume Closure Sampling", prof.get_event(PROFILING_CLOSURE_VOLUME_SAMPLE)); - - NamedNestedSampleStats &denoising = kernel.add_entry("Denoising", - prof.get_event(PROFILING_DENOISING)); - denoising.add_entry("Construct Transform", - 
prof.get_event(PROFILING_DENOISING_CONSTRUCT_TRANSFORM)); - denoising.add_entry("Reconstruct", prof.get_event(PROFILING_DENOISING_RECONSTRUCT)); - - NamedNestedSampleStats &prefilter = denoising.add_entry("Prefiltering", 0); - prefilter.add_entry("Divide Shadow", prof.get_event(PROFILING_DENOISING_DIVIDE_SHADOW)); - prefilter.add_entry("Non-Local means", prof.get_event(PROFILING_DENOISING_NON_LOCAL_MEANS)); - prefilter.add_entry("Get Feature", prof.get_event(PROFILING_DENOISING_GET_FEATURE)); - prefilter.add_entry("Detect Outliers", prof.get_event(PROFILING_DENOISING_DETECT_OUTLIERS)); - prefilter.add_entry("Combine Halves", prof.get_event(PROFILING_DENOISING_COMBINE_HALVES)); + kernel.add_entry("Intersect Closest", prof.get_event(PROFILING_INTERSECT_CLOSEST)); + kernel.add_entry("Intersect Shadow", prof.get_event(PROFILING_INTERSECT_SHADOW)); + kernel.add_entry("Intersect Subsurface", prof.get_event(PROFILING_INTERSECT_SUBSURFACE)); + kernel.add_entry("Intersect Volume Stack", prof.get_event(PROFILING_INTERSECT_VOLUME_STACK)); + + NamedNestedSampleStats &surface = kernel.add_entry("Shade Surface", 0); + surface.add_entry("Setup", prof.get_event(PROFILING_SHADE_SURFACE_SETUP)); + surface.add_entry("Shader Evaluation", prof.get_event(PROFILING_SHADE_SURFACE_EVAL)); + surface.add_entry("Render Passes", prof.get_event(PROFILING_SHADE_SURFACE_PASSES)); + surface.add_entry("Direct Light", prof.get_event(PROFILING_SHADE_SURFACE_DIRECT_LIGHT)); + surface.add_entry("Indirect Light", prof.get_event(PROFILING_SHADE_SURFACE_INDIRECT_LIGHT)); + surface.add_entry("Ambient Occlusion", prof.get_event(PROFILING_SHADE_SURFACE_AO)); + + NamedNestedSampleStats &volume = kernel.add_entry("Shade Volume", 0); + volume.add_entry("Setup", prof.get_event(PROFILING_SHADE_VOLUME_SETUP)); + volume.add_entry("Integrate", prof.get_event(PROFILING_SHADE_VOLUME_INTEGRATE)); + volume.add_entry("Direct Light", prof.get_event(PROFILING_SHADE_VOLUME_DIRECT_LIGHT)); + volume.add_entry("Indirect 
Light", prof.get_event(PROFILING_SHADE_VOLUME_INDIRECT_LIGHT)); + + NamedNestedSampleStats &shadow = kernel.add_entry("Shade Shadow", 0); + shadow.add_entry("Setup", prof.get_event(PROFILING_SHADE_SHADOW_SETUP)); + shadow.add_entry("Surface", prof.get_event(PROFILING_SHADE_SHADOW_SURFACE)); + shadow.add_entry("Volume", prof.get_event(PROFILING_SHADE_SHADOW_VOLUME)); + + NamedNestedSampleStats &light = kernel.add_entry("Shade Light", 0); + light.add_entry("Setup", prof.get_event(PROFILING_SHADE_LIGHT_SETUP)); + light.add_entry("Shader Evaluation", prof.get_event(PROFILING_SHADE_LIGHT_EVAL)); shaders.entries.clear(); foreach (Shader *shader, scene->shaders) { diff --git a/intern/cycles/render/svm.cpp b/intern/cycles/render/svm.cpp index dcb3976e15c..2379eb775a0 100644 --- a/intern/cycles/render/svm.cpp +++ b/intern/cycles/render/svm.cpp @@ -446,6 +446,8 @@ void SVMCompiler::generate_node(ShaderNode *node, ShaderNodeSet &done) if (current_type == SHADER_TYPE_SURFACE) { if (node->has_spatial_varying()) current_shader->has_surface_spatial_varying = true; + if (node->get_feature() & KERNEL_FEATURE_NODE_RAYTRACE) + current_shader->has_surface_raytrace = true; } else if (current_type == SHADER_TYPE_VOLUME) { if (node->has_spatial_varying()) @@ -492,6 +494,13 @@ void SVMCompiler::generate_svm_nodes(const ShaderNodeSet &nodes, CompilerState * void SVMCompiler::generate_closure_node(ShaderNode *node, CompilerState *state) { + /* Skip generating closure that are not supported or needed for a particular + * type of shader. For example a BSDF in a volume shader. 
*/ + const int node_feature = node->get_feature(); + if ((state->node_feature_mask & node_feature) != node_feature) { + return; + } + /* execute dependencies for closure */ foreach (ShaderInput *in, node->inputs) { if (in->link != NULL) { @@ -555,7 +564,7 @@ void SVMCompiler::find_aov_nodes_and_dependencies(ShaderNodeSet &aov_nodes, foreach (ShaderNode *node, graph->nodes) { if (node->special_type == SHADER_SPECIAL_TYPE_OUTPUT_AOV) { OutputAOVNode *aov_node = static_cast<OutputAOVNode *>(node); - if (aov_node->slot >= 0) { + if (aov_node->offset >= 0) { aov_nodes.insert(aov_node); foreach (ShaderInput *in, node->inputs) { if (in->link != NULL) { @@ -785,17 +794,21 @@ void SVMCompiler::compile_type(Shader *shader, ShaderGraph *graph, ShaderType ty case SHADER_TYPE_SURFACE: /* generate surface shader */ generate = true; shader->has_surface = true; + state.node_feature_mask = KERNEL_FEATURE_NODE_MASK_SURFACE; break; case SHADER_TYPE_VOLUME: /* generate volume shader */ generate = true; shader->has_volume = true; + state.node_feature_mask = KERNEL_FEATURE_NODE_MASK_VOLUME; break; case SHADER_TYPE_DISPLACEMENT: /* generate displacement shader */ generate = true; shader->has_displacement = true; + state.node_feature_mask = KERNEL_FEATURE_NODE_MASK_DISPLACEMENT; break; case SHADER_TYPE_BUMP: /* generate bump shader */ generate = true; + state.node_feature_mask = KERNEL_FEATURE_NODE_MASK_BUMP; break; default: break; @@ -867,6 +880,7 @@ void SVMCompiler::compile(Shader *shader, array<int4> &svm_nodes, int index, Sum shader->has_surface = false; shader->has_surface_emission = false; shader->has_surface_transparent = false; + shader->has_surface_raytrace = false; shader->has_surface_bssrdf = false; shader->has_bump = has_bump; shader->has_bssrdf_bump = has_bump; @@ -964,6 +978,7 @@ SVMCompiler::CompilerState::CompilerState(ShaderGraph *graph) max_id = max(node->id, max_id); } nodes_done_flag.resize(max_id + 1, false); + node_feature_mask = 0; } CCL_NAMESPACE_END diff --git 
a/intern/cycles/render/svm.h b/intern/cycles/render/svm.h index d23ff3e2a47..0353c393ae4 100644 --- a/intern/cycles/render/svm.h +++ b/intern/cycles/render/svm.h @@ -192,6 +192,9 @@ class SVMCompiler { * all areas to use this flags array. */ vector<bool> nodes_done_flag; + + /* Node features that can be compiled. */ + uint node_feature_mask; }; void stack_clear_temporary(ShaderNode *node); diff --git a/intern/cycles/render/tile.cpp b/intern/cycles/render/tile.cpp index 375c9fd8e09..eed75cc2372 100644 --- a/intern/cycles/render/tile.cpp +++ b/intern/cycles/render/tile.cpp @@ -16,601 +16,559 @@ #include "render/tile.h" +#include <atomic> + +#include "graph/node.h" +#include "render/background.h" +#include "render/film.h" +#include "render/integrator.h" +#include "render/scene.h" #include "util/util_algorithm.h" #include "util/util_foreach.h" +#include "util/util_logging.h" +#include "util/util_path.h" +#include "util/util_string.h" +#include "util/util_system.h" #include "util/util_types.h" CCL_NAMESPACE_BEGIN -namespace { +/* -------------------------------------------------------------------- + * Internal functions. + */ -class TileComparator { - public: - TileComparator(TileOrder order_, int2 center_, Tile *tiles_) - : order(order_), center(center_), tiles(tiles_) - { - } +static const char *ATTR_PASSES_COUNT = "cycles.passes.count"; +static const char *ATTR_PASS_SOCKET_PREFIX_FORMAT = "cycles.passes.%d."; +static const char *ATTR_BUFFER_SOCKET_PREFIX = "cycles.buffer."; +static const char *ATTR_DENOISE_SOCKET_PREFIX = "cycles.denoise."; - bool operator()(int a, int b) - { - switch (order) { - case TILE_CENTER: { - float2 dist_a = make_float2(center.x - (tiles[a].x + tiles[a].w / 2), - center.y - (tiles[a].y + tiles[a].h / 2)); - float2 dist_b = make_float2(center.x - (tiles[b].x + tiles[b].w / 2), - center.y - (tiles[b].y + tiles[b].h / 2)); - return dot(dist_a, dist_a) < dot(dist_b, dist_b); - } - case TILE_LEFT_TO_RIGHT: - return (tiles[a].x == tiles[b].x) ? 
(tiles[a].y < tiles[b].y) : (tiles[a].x < tiles[b].x); - case TILE_RIGHT_TO_LEFT: - return (tiles[a].x == tiles[b].x) ? (tiles[a].y < tiles[b].y) : (tiles[a].x > tiles[b].x); - case TILE_TOP_TO_BOTTOM: - return (tiles[a].y == tiles[b].y) ? (tiles[a].x < tiles[b].x) : (tiles[a].y > tiles[b].y); - case TILE_BOTTOM_TO_TOP: - default: - return (tiles[a].y == tiles[b].y) ? (tiles[a].x < tiles[b].x) : (tiles[a].y < tiles[b].y); +/* Global counter of ToleManager object instances. */ +static std::atomic<uint64_t> g_instance_index = 0; + +/* Construct names of EXR channels which will ensure order of all channels to match exact offsets + * in render buffers corresponding to the given passes. + * + * Returns `std` datatypes so that it can be assigned directly to the OIIO's `ImageSpec`. */ +static std::vector<std::string> exr_channel_names_for_passes(const BufferParams &buffer_params) +{ + static const char *component_suffixes[] = {"R", "G", "B", "A"}; + + int pass_index = 0; + int num_channels = 0; + std::vector<std::string> channel_names; + for (const BufferPass &pass : buffer_params.passes) { + if (pass.offset == PASS_UNUSED) { + continue; } - } - protected: - TileOrder order; - int2 center; - Tile *tiles; -}; + const PassInfo pass_info = pass.get_info(); + num_channels += pass_info.num_components; -inline int2 hilbert_index_to_pos(int n, int d) -{ - int2 r, xy = make_int2(0, 0); - for (int s = 1; s < n; s *= 2) { - r.x = (d >> 1) & 1; - r.y = (d ^ r.x) & 1; - if (!r.y) { - if (r.x) { - xy = make_int2(s - 1, s - 1) - xy; - } - swap(xy.x, xy.y); + /* EXR canonically expects first part of channel names to be sorted alphabetically, which is + * not guaranteed to be the case with passes names. Assign a prefix based on the pass index + * with a fixed width to ensure ordering. This makes it possible to dump existing render + * buffers memory to disk and read it back without doing extra mapping. 
*/ + const string prefix = string_printf("%08d", pass_index); + + const string channel_name_prefix = prefix + string(pass.name) + "."; + + for (int i = 0; i < pass_info.num_components; ++i) { + channel_names.push_back(channel_name_prefix + component_suffixes[i]); } - xy += r * make_int2(s, s); - d >>= 2; + + ++pass_index; } - return xy; + + return channel_names; } -enum SpiralDirection { - DIRECTION_UP, - DIRECTION_LEFT, - DIRECTION_DOWN, - DIRECTION_RIGHT, -}; - -} /* namespace */ - -TileManager::TileManager(bool progressive_, - int num_samples_, - int2 tile_size_, - int start_resolution_, - bool preserve_tile_device_, - bool background_, - TileOrder tile_order_, - int num_devices_, - int pixel_size_) +inline string node_socket_attribute_name(const SocketType &socket, const string &attr_name_prefix) { - progressive = progressive_; - tile_size = tile_size_; - tile_order = tile_order_; - start_resolution = start_resolution_; - pixel_size = pixel_size_; - slice_overlap = 0; - num_samples = num_samples_; - num_devices = num_devices_; - preserve_tile_device = preserve_tile_device_; - background = background_; - schedule_denoising = false; - - range_start_sample = 0; - range_num_samples = -1; - - BufferParams buffer_params; - reset(buffer_params, 0); + return attr_name_prefix + string(socket.name); } -TileManager::~TileManager() +template<typename ValidateValueFunc, typename GetValueFunc> +static bool node_socket_generic_to_image_spec_atttributes( + ImageSpec *image_spec, + const Node *node, + const SocketType &socket, + const string &attr_name_prefix, + const ValidateValueFunc &validate_value_func, + const GetValueFunc &get_value_func) { + if (!validate_value_func(node, socket)) { + return false; + } + + image_spec->attribute(node_socket_attribute_name(socket, attr_name_prefix), + get_value_func(node, socket)); + + return true; } -void TileManager::device_free() +static bool node_socket_to_image_spec_atttributes(ImageSpec *image_spec, + const Node *node, + const 
SocketType &socket, + const string &attr_name_prefix) { - if (schedule_denoising || progressive) { - for (int i = 0; i < state.tiles.size(); i++) { - delete state.tiles[i].buffers; - state.tiles[i].buffers = NULL; + const string attr_name = node_socket_attribute_name(socket, attr_name_prefix); + + switch (socket.type) { + case SocketType::ENUM: { + const ustring value = node->get_string(socket); + + /* Validate that the node is consistent with the node type definition. */ + const NodeEnum &enum_values = *socket.enum_values; + if (!enum_values.exists(value)) { + LOG(DFATAL) << "Node enum contains invalid value " << value; + return false; + } + + image_spec->attribute(attr_name, value); + + return true; } - } - state.tiles.clear(); + case SocketType::STRING: + image_spec->attribute(attr_name, node->get_string(socket)); + return true; + + case SocketType::INT: + image_spec->attribute(attr_name, node->get_int(socket)); + return true; + + case SocketType::FLOAT: + image_spec->attribute(attr_name, node->get_float(socket)); + return true; + + case SocketType::BOOLEAN: + image_spec->attribute(attr_name, node->get_bool(socket)); + return true; + + default: + LOG(DFATAL) << "Unhandled socket type " << socket.type << ", should never happen."; + return false; + } } -static int get_divider(int w, int h, int start_resolution) +static bool node_socket_from_image_spec_atttributes(Node *node, + const SocketType &socket, + const ImageSpec &image_spec, + const string &attr_name_prefix) { - int divider = 1; - if (start_resolution != INT_MAX) { - while (w * h > start_resolution * start_resolution) { - w = max(1, w / 2); - h = max(1, h / 2); + const string attr_name = node_socket_attribute_name(socket, attr_name_prefix); + + switch (socket.type) { + case SocketType::ENUM: { + /* TODO(sergey): Avoid construction of `ustring` by using `string_view` in the Node API. 
*/ + const ustring value(image_spec.get_string_attribute(attr_name, "")); + + /* Validate that the node is consistent with the node type definition. */ + const NodeEnum &enum_values = *socket.enum_values; + if (!enum_values.exists(value)) { + LOG(ERROR) << "Invalid enumerator value " << value; + return false; + } - divider <<= 1; + node->set(socket, enum_values[value]); + + return true; } + + case SocketType::STRING: + /* TODO(sergey): Avoid construction of `ustring` by using `string_view` in the Node API. */ + node->set(socket, ustring(image_spec.get_string_attribute(attr_name, ""))); + return true; + + case SocketType::INT: + node->set(socket, image_spec.get_int_attribute(attr_name, 0)); + return true; + + case SocketType::FLOAT: + node->set(socket, image_spec.get_float_attribute(attr_name, 0)); + return true; + + case SocketType::BOOLEAN: + node->set(socket, static_cast<bool>(image_spec.get_int_attribute(attr_name, 0))); + return true; + + default: + LOG(DFATAL) << "Unhandled socket type " << socket.type << ", should never happen."; + return false; } - return divider; } -void TileManager::reset(BufferParams ¶ms_, int num_samples_) +static bool node_to_image_spec_atttributes(ImageSpec *image_spec, + const Node *node, + const string &attr_name_prefix) { - params = params_; - - set_samples(num_samples_); - - state.buffer = BufferParams(); - state.sample = range_start_sample - 1; - state.num_tiles = 0; - state.num_samples = 0; - state.resolution_divider = get_divider(params.width, params.height, start_resolution); - state.render_tiles.clear(); - state.denoising_tiles.clear(); - device_free(); + for (const SocketType &socket : node->type->inputs) { + if (!node_socket_to_image_spec_atttributes(image_spec, node, socket, attr_name_prefix)) { + return false; + } + } + + return true; } -void TileManager::set_samples(int num_samples_) +static bool node_from_image_spec_atttributes(Node *node, + const ImageSpec &image_spec, + const string &attr_name_prefix) { - num_samples = 
num_samples_; + for (const SocketType &socket : node->type->inputs) { + if (!node_socket_from_image_spec_atttributes(node, socket, image_spec, attr_name_prefix)) { + return false; + } + } + + return true; +} - /* No real progress indication is possible when using unlimited samples. */ - if (num_samples == INT_MAX) { - state.total_pixel_samples = 0; +static bool buffer_params_to_image_spec_atttributes(ImageSpec *image_spec, + const BufferParams &buffer_params) +{ + if (!node_to_image_spec_atttributes(image_spec, &buffer_params, ATTR_BUFFER_SOCKET_PREFIX)) { + return false; } - else { - uint64_t pixel_samples = 0; - /* While rendering in the viewport, the initial preview resolution is increased to the native - * resolution before the actual rendering begins. Therefore, additional pixel samples will be - * rendered. */ - int divider = max(get_divider(params.width, params.height, start_resolution) / 2, pixel_size); - while (divider > pixel_size) { - int image_w = max(1, params.width / divider); - int image_h = max(1, params.height / divider); - pixel_samples += image_w * image_h; - divider >>= 1; - } - int image_w = max(1, params.width / divider); - int image_h = max(1, params.height / divider); - state.total_pixel_samples = pixel_samples + - (uint64_t)get_num_effective_samples() * image_w * image_h; - if (schedule_denoising) { - state.total_pixel_samples += params.width * params.height; + /* Passes storage is not covered by the node socket. so "expand" the loop manually. 
*/ + + const int num_passes = buffer_params.passes.size(); + image_spec->attribute(ATTR_PASSES_COUNT, num_passes); + + for (int pass_index = 0; pass_index < num_passes; ++pass_index) { + const string attr_name_prefix = string_printf(ATTR_PASS_SOCKET_PREFIX_FORMAT, pass_index); + + const BufferPass *pass = &buffer_params.passes[pass_index]; + if (!node_to_image_spec_atttributes(image_spec, pass, attr_name_prefix)) { + return false; } } + + return true; } -/* If sliced is false, splits image into tiles and assigns equal amount of tiles to every render - * device. If sliced is true, slice image into as much pieces as how many devices are rendering - * this image. */ -int TileManager::gen_tiles(bool sliced) +static bool buffer_params_from_image_spec_atttributes(BufferParams *buffer_params, + const ImageSpec &image_spec) { - int resolution = state.resolution_divider; - int image_w = max(1, params.width / resolution); - int image_h = max(1, params.height / resolution); - int2 center = make_int2(image_w / 2, image_h / 2); - - int num = preserve_tile_device || sliced ? min(image_h, num_devices) : 1; - int slice_num = sliced ? num : 1; - int tile_w = (tile_size.x >= image_w) ? 1 : divide_up(image_w, tile_size.x); - - device_free(); - state.render_tiles.clear(); - state.denoising_tiles.clear(); - state.render_tiles.resize(num); - state.denoising_tiles.resize(num); - state.tile_stride = tile_w; - vector<list<int>>::iterator tile_list; - tile_list = state.render_tiles.begin(); - - if (tile_order == TILE_HILBERT_SPIRAL) { - assert(!sliced && slice_overlap == 0); - - int tile_h = (tile_size.y >= image_h) ? 1 : divide_up(image_h, tile_size.y); - state.tiles.resize(tile_w * tile_h); - - /* Size of blocks in tiles, must be a power of 2 */ - const int hilbert_size = (max(tile_size.x, tile_size.y) <= 12) ? 
8 : 4; - - int tiles_per_device = divide_up(tile_w * tile_h, num); - int cur_device = 0, cur_tiles = 0; - - int2 block_size = tile_size * make_int2(hilbert_size, hilbert_size); - /* Number of blocks to fill the image */ - int blocks_x = (block_size.x >= image_w) ? 1 : divide_up(image_w, block_size.x); - int blocks_y = (block_size.y >= image_h) ? 1 : divide_up(image_h, block_size.y); - int n = max(blocks_x, blocks_y) | 0x1; /* Side length of the spiral (must be odd) */ - /* Offset of spiral (to keep it centered) */ - int2 offset = make_int2((image_w - n * block_size.x) / 2, (image_h - n * block_size.y) / 2); - offset = (offset / tile_size) * tile_size; /* Round to tile border. */ - - int2 block = make_int2(0, 0); /* Current block */ - SpiralDirection prev_dir = DIRECTION_UP, dir = DIRECTION_UP; - for (int i = 0;;) { - /* Generate the tiles in the current block. */ - for (int hilbert_index = 0; hilbert_index < hilbert_size * hilbert_size; hilbert_index++) { - int2 tile, hilbert_pos = hilbert_index_to_pos(hilbert_size, hilbert_index); - /* Rotate block according to spiral direction. */ - if (prev_dir == DIRECTION_UP && dir == DIRECTION_UP) { - tile = make_int2(hilbert_pos.y, hilbert_pos.x); - } - else if (dir == DIRECTION_LEFT || prev_dir == DIRECTION_LEFT) { - tile = hilbert_pos; - } - else if (dir == DIRECTION_DOWN) { - tile = make_int2(hilbert_size - 1 - hilbert_pos.y, hilbert_size - 1 - hilbert_pos.x); - } - else { - tile = make_int2(hilbert_size - 1 - hilbert_pos.x, hilbert_size - 1 - hilbert_pos.y); - } - - int2 pos = block * block_size + tile * tile_size + offset; - /* Only add tiles which are in the image (tiles outside of the image can be generated since - * the spiral is always square). 
*/ - if (pos.x >= 0 && pos.y >= 0 && pos.x < image_w && pos.y < image_h) { - int w = min(tile_size.x, image_w - pos.x); - int h = min(tile_size.y, image_h - pos.y); - int2 ipos = pos / tile_size; - int idx = ipos.y * tile_w + ipos.x; - state.tiles[idx] = Tile(idx, pos.x, pos.y, w, h, cur_device, Tile::RENDER); - tile_list->push_front(idx); - cur_tiles++; - - if (cur_tiles == tiles_per_device) { - tile_list++; - cur_tiles = 0; - cur_device++; - } - } - } + if (!node_from_image_spec_atttributes(buffer_params, image_spec, ATTR_BUFFER_SOCKET_PREFIX)) { + return false; + } - /* Stop as soon as the spiral has reached the center block. */ - if (block.x == (n - 1) / 2 && block.y == (n - 1) / 2) - break; - - /* Advance to next block. */ - prev_dir = dir; - switch (dir) { - case DIRECTION_UP: - block.y++; - if (block.y == (n - i - 1)) { - dir = DIRECTION_LEFT; - } - break; - case DIRECTION_LEFT: - block.x++; - if (block.x == (n - i - 1)) { - dir = DIRECTION_DOWN; - } - break; - case DIRECTION_DOWN: - block.y--; - if (block.y == i) { - dir = DIRECTION_RIGHT; - } - break; - case DIRECTION_RIGHT: - block.x--; - if (block.x == i + 1) { - dir = DIRECTION_UP; - i++; - } - break; - } - } - return tile_w * tile_h; + /* Passes storage is not covered by the node socket. so "expand" the loop manually. */ + + const int num_passes = image_spec.get_int_attribute(ATTR_PASSES_COUNT, 0); + if (num_passes == 0) { + LOG(ERROR) << "Missing passes count attribute."; + return false; } - int idx = 0; - for (int slice = 0; slice < slice_num; slice++) { - int slice_y = (image_h / slice_num) * slice; - int slice_h = (slice == slice_num - 1) ? 
image_h - slice * (image_h / slice_num) : - image_h / slice_num; + for (int pass_index = 0; pass_index < num_passes; ++pass_index) { + const string attr_name_prefix = string_printf(ATTR_PASS_SOCKET_PREFIX_FORMAT, pass_index); - if (slice_overlap != 0) { - int slice_y_offset = max(slice_y - slice_overlap, 0); - slice_h = min(slice_y + slice_h + slice_overlap, image_h) - slice_y_offset; - slice_y = slice_y_offset; - } + BufferPass pass; - int tile_h = (tile_size.y >= slice_h) ? 1 : divide_up(slice_h, tile_size.y); - - int tiles_per_device = divide_up(tile_w * tile_h, num); - int cur_device = 0, cur_tiles = 0; - - for (int tile_y = 0; tile_y < tile_h; tile_y++) { - for (int tile_x = 0; tile_x < tile_w; tile_x++, idx++) { - int x = tile_x * tile_size.x; - int y = tile_y * tile_size.y; - int w = (tile_x == tile_w - 1) ? image_w - x : tile_size.x; - int h = (tile_y == tile_h - 1) ? slice_h - y : tile_size.y; - - state.tiles.push_back( - Tile(idx, x, y + slice_y, w, h, sliced ? slice : cur_device, Tile::RENDER)); - tile_list->push_back(idx); - - if (!sliced) { - cur_tiles++; - - if (cur_tiles == tiles_per_device) { - /* Tiles are already generated in Bottom-to-Top order, so no sort is necessary in that - * case. */ - if (tile_order != TILE_BOTTOM_TO_TOP) { - tile_list->sort(TileComparator(tile_order, center, &state.tiles[0])); - } - tile_list++; - cur_tiles = 0; - cur_device++; - } - } - } - } - if (sliced) { - tile_list++; + if (!node_from_image_spec_atttributes(&pass, image_spec, attr_name_prefix)) { + return false; } + + buffer_params->passes.emplace_back(std::move(pass)); } - return idx; + buffer_params->update_passes(); + + return true; } -void TileManager::gen_render_tiles() +/* Configure image specification for the given buffer parameters and passes. + * + * Image channels will be strictly ordered to match content of corresponding buffer, and the + * metadata will be set so that the render buffers and passes can be reconstructed from it. 
+ * + * If the tile size is different from (0, 0) the image specification will be configured to use the + * given tile size for tiled IO. */ +static bool configure_image_spec_from_buffer(ImageSpec *image_spec, + const BufferParams &buffer_params, + const int2 tile_size = make_int2(0, 0)) { - /* Regenerate just the render tiles for progressive render. */ - foreach (Tile &tile, state.tiles) { - tile.state = Tile::RENDER; - state.render_tiles[tile.device].push_back(tile.index); + const std::vector<std::string> channel_names = exr_channel_names_for_passes(buffer_params); + const int num_channels = channel_names.size(); + + *image_spec = ImageSpec( + buffer_params.width, buffer_params.height, num_channels, TypeDesc::FLOAT); + + image_spec->channelnames = move(channel_names); + + if (!buffer_params_to_image_spec_atttributes(image_spec, buffer_params)) { + return false; + } + + if (tile_size.x != 0 || tile_size.y != 0) { + DCHECK_GT(tile_size.x, 0); + DCHECK_GT(tile_size.y, 0); + + image_spec->tile_width = tile_size.x; + image_spec->tile_height = tile_size.y; } + + return true; } -void TileManager::set_tiles() +/* -------------------------------------------------------------------- + * Tile Manager. + */ + +TileManager::TileManager() { - int resolution = state.resolution_divider; - int image_w = max(1, params.width / resolution); - int image_h = max(1, params.height / resolution); + /* Use process ID to separate different processes. + * To ensure uniqueness from within a process use combination of object address and instance + * index. This solves problem of possible object re-allocation at the same time, and solves + * possible conflict when the counter overflows while there are still active instances of the + * class. 
*/ + const int tile_manager_id = g_instance_index.fetch_add(1, std::memory_order_relaxed); + tile_file_unique_part_ = to_string(system_self_process_id()) + "-" + + to_string(reinterpret_cast<uintptr_t>(this)) + "-" + + to_string(tile_manager_id); +} - state.num_tiles = gen_tiles(!background); +TileManager::~TileManager() +{ +} + +void TileManager::reset_scheduling(const BufferParams &params, int2 tile_size) +{ + VLOG(3) << "Using tile size of " << tile_size; + + close_tile_output(); + + tile_size_ = tile_size; + + tile_state_.num_tiles_x = divide_up(params.width, tile_size_.x); + tile_state_.num_tiles_y = divide_up(params.height, tile_size_.y); + tile_state_.num_tiles = tile_state_.num_tiles_x * tile_state_.num_tiles_y; + + tile_state_.next_tile_index = 0; + + tile_state_.current_tile = Tile(); +} + +void TileManager::update(const BufferParams &params, const Scene *scene) +{ + DCHECK_NE(params.pass_stride, -1); + + buffer_params_ = params; - state.buffer.width = image_w; - state.buffer.height = image_h; + /* TODO(sergey): Proper Error handling, so that if configuration has failed we don't attempt to + * write to a partially configured file. 
*/ + configure_image_spec_from_buffer(&write_state_.image_spec, buffer_params_, tile_size_); - state.buffer.full_x = params.full_x / resolution; - state.buffer.full_y = params.full_y / resolution; - state.buffer.full_width = max(1, params.full_width / resolution); - state.buffer.full_height = max(1, params.full_height / resolution); + const DenoiseParams denoise_params = scene->integrator->get_denoise_params(); + node_to_image_spec_atttributes( + &write_state_.image_spec, &denoise_params, ATTR_DENOISE_SOCKET_PREFIX); } -int TileManager::get_neighbor_index(int index, int neighbor) +bool TileManager::done() { - /* Neighbor indices: - * 0 1 2 - * 3 4 5 - * 6 7 8 - */ - static const int dx[] = {-1, 0, 1, -1, 0, 1, -1, 0, 1}; - static const int dy[] = {-1, -1, -1, 0, 0, 0, 1, 1, 1}; - - int resolution = state.resolution_divider; - int image_w = max(1, params.width / resolution); - int image_h = max(1, params.height / resolution); - - int num = min(image_h, num_devices); - int slice_num = !background ? num : 1; - int slice_h = image_h / slice_num; - - int tile_w = (tile_size.x >= image_w) ? 1 : divide_up(image_w, tile_size.x); - int tile_h = (tile_size.y >= slice_h) ? 1 : divide_up(slice_h, tile_size.y); - - /* Tiles in the state tile list are always indexed from left to right, top to bottom. */ - int nx = (index % tile_w) + dx[neighbor]; - int ny = (index / tile_w) + dy[neighbor]; - if (nx < 0 || ny < 0 || nx >= tile_w || ny >= tile_h * slice_num) - return -1; - - return ny * state.tile_stride + nx; + return tile_state_.next_tile_index == tile_state_.num_tiles; } -/* Checks whether all neighbors of a tile (as well as the tile itself) are at least at state - * min_state. 
*/ -bool TileManager::check_neighbor_state(int index, Tile::State min_state) +bool TileManager::next() { - if (index < 0 || state.tiles[index].state < min_state) { + if (done()) { return false; } - for (int neighbor = 0; neighbor < 9; neighbor++) { - int nindex = get_neighbor_index(index, neighbor); - /* Out-of-bounds tiles don't matter. */ - if (nindex >= 0 && state.tiles[nindex].state < min_state) { - return false; - } - } + + tile_state_.current_tile = get_tile_for_index(tile_state_.next_tile_index); + + ++tile_state_.next_tile_index; return true; } -/* Returns whether the tile should be written (and freed if no denoising is used) instead of - * updating. */ -bool TileManager::finish_tile(const int index, const bool need_denoise, bool &delete_tile) +Tile TileManager::get_tile_for_index(int index) const { - delete_tile = false; - - switch (state.tiles[index].state) { - case Tile::RENDER: { - if (!(schedule_denoising && need_denoise)) { - state.tiles[index].state = Tile::DONE; - delete_tile = !progressive; - return true; - } - state.tiles[index].state = Tile::RENDERED; - /* For each neighbor and the tile itself, check whether all of its neighbors have been - * rendered. If yes, it can be denoised. */ - for (int neighbor = 0; neighbor < 9; neighbor++) { - int nindex = get_neighbor_index(index, neighbor); - if (check_neighbor_state(nindex, Tile::RENDERED)) { - state.tiles[nindex].state = Tile::DENOISE; - state.denoising_tiles[state.tiles[nindex].device].push_back(nindex); - } - } - return false; - } - case Tile::DENOISE: { - state.tiles[index].state = Tile::DENOISED; - /* For each neighbor and the tile itself, check whether all of its neighbors have been - * denoised. If yes, it can be freed. */ - for (int neighbor = 0; neighbor < 9; neighbor++) { - int nindex = get_neighbor_index(index, neighbor); - if (check_neighbor_state(nindex, Tile::DENOISED)) { - state.tiles[nindex].state = Tile::DONE; - /* Do not delete finished tiles in progressive mode. 
*/ - if (!progressive) { - /* It can happen that the tile just finished denoising and already can be freed here. - * However, in that case it still has to be written before deleting, so we can't delete - * it yet. */ - if (neighbor == 4) { - delete_tile = true; - } - else { - delete state.tiles[nindex].buffers; - state.tiles[nindex].buffers = NULL; - } - } - } - } - return true; - } - default: - assert(false); - return true; + /* TODO(sergey): Consider using hilbert spiral, or. maybe, even configurable. Not sure this + * brings a lot of value since this is only applicable to BIG tiles. */ + + const int tile_y = index / tile_state_.num_tiles_x; + const int tile_x = index - tile_y * tile_state_.num_tiles_x; + + Tile tile; + + tile.x = tile_x * tile_size_.x; + tile.y = tile_y * tile_size_.y; + tile.width = tile_size_.x; + tile.height = tile_size_.y; + + tile.width = min(tile.width, buffer_params_.width - tile.x); + tile.height = min(tile.height, buffer_params_.height - tile.y); + + return tile; +} + +const Tile &TileManager::get_current_tile() const +{ + return tile_state_.current_tile; +} + +bool TileManager::open_tile_output() +{ + write_state_.filename = path_temp_get("cycles-tile-buffer-" + tile_file_unique_part_ + "-" + + to_string(write_state_.tile_file_index) + ".exr"); + + write_state_.tile_out = ImageOutput::create(write_state_.filename); + if (!write_state_.tile_out) { + LOG(ERROR) << "Error creating image output for " << write_state_.filename; + return false; + } + + if (!write_state_.tile_out->supports("tiles")) { + LOG(ERROR) << "Progress tile file format does not support tiling."; + return false; } + + write_state_.tile_out->open(write_state_.filename, write_state_.image_spec); + write_state_.num_tiles_written = 0; + + VLOG(3) << "Opened tile file " << write_state_.filename; + + return true; } -bool TileManager::next_tile(Tile *&tile, int device, uint tile_types) +bool TileManager::close_tile_output() { - /* Preserve device if requested, unless this is a 
separate denoising device that just wants to - * grab any available tile. */ - const bool preserve_device = preserve_tile_device && device < num_devices; - - if (tile_types & RenderTile::DENOISE) { - int tile_index = -1; - int logical_device = preserve_device ? device : 0; - - while (logical_device < state.denoising_tiles.size()) { - if (state.denoising_tiles[logical_device].empty()) { - if (preserve_device) { - break; - } - else { - logical_device++; - continue; - } - } + if (!write_state_.tile_out) { + return true; + } - tile_index = state.denoising_tiles[logical_device].front(); - state.denoising_tiles[logical_device].pop_front(); - break; - } + const bool success = write_state_.tile_out->close(); + write_state_.tile_out = nullptr; - if (tile_index >= 0) { - tile = &state.tiles[tile_index]; - return true; - } + if (!success) { + LOG(ERROR) << "Error closing tile file."; + return false; } - if (tile_types & RenderTile::PATH_TRACE) { - int tile_index = -1; - int logical_device = preserve_device ? device : 0; - - while (logical_device < state.render_tiles.size()) { - if (state.render_tiles[logical_device].empty()) { - if (preserve_device) { - break; - } - else { - logical_device++; - continue; - } - } + VLOG(3) << "Tile output is closed."; - tile_index = state.render_tiles[logical_device].front(); - state.render_tiles[logical_device].pop_front(); - break; + return true; +} + +bool TileManager::write_tile(const RenderBuffers &tile_buffers) +{ + if (!write_state_.tile_out) { + if (!open_tile_output()) { + return false; } + } - if (tile_index >= 0) { - tile = &state.tiles[tile_index]; - return true; + DCHECK_EQ(tile_buffers.params.pass_stride, buffer_params_.pass_stride); + + const BufferParams &tile_params = tile_buffers.params; + + vector<float> pixel_storage; + const float *pixels = tile_buffers.buffer.data(); + + /* Tiled writing expects pixels to contain data for an entire tile. 
Pad the render buffers with + * empty pixels for tiles which are on the image boundary. */ + if (tile_params.width != tile_size_.x || tile_params.height != tile_size_.y) { + const int64_t pass_stride = tile_params.pass_stride; + const int64_t src_row_stride = tile_params.width * pass_stride; + + const int64_t dst_row_stride = tile_size_.x * pass_stride; + pixel_storage.resize(dst_row_stride * tile_size_.y); + + const float *src = tile_buffers.buffer.data(); + float *dst = pixel_storage.data(); + pixels = dst; + + for (int y = 0; y < tile_params.height; ++y, src += src_row_stride, dst += dst_row_stride) { + memcpy(dst, src, src_row_stride * sizeof(float)); } } - return false; -} + const int tile_x = tile_params.full_x - buffer_params_.full_x; + const int tile_y = tile_params.full_y - buffer_params_.full_y; -bool TileManager::done() -{ - int end_sample = (range_num_samples == -1) ? num_samples : - range_start_sample + range_num_samples; - return (state.resolution_divider == pixel_size) && - (state.sample + state.num_samples >= end_sample); + VLOG(3) << "Write tile at " << tile_x << ", " << tile_y; + if (!write_state_.tile_out->write_tile(tile_x, tile_y, 0, TypeDesc::FLOAT, pixels)) { + LOG(ERROR) << "Error writing tile " << write_state_.tile_out->geterror(); + } + + ++write_state_.num_tiles_written; + + return true; } -bool TileManager::has_tiles() +void TileManager::finish_write_tiles() { - foreach (Tile &tile, state.tiles) { - if (tile.state != Tile::DONE) { - return true; + if (!write_state_.tile_out) { + /* None of the tiles were written hence the file was not created. + * Avoid creation of fully empty file since it is redundant. */ + return; + } + + /* EXR expects all tiles to present in file. So explicitly write missing tiles as all-zero. 
*/ + if (write_state_.num_tiles_written < tile_state_.num_tiles) { + vector<float> pixel_storage(tile_size_.x * tile_size_.y * buffer_params_.pass_stride); + + for (int tile_index = write_state_.num_tiles_written; tile_index < tile_state_.num_tiles; + ++tile_index) { + const Tile tile = get_tile_for_index(tile_index); + + VLOG(3) << "Write dummy tile at " << tile.x << ", " << tile.y; + + write_state_.tile_out->write_tile(tile.x, tile.y, 0, TypeDesc::FLOAT, pixel_storage.data()); } } - return false; + + close_tile_output(); + + if (full_buffer_written_cb) { + full_buffer_written_cb(write_state_.filename); + } + + /* Advance the counter upon explicit finish of the file. + * Makes it possible to re-use tile manager for another scene, and avoids unnecessary increments + * of the tile-file-within-session index. */ + ++write_state_.tile_file_index; + + write_state_.filename = ""; } -bool TileManager::next() +bool TileManager::read_full_buffer_from_disk(const string_view filename, + RenderBuffers *buffers, + DenoiseParams *denoise_params) { - if (done()) + unique_ptr<ImageInput> in(ImageInput::open(filename)); + if (!in) { + LOG(ERROR) << "Error opening tile file " << filename; return false; + } + + const ImageSpec &image_spec = in->spec(); - if (progressive && state.resolution_divider > pixel_size) { - state.sample = 0; - state.resolution_divider = max(state.resolution_divider / 2, pixel_size); - state.num_samples = 1; - set_tiles(); + BufferParams buffer_params; + if (!buffer_params_from_image_spec_atttributes(&buffer_params, image_spec)) { + return false; } - else { - state.sample++; + buffers->reset(buffer_params); - if (progressive) - state.num_samples = 1; - else if (range_num_samples == -1) - state.num_samples = num_samples; - else - state.num_samples = range_num_samples; + if (!node_from_image_spec_atttributes(denoise_params, image_spec, ATTR_DENOISE_SOCKET_PREFIX)) { + return false; + } - state.resolution_divider = pixel_size; + if 
(!in->read_image(TypeDesc::FLOAT, buffers->buffer.data())) { + LOG(ERROR) << "Error reading pixels from the tile file " << in->geterror(); + return false; + } - if (state.sample == range_start_sample) { - set_tiles(); - } - else { - gen_render_tiles(); - } + if (!in->close()) { + LOG(ERROR) << "Error closing tile file " << in->geterror(); + return false; } return true; } -int TileManager::get_num_effective_samples() -{ - return (range_num_samples == -1) ? num_samples : range_num_samples; -} - CCL_NAMESPACE_END diff --git a/intern/cycles/render/tile.h b/intern/cycles/render/tile.h index 790a56f9445..124d0b3652c 100644 --- a/intern/cycles/render/tile.h +++ b/intern/cycles/render/tile.h @@ -14,159 +14,151 @@ * limitations under the License. */ -#ifndef __TILE_H__ -#define __TILE_H__ - -#include <limits.h> +#pragma once #include "render/buffers.h" -#include "util/util_list.h" +#include "util/util_image.h" +#include "util/util_string.h" +#include "util/util_unique_ptr.h" CCL_NAMESPACE_BEGIN -/* Tile */ +class DenoiseParams; +class Scene; + +/* -------------------------------------------------------------------- + * Tile. + */ class Tile { public: - int index; - int x, y, w, h; - int device; - /* RENDER: The tile has to be rendered. - * RENDERED: The tile has been rendered, but can't be denoised yet (waiting for neighbors). - * DENOISE: The tile can be denoised now. - * DENOISED: The tile has been denoised, but can't be freed yet (waiting for neighbors). - * DONE: The tile is finished and has been freed. 
*/ - typedef enum { RENDER = 0, RENDERED, DENOISE, DENOISED, DONE } State; - State state; - RenderBuffers *buffers; + int x = 0, y = 0; + int width = 0, height = 0; Tile() { } - - Tile(int index_, int x_, int y_, int w_, int h_, int device_, State state_ = RENDER) - : index(index_), x(x_), y(y_), w(w_), h(h_), device(device_), state(state_), buffers(NULL) - { - } }; -/* Tile order */ - -/* Note: this should match enum_tile_order in properties.py */ -enum TileOrder { - TILE_CENTER = 0, - TILE_RIGHT_TO_LEFT = 1, - TILE_LEFT_TO_RIGHT = 2, - TILE_TOP_TO_BOTTOM = 3, - TILE_BOTTOM_TO_TOP = 4, - TILE_HILBERT_SPIRAL = 5, -}; - -/* Tile Manager */ +/* -------------------------------------------------------------------- + * Tile Manager. + */ class TileManager { public: - BufferParams params; - - struct State { - vector<Tile> tiles; - int tile_stride; - BufferParams buffer; - int sample; - int num_samples; - int resolution_divider; - int num_tiles; - - /* Total samples over all pixels: Generally num_samples*num_pixels, - * but can be higher due to the initial resolution division for previews. */ - uint64_t total_pixel_samples; - - /* These lists contain the indices of the tiles to be rendered/denoised and are used - * when acquiring a new tile for the device. - * Each list in each vector is for one logical device. */ - vector<list<int>> render_tiles; - vector<list<int>> denoising_tiles; - } state; - - int num_samples; - int slice_overlap; - - TileManager(bool progressive, - int num_samples, - int2 tile_size, - int start_resolution, - bool preserve_tile_device, - bool background, - TileOrder tile_order, - int num_devices = 1, - int pixel_size = 1); + /* This callback is invoked whenever the on-disk tiles storage file is closed after writing. 
*/ + function<void(string_view)> full_buffer_written_cb; + + TileManager(); ~TileManager(); - void device_free(); - void reset(BufferParams &params, int num_samples); - void set_samples(int num_samples); + TileManager(const TileManager &other) = delete; + TileManager(TileManager &&other) noexcept = delete; + TileManager &operator=(const TileManager &other) = delete; + TileManager &operator=(TileManager &&other) = delete; + + /* Reset current progress and start new rendering of the full-frame parameters in tiles of the + * given size. + * Only touches scheduling-related state of the tile manager. */ + /* TODO(sergey): Consider using tile area instead of exact size to help dealing with extreme + * cases of stretched renders. */ + void reset_scheduling(const BufferParams &params, int2 tile_size); + + /* Update for the known buffer passes and scene parameters. + * Will store all parameters needed for buffers access outside of the scene graph. */ + void update(const BufferParams &params, const Scene *scene); + + inline int get_num_tiles() const + { + return tile_state_.num_tiles; + } + + inline bool has_multiple_tiles() const + { + return tile_state_.num_tiles > 1; + } + bool next(); - bool next_tile(Tile *&tile, int device, uint tile_types); - bool finish_tile(const int index, const bool need_denoise, bool &delete_tile); bool done(); - bool has_tiles(); - void set_tile_order(TileOrder tile_order_) + const Tile &get_current_tile() const; + + /* Write render buffer of a tile to a file on disk. + * + * Opens file for write when first tile is written. + * + * Returns true on success. */ + bool write_tile(const RenderBuffers &tile_buffers); + + /* Inform the tile manager that no more tiles will be written to disk. + * The file will be considered final, all handles to it will be closed. */ + void finish_write_tiles(); + + /* Check whether any tile has been written to disk. 
*/ + inline bool has_written_tiles() const { - tile_order = tile_order_; + return write_state_.num_tiles_written != 0; } - int get_neighbor_index(int index, int neighbor); - bool check_neighbor_state(int index, Tile::State state); + /* Read full frame render buffer from tiles file on disk. + * + * Returns true on success. */ + bool read_full_buffer_from_disk(string_view filename, + RenderBuffers *buffers, + DenoiseParams *denoise_params); - /* ** Sample range rendering. ** */ + protected: + /* Get tile configuration for its index. + * The tile index must be within [0, state_.tile_state_). */ + Tile get_tile_for_index(int index) const; - /* Start sample in the range. */ - int range_start_sample; + bool open_tile_output(); + bool close_tile_output(); - /* Number to samples in the rendering range. */ - int range_num_samples; + /* Part of an on-disk tile file name which avoids conflicts between several Cycles instances or + * several sessions. */ + string tile_file_unique_part_; - /* Get number of actual samples to render. */ - int get_num_effective_samples(); + int2 tile_size_ = make_int2(0, 0); - /* Schedule tiles for denoising after they've been rendered. */ - bool schedule_denoising; + BufferParams buffer_params_; - protected: - void set_tiles(); - - bool progressive; - int2 tile_size; - TileOrder tile_order; - int start_resolution; - int pixel_size; - int num_devices; - - /* in some cases it is important that the same tile will be returned for the same - * device it was originally generated for (i.e. viewport rendering when buffer is - * allocating once for tile and then always used by it) - * - * in other cases any tile could be handled by any device (i.e. 
final rendering - * without progressive refine) - */ - bool preserve_tile_device; - - /* for background render tiles should exactly match render parts generated from - * blender side, which means image first gets split into tiles and then tiles are - * assigning to render devices - * - * however viewport rendering expects tiles to be allocated in a special way, - * meaning image is being sliced horizontally first and every device handles - * its own slice - */ - bool background; - - /* Generate tile list, return number of tiles. */ - int gen_tiles(bool sliced); - void gen_render_tiles(); + /* Tile scheduling state. */ + struct { + int num_tiles_x = 0; + int num_tiles_y = 0; + int num_tiles = 0; + + int next_tile_index; + + Tile current_tile; + } tile_state_; + + /* State of tiles writing to a file on disk. */ + struct { + /* Index of a tile file used during the current session. + * This number is used for the file name construction, making it possible to render several + * scenes throughout duration of the session and keep all results available for later read + * access. */ + int tile_file_index = 0; + + string filename; + + /* Specification of the tile image which corresponds to the buffer parameters. + * Contains channels configured according to the passes configuration in the path traces. + * + * Output images are saved using this specification, input images are expected to have matched + * specification. */ + ImageSpec image_spec; + + /* Output handle for the tile file. + * + * This file can not be closed until all tiles has been provided, so the handle is stored in + * the state and is created whenever writing is requested. 
*/ + unique_ptr<ImageOutput> tile_out; + + int num_tiles_written = 0; + } write_state_; }; CCL_NAMESPACE_END - -#endif /* __TILE_H__ */ diff --git a/intern/cycles/test/CMakeLists.txt b/intern/cycles/test/CMakeLists.txt index 65a692acd03..0f6b435813f 100644 --- a/intern/cycles/test/CMakeLists.txt +++ b/intern/cycles/test/CMakeLists.txt @@ -32,6 +32,7 @@ set(INC set(ALL_CYCLES_LIBRARIES cycles_device cycles_kernel + cycles_integrator cycles_render cycles_bvh cycles_graph @@ -45,8 +46,12 @@ include_directories(${INC}) cycles_link_directories() set(SRC + integrator_adaptive_sampling_test.cpp + integrator_render_scheduler_test.cpp + integrator_tile_test.cpp render_graph_finalize_test.cpp util_aligned_malloc_test.cpp + util_math_test.cpp util_path_test.cpp util_string_test.cpp util_task_test.cpp diff --git a/intern/cycles/test/integrator_adaptive_sampling_test.cpp b/intern/cycles/test/integrator_adaptive_sampling_test.cpp new file mode 100644 index 00000000000..3ed6a23125d --- /dev/null +++ b/intern/cycles/test/integrator_adaptive_sampling_test.cpp @@ -0,0 +1,116 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "testing/testing.h" + +#include "integrator/adaptive_sampling.h" +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +TEST(AdaptiveSampling, schedule_samples) +{ + AdaptiveSampling adaptive_sampling; + adaptive_sampling.use = true; + adaptive_sampling.min_samples = 0; + adaptive_sampling.adaptive_step = 4; + + for (int sample = 2; sample < 32; ++sample) { + for (int num_samples = 8; num_samples < 32; ++num_samples) { + const int num_samples_aligned = adaptive_sampling.align_samples(sample, num_samples); + /* NOTE: `sample + num_samples_aligned` is the number of samples after rendering, so need + * to convert this to the 0-based index of the last sample. */ + EXPECT_TRUE(adaptive_sampling.need_filter(sample + num_samples_aligned - 1)); + } + } +} + +TEST(AdaptiveSampling, align_samples) +{ + AdaptiveSampling adaptive_sampling; + adaptive_sampling.use = true; + adaptive_sampling.min_samples = 11 /* rounded of sqrt(128) */; + adaptive_sampling.adaptive_step = 4; + + /* Filtering will happen at the following samples: + * 15, 19, 23, 27, 31, 35, 39, 43 */ + + /* Requested sample and number of samples will result in number of samples lower than + * `min_samples`. */ + EXPECT_EQ(adaptive_sampling.align_samples(0, 4), 4); + EXPECT_EQ(adaptive_sampling.align_samples(0, 7), 7); + + /* Request number of samples higher than the minimum samples before filter, but prior to the + * first sample at which filtering will happen. */ + EXPECT_EQ(adaptive_sampling.align_samples(0, 15), 15); + + /* When rendering many samples from the very beginning, limit number of samples by the first + * sample at which filtering is to happen. */ + EXPECT_EQ(adaptive_sampling.align_samples(0, 16), 16); + EXPECT_EQ(adaptive_sampling.align_samples(0, 17), 16); + EXPECT_EQ(adaptive_sampling.align_samples(0, 20), 16); + EXPECT_EQ(adaptive_sampling.align_samples(0, 60), 16); + + /* Similar to above, but start sample is not 0. 
*/ + EXPECT_EQ(adaptive_sampling.align_samples(9, 8), 7); + EXPECT_EQ(adaptive_sampling.align_samples(9, 20), 7); + EXPECT_EQ(adaptive_sampling.align_samples(9, 60), 7); + + /* Start sample is past the minimum required samples, but prior to the first filter sample. */ + EXPECT_EQ(adaptive_sampling.align_samples(12, 6), 4); + EXPECT_EQ(adaptive_sampling.align_samples(12, 20), 4); + EXPECT_EQ(adaptive_sampling.align_samples(12, 60), 4); + + /* Start sample is the sample which is to be filtered. */ + EXPECT_EQ(adaptive_sampling.align_samples(15, 4), 1); + EXPECT_EQ(adaptive_sampling.align_samples(15, 6), 1); + EXPECT_EQ(adaptive_sampling.align_samples(15, 10), 1); + EXPECT_EQ(adaptive_sampling.align_samples(58, 2), 2); + + /* Start sample is past the sample which is to be filtered. */ + EXPECT_EQ(adaptive_sampling.align_samples(16, 3), 3); + EXPECT_EQ(adaptive_sampling.align_samples(16, 4), 4); + EXPECT_EQ(adaptive_sampling.align_samples(16, 5), 4); + EXPECT_EQ(adaptive_sampling.align_samples(16, 10), 4); + + /* Should never exceed requested number of samples. 
*/ + EXPECT_EQ(adaptive_sampling.align_samples(15, 2), 1); + EXPECT_EQ(adaptive_sampling.align_samples(16, 2), 2); + EXPECT_EQ(adaptive_sampling.align_samples(17, 2), 2); + EXPECT_EQ(adaptive_sampling.align_samples(18, 2), 2); +} + +TEST(AdaptiveSampling, need_filter) +{ + AdaptiveSampling adaptive_sampling; + adaptive_sampling.use = true; + adaptive_sampling.min_samples = 11 /* rounded of sqrt(128) */; + adaptive_sampling.adaptive_step = 4; + + const vector<int> expected_samples_to_filter = { + {15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59}}; + + vector<int> actual_samples_to_filter; + for (int sample = 0; sample < 60; ++sample) { + if (adaptive_sampling.need_filter(sample)) { + actual_samples_to_filter.push_back(sample); + } + } + + EXPECT_EQ(actual_samples_to_filter, expected_samples_to_filter); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/test/integrator_render_scheduler_test.cpp b/intern/cycles/test/integrator_render_scheduler_test.cpp new file mode 100644 index 00000000000..b4efbc2d1a7 --- /dev/null +++ b/intern/cycles/test/integrator_render_scheduler_test.cpp @@ -0,0 +1,37 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "testing/testing.h" + +#include "integrator/render_scheduler.h" + +CCL_NAMESPACE_BEGIN + +TEST(IntegratorRenderScheduler, calculate_resolution_divider_for_resolution) +{ + EXPECT_EQ(calculate_resolution_divider_for_resolution(1920, 1080, 1920), 1); + EXPECT_EQ(calculate_resolution_divider_for_resolution(1920, 1080, 960), 2); + EXPECT_EQ(calculate_resolution_divider_for_resolution(1920, 1080, 480), 4); +} + +TEST(IntegratorRenderScheduler, calculate_resolution_for_divider) +{ + EXPECT_EQ(calculate_resolution_for_divider(1920, 1080, 1), 1440); + EXPECT_EQ(calculate_resolution_for_divider(1920, 1080, 2), 720); + EXPECT_EQ(calculate_resolution_for_divider(1920, 1080, 4), 360); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/test/integrator_tile_test.cpp b/intern/cycles/test/integrator_tile_test.cpp new file mode 100644 index 00000000000..5bb57b48c3c --- /dev/null +++ b/intern/cycles/test/integrator_tile_test.cpp @@ -0,0 +1,47 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "testing/testing.h" + +#include "integrator/tile.h" +#include "util/util_math.h" + +CCL_NAMESPACE_BEGIN + +TEST(tile_calculate_best_size, Basic) +{ + /* Make sure CPU-like case is handled properly. 
*/ + EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 1, 1), TileSize(1, 1, 1)); + EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 100, 1), TileSize(1, 1, 1)); + + /* Enough path states to fit an entire image with all samples. */ + EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 1, 1920 * 1080), + TileSize(1920, 1080, 1)); + EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 100, 1920 * 1080 * 100), + TileSize(1920, 1080, 100)); +} + +TEST(tile_calculate_best_size, Extreme) +{ + EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 262144, 131072), TileSize(1, 1, 512)); + EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 1048576, 131072), TileSize(1, 1, 1024)); + EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 10485760, 131072), TileSize(1, 1, 4096)); + + EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 8192 * 8192 * 2, 1024), + TileSize(1, 1, 1024)); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/test/render_graph_finalize_test.cpp b/intern/cycles/test/render_graph_finalize_test.cpp index da9b29314a7..19c211fe5f7 100644 --- a/intern/cycles/test/render_graph_finalize_test.cpp +++ b/intern/cycles/test/render_graph_finalize_test.cpp @@ -181,7 +181,7 @@ class RenderGraph : public testing::Test { util_logging_start(); util_logging_verbosity_set(1); - device_cpu = Device::create(device_info, stats, profiler, true); + device_cpu = Device::create(device_info, stats, profiler); scene = new Scene(scene_params, device_cpu); } diff --git a/intern/cycles/test/util_math_test.cpp b/intern/cycles/test/util_math_test.cpp new file mode 100644 index 00000000000..b6ce3ef0cf3 --- /dev/null +++ b/intern/cycles/test/util_math_test.cpp @@ -0,0 +1,61 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "testing/testing.h" + +#include "util/util_math.h" + +CCL_NAMESPACE_BEGIN + +TEST(math, next_power_of_two) +{ + EXPECT_EQ(next_power_of_two(0), 1); + EXPECT_EQ(next_power_of_two(1), 2); + EXPECT_EQ(next_power_of_two(2), 4); + EXPECT_EQ(next_power_of_two(3), 4); + EXPECT_EQ(next_power_of_two(4), 8); +} + +TEST(math, prev_power_of_two) +{ + EXPECT_EQ(prev_power_of_two(0), 0); + + EXPECT_EQ(prev_power_of_two(1), 1); + EXPECT_EQ(prev_power_of_two(2), 1); + + EXPECT_EQ(prev_power_of_two(3), 2); + EXPECT_EQ(prev_power_of_two(4), 2); + + EXPECT_EQ(prev_power_of_two(5), 4); + EXPECT_EQ(prev_power_of_two(6), 4); + EXPECT_EQ(prev_power_of_two(7), 4); + EXPECT_EQ(prev_power_of_two(8), 4); +} + +TEST(math, reverse_integer_bits) +{ + EXPECT_EQ(reverse_integer_bits(0xFFFFFFFF), 0xFFFFFFFF); + EXPECT_EQ(reverse_integer_bits(0x00000000), 0x00000000); + EXPECT_EQ(reverse_integer_bits(0x1), 0x80000000); + EXPECT_EQ(reverse_integer_bits(0x80000000), 0x1); + EXPECT_EQ(reverse_integer_bits(0xFFFF0000), 0x0000FFFF); + EXPECT_EQ(reverse_integer_bits(0x0000FFFF), 0xFFFF0000); + EXPECT_EQ(reverse_integer_bits(0x00FF0000), 0x0000FF00); + EXPECT_EQ(reverse_integer_bits(0x0000FF00), 0x00FF0000); + EXPECT_EQ(reverse_integer_bits(0xAAAAAAAA), 0x55555555); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/test/util_string_test.cpp b/intern/cycles/test/util_string_test.cpp index 97f8daa65de..c9022d1b132 100644 --- a/intern/cycles/test/util_string_test.cpp +++ b/intern/cycles/test/util_string_test.cpp @@ -281,4 +281,40 @@ TEST(util_string_remove_trademark, 
r_space_middle) EXPECT_EQ(str, "foo bar baz"); } +/* ******** Tests for string_startswith() ******** */ + +TEST(string_startswith, basic) +{ + EXPECT_TRUE(string_startswith("", "")); + + EXPECT_FALSE(string_startswith("", "World")); + EXPECT_TRUE(string_startswith("Hello", "")); + + EXPECT_FALSE(string_startswith("Hello", "World")); + + EXPECT_TRUE(string_startswith("Hello", "Hello")); + EXPECT_TRUE(string_startswith("Hello", "He")); + EXPECT_TRUE(string_startswith("Hello", "H")); + + EXPECT_FALSE(string_startswith("Hello", "e")); + EXPECT_FALSE(string_startswith("Hello", "HelloWorld")); +} + +TEST(string_endswith, basic) +{ + EXPECT_TRUE(string_endswith("", "")); + + EXPECT_FALSE(string_endswith("", "World")); + EXPECT_TRUE(string_endswith("Hello", "")); + + EXPECT_FALSE(string_endswith("Hello", "World")); + + EXPECT_TRUE(string_endswith("Hello", "Hello")); + EXPECT_TRUE(string_endswith("Hello", "lo")); + EXPECT_TRUE(string_endswith("Hello", "o")); + + EXPECT_FALSE(string_endswith("Hello", "e")); + EXPECT_FALSE(string_endswith("Hello", "WorldHello")); +} + CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h index 13d177d2b25..de17efafcf2 100644 --- a/intern/cycles/util/util_atomic.h +++ b/intern/cycles/util/util_atomic.h @@ -34,56 +34,6 @@ #else /* __KERNEL_GPU__ */ -# ifdef __KERNEL_OPENCL__ - -/* Float atomics implementation credits: - * http://suhorukov.blogspot.in/2011/12/opencl-11-atomic-operations-on-floating.html - */ -ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *source, - const float operand) -{ - union { - unsigned int int_value; - float float_value; - } new_value; - union { - unsigned int int_value; - float float_value; - } prev_value; - do { - prev_value.float_value = *source; - new_value.float_value = prev_value.float_value + operand; - } while (atomic_cmpxchg((volatile ccl_global unsigned int *)source, - prev_value.int_value, - new_value.int_value) != 
prev_value.int_value); - return new_value.float_value; -} - -ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float *dest, - const float old_val, - const float new_val) -{ - union { - unsigned int int_value; - float float_value; - } new_value, prev_value, result; - prev_value.float_value = old_val; - new_value.float_value = new_val; - result.int_value = atomic_cmpxchg( - (volatile ccl_global unsigned int *)dest, prev_value.int_value, new_value.int_value); - return result.float_value; -} - -# define atomic_fetch_and_add_uint32(p, x) atomic_add((p), (x)) -# define atomic_fetch_and_inc_uint32(p) atomic_inc((p)) -# define atomic_fetch_and_dec_uint32(p) atomic_dec((p)) -# define atomic_fetch_and_or_uint32(p, x) atomic_or((p), (x)) - -# define CCL_LOCAL_MEM_FENCE CLK_LOCAL_MEM_FENCE -# define ccl_barrier(flags) barrier(flags) - -# endif /* __KERNEL_OPENCL__ */ - # ifdef __KERNEL_CUDA__ # define atomic_add_and_fetch_float(p, x) (atomicAdd((float *)(p), (float)(x)) + (float)(x)) diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp index 74ecefa1917..1d598725c84 100644 --- a/intern/cycles/util/util_debug.cpp +++ b/intern/cycles/util/util_debug.cpp @@ -26,13 +26,7 @@ CCL_NAMESPACE_BEGIN DebugFlags::CPU::CPU() - : avx2(true), - avx(true), - sse41(true), - sse3(true), - sse2(true), - bvh_layout(BVH_LAYOUT_AUTO), - split_kernel(false) + : avx2(true), avx(true), sse41(true), sse3(true), sse2(true), bvh_layout(BVH_LAYOUT_AUTO) { reset(); } @@ -58,11 +52,9 @@ void DebugFlags::CPU::reset() #undef CHECK_CPU_FLAGS bvh_layout = BVH_LAYOUT_AUTO; - - split_kernel = false; } -DebugFlags::CUDA::CUDA() : adaptive_compile(false), split_kernel(false) +DebugFlags::CUDA::CUDA() : adaptive_compile(false) { reset(); } @@ -71,8 +63,6 @@ void DebugFlags::CUDA::reset() { if (getenv("CYCLES_CUDA_ADAPTIVE_COMPILE") != NULL) adaptive_compile = true; - - split_kernel = false; } DebugFlags::OptiX::OptiX() @@ -82,42 +72,7 @@ 
DebugFlags::OptiX::OptiX() void DebugFlags::OptiX::reset() { - cuda_streams = 1; - curves_api = false; -} - -DebugFlags::OpenCL::OpenCL() : device_type(DebugFlags::OpenCL::DEVICE_ALL), debug(false) -{ - reset(); -} - -void DebugFlags::OpenCL::reset() -{ - /* Initialize device type from environment variables. */ - device_type = DebugFlags::OpenCL::DEVICE_ALL; - char *device = getenv("CYCLES_OPENCL_TEST"); - if (device) { - if (strcmp(device, "NONE") == 0) { - device_type = DebugFlags::OpenCL::DEVICE_NONE; - } - else if (strcmp(device, "ALL") == 0) { - device_type = DebugFlags::OpenCL::DEVICE_ALL; - } - else if (strcmp(device, "DEFAULT") == 0) { - device_type = DebugFlags::OpenCL::DEVICE_DEFAULT; - } - else if (strcmp(device, "CPU") == 0) { - device_type = DebugFlags::OpenCL::DEVICE_CPU; - } - else if (strcmp(device, "GPU") == 0) { - device_type = DebugFlags::OpenCL::DEVICE_GPU; - } - else if (strcmp(device, "ACCELERATOR") == 0) { - device_type = DebugFlags::OpenCL::DEVICE_ACCELERATOR; - } - } - /* Initialize other flags from environment variables. 
*/ - debug = (getenv("CYCLES_OPENCL_DEBUG") != NULL); + use_debug = false; } DebugFlags::DebugFlags() : viewport_static_bvh(false), running_inside_blender(false) @@ -131,7 +86,6 @@ void DebugFlags::reset() cpu.reset(); cuda.reset(); optix.reset(); - opencl.reset(); } std::ostream &operator<<(std::ostream &os, DebugFlagsConstRef debug_flags) @@ -142,40 +96,13 @@ std::ostream &operator<<(std::ostream &os, DebugFlagsConstRef debug_flags) << " SSE4.1 : " << string_from_bool(debug_flags.cpu.sse41) << "\n" << " SSE3 : " << string_from_bool(debug_flags.cpu.sse3) << "\n" << " SSE2 : " << string_from_bool(debug_flags.cpu.sse2) << "\n" - << " BVH layout : " << bvh_layout_name(debug_flags.cpu.bvh_layout) << "\n" - << " Split : " << string_from_bool(debug_flags.cpu.split_kernel) << "\n"; + << " BVH layout : " << bvh_layout_name(debug_flags.cpu.bvh_layout) << "\n"; os << "CUDA flags:\n" << " Adaptive Compile : " << string_from_bool(debug_flags.cuda.adaptive_compile) << "\n"; os << "OptiX flags:\n" - << " CUDA streams : " << debug_flags.optix.cuda_streams << "\n"; - - const char *opencl_device_type; - switch (debug_flags.opencl.device_type) { - case DebugFlags::OpenCL::DEVICE_NONE: - opencl_device_type = "NONE"; - break; - case DebugFlags::OpenCL::DEVICE_ALL: - opencl_device_type = "ALL"; - break; - case DebugFlags::OpenCL::DEVICE_DEFAULT: - opencl_device_type = "DEFAULT"; - break; - case DebugFlags::OpenCL::DEVICE_CPU: - opencl_device_type = "CPU"; - break; - case DebugFlags::OpenCL::DEVICE_GPU: - opencl_device_type = "GPU"; - break; - case DebugFlags::OpenCL::DEVICE_ACCELERATOR: - opencl_device_type = "ACCELERATOR"; - break; - } - os << "OpenCL flags:\n" - << " Device type : " << opencl_device_type << "\n" - << " Debug : " << string_from_bool(debug_flags.opencl.debug) << "\n" - << " Memory limit : " << string_human_readable_size(debug_flags.opencl.mem_limit) << "\n"; + << " Debug : " << string_from_bool(debug_flags.optix.use_debug) << "\n"; return os; } diff --git 
a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h index f7e53f90f74..99e2723180c 100644 --- a/intern/cycles/util/util_debug.h +++ b/intern/cycles/util/util_debug.h @@ -79,9 +79,6 @@ class DebugFlags { * CPUs and GPUs can be selected here instead. */ BVHLayout bvh_layout; - - /* Whether split kernel is used */ - bool split_kernel; }; /* Descriptor of CUDA feature-set to be used. */ @@ -94,9 +91,6 @@ class DebugFlags { /* Whether adaptive feature based runtime compile is enabled or not. * Requires the CUDA Toolkit and only works on Linux atm. */ bool adaptive_compile; - - /* Whether split kernel is used */ - bool split_kernel; }; /* Descriptor of OptiX feature-set to be used. */ @@ -106,61 +100,9 @@ class DebugFlags { /* Reset flags to their defaults. */ void reset(); - /* Number of CUDA streams to launch kernels concurrently from. */ - int cuda_streams; - - /* Use OptiX curves API for hair instead of custom implementation. */ - bool curves_api; - }; - - /* Descriptor of OpenCL feature-set to be used. */ - struct OpenCL { - OpenCL(); - - /* Reset flags to their defaults. */ - void reset(); - - /* Available device types. - * Only gives a hint which devices to let user to choose from, does not - * try to use any sort of optimal device or so. - */ - enum DeviceType { - /* None of OpenCL devices will be used. */ - DEVICE_NONE, - /* All OpenCL devices will be used. */ - DEVICE_ALL, - /* Default system OpenCL device will be used. */ - DEVICE_DEFAULT, - /* Host processor will be used. */ - DEVICE_CPU, - /* GPU devices will be used. */ - DEVICE_GPU, - /* Dedicated OpenCL accelerator device will be used. */ - DEVICE_ACCELERATOR, - }; - - /* Available kernel types. */ - enum KernelType { - /* Do automated guess which kernel to use, based on the officially - * supported GPUs and such. - */ - KERNEL_DEFAULT, - /* Force mega kernel to be used. */ - KERNEL_MEGA, - /* Force split kernel to be used. */ - KERNEL_SPLIT, - }; - - /* Requested device type. 
*/ - DeviceType device_type; - - /* Use debug version of the kernel. */ - bool debug; - - /* TODO(mai): Currently this is only for OpenCL, but we should have it implemented for all - * devices. */ - /* Artificial memory limit in bytes (0 if disabled). */ - size_t mem_limit; + /* Load OptiX module with debug capabilities. Will lower logging verbosity level, enable + * validations, and lower optimization level. */ + bool use_debug; }; /* Get instance of debug flags registry. */ @@ -182,9 +124,6 @@ class DebugFlags { /* Requested OptiX flags. */ OptiX optix; - /* Requested OpenCL flags. */ - OpenCL opencl; - private: DebugFlags(); diff --git a/intern/cycles/util/util_defines.h b/intern/cycles/util/util_defines.h index 0a239a944a5..9b1698d461a 100644 --- a/intern/cycles/util/util_defines.h +++ b/intern/cycles/util/util_defines.h @@ -43,9 +43,9 @@ # define ccl_local_param # define ccl_private # define ccl_restrict __restrict -# define ccl_ref & # define ccl_optional_struct_init # define ccl_loop_no_unroll +# define ccl_attr_maybe_unused [[maybe_unused]] # define __KERNEL_WITH_SSE_ALIGN__ # if defined(_WIN32) && !defined(FREE_WINDOWS) @@ -62,7 +62,6 @@ # define ccl_may_alias # define ccl_always_inline __forceinline # define ccl_never_inline __declspec(noinline) -# define ccl_maybe_unused # else /* _WIN32 && !FREE_WINDOWS */ # define ccl_device_inline static inline __attribute__((always_inline)) # define ccl_device_forceinline static inline __attribute__((always_inline)) @@ -74,7 +73,6 @@ # define ccl_may_alias __attribute__((__may_alias__)) # define ccl_always_inline __attribute__((always_inline)) # define ccl_never_inline __attribute__((noinline)) -# define ccl_maybe_unused __attribute__((used)) # endif /* _WIN32 && !FREE_WINDOWS */ /* Use to suppress '-Wimplicit-fallthrough' (in place of 'break'). 
*/ diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h index a8d4ee75e20..d9edfec5da3 100644 --- a/intern/cycles/util/util_half.h +++ b/intern/cycles/util/util_half.h @@ -28,14 +28,8 @@ CCL_NAMESPACE_BEGIN /* Half Floats */ -#ifdef __KERNEL_OPENCL__ - -# define float4_store_half(h, f, scale) vstore_half4(f *(scale), 0, h); - -#else - /* CUDA has its own half data type, no need to define then */ -# ifndef __KERNEL_CUDA__ +#ifndef __KERNEL_CUDA__ /* Implementing this as a class rather than a typedef so that the compiler can tell it apart from * unsigned shorts. */ class half { @@ -59,27 +53,27 @@ class half { private: unsigned short v; }; -# endif +#endif struct half4 { half x, y, z, w; }; -# ifdef __KERNEL_CUDA__ +#ifdef __KERNEL_CUDA__ -ccl_device_inline void float4_store_half(half *h, float4 f, float scale) +ccl_device_inline void float4_store_half(half *h, float4 f) { - h[0] = __float2half(f.x * scale); - h[1] = __float2half(f.y * scale); - h[2] = __float2half(f.z * scale); - h[3] = __float2half(f.w * scale); + h[0] = __float2half(f.x); + h[1] = __float2half(f.y); + h[2] = __float2half(f.z); + h[3] = __float2half(f.w); } -# else +#else -ccl_device_inline void float4_store_half(half *h, float4 f, float scale) +ccl_device_inline void float4_store_half(half *h, float4 f) { -# ifndef __KERNEL_SSE2__ +# ifndef __KERNEL_SSE2__ for (int i = 0; i < 4; i++) { /* optimized float to half for pixels: * assumes no negative, no nan, no inf, and sets denormal to 0 */ @@ -87,8 +81,7 @@ ccl_device_inline void float4_store_half(half *h, float4 f, float scale) uint i; float f; } in; - float fscale = f[i] * scale; - in.f = (fscale > 0.0f) ? ((fscale < 65504.0f) ? fscale : 65504.0f) : 0.0f; + in.f = (f[i] > 0.0f) ? ((f[i] < 65504.0f) ? 
f[i] : 65504.0f) : 0.0f; int x = in.i; int absolute = x & 0x7FFFFFFF; @@ -98,23 +91,22 @@ ccl_device_inline void float4_store_half(half *h, float4 f, float scale) h[i] = (rshift & 0x7FFF); } -# else +# else /* same as above with SSE */ - ssef fscale = load4f(f) * scale; - ssef x = min(max(fscale, 0.0f), 65504.0f); + ssef x = min(max(load4f(f), 0.0f), 65504.0f); -# ifdef __KERNEL_AVX2__ +# ifdef __KERNEL_AVX2__ ssei rpack = _mm_cvtps_ph(x, 0); -# else +# else ssei absolute = cast(x) & 0x7FFFFFFF; ssei Z = absolute + 0xC8000000; ssei result = andnot(absolute < 0x38800000, Z); ssei rshift = (result >> 13) & 0x7FFF; ssei rpack = _mm_packs_epi32(rshift, rshift); -# endif +# endif _mm_storel_pi((__m64 *)h, _mm_castsi128_ps(rpack)); -# endif +# endif } ccl_device_inline float half_to_float(half h) @@ -160,8 +152,6 @@ ccl_device_inline half float_to_half(float f) return (value_bits | sign_bit); } -# endif - #endif CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_logging.h b/intern/cycles/util/util_logging.h index c161299acd0..35c2d436d09 100644 --- a/intern/cycles/util/util_logging.h +++ b/intern/cycles/util/util_logging.h @@ -49,6 +49,7 @@ class LogMessageVoidify { # define LOG(severity) LOG_SUPPRESS() # define VLOG(severity) LOG_SUPPRESS() # define VLOG_IF(severity, condition) LOG_SUPPRESS() +# define VLOG_IS_ON(severity) false # define CHECK(expression) LOG_SUPPRESS() diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h index c5996ebfcb6..6d728dde679 100644 --- a/intern/cycles/util/util_math.h +++ b/intern/cycles/util/util_math.h @@ -26,11 +26,9 @@ # include <cmath> #endif -#ifndef __KERNEL_OPENCL__ -# include <float.h> -# include <math.h> -# include <stdio.h> -#endif /* __KERNEL_OPENCL__ */ +#include <float.h> +#include <math.h> +#include <stdio.h> #include "util/util_types.h" @@ -86,7 +84,6 @@ CCL_NAMESPACE_BEGIN /* Scalar */ #ifdef _WIN32 -# ifndef __KERNEL_OPENCL__ ccl_device_inline float fmaxf(float a, float b) { return (a > b) ? 
a : b; @@ -96,8 +93,7 @@ ccl_device_inline float fminf(float a, float b) { return (a < b) ? a : b; } -# endif /* !__KERNEL_OPENCL__ */ -#endif /* _WIN32 */ +#endif /* _WIN32 */ #ifndef __KERNEL_GPU__ using std::isfinite; @@ -119,6 +115,11 @@ ccl_device_inline int min(int a, int b) return (a < b) ? a : b; } +ccl_device_inline uint min(uint a, uint b) +{ + return (a < b) ? a : b; +} + ccl_device_inline float max(float a, float b) { return (a > b) ? a : b; @@ -166,7 +167,6 @@ ccl_device_inline float max4(float a, float b, float c, float d) return max(max(a, b), max(c, d)); } -#ifndef __KERNEL_OPENCL__ /* Int/Float conversion */ ccl_device_inline int as_int(uint i) @@ -241,24 +241,23 @@ ccl_device_inline float __uint_as_float(uint i) ccl_device_inline int4 __float4_as_int4(float4 f) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return int4(_mm_castps_si128(f.m128)); -# else +#else return make_int4( __float_as_int(f.x), __float_as_int(f.y), __float_as_int(f.z), __float_as_int(f.w)); -# endif +#endif } ccl_device_inline float4 __int4_as_float4(int4 i) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return float4(_mm_castsi128_ps(i.m128)); -# else +#else return make_float4( __int_as_float(i.x), __int_as_float(i.y), __int_as_float(i.z), __int_as_float(i.w)); -# endif +#endif } -#endif /* __KERNEL_OPENCL__ */ /* Versions of functions which are safe for fast math. */ ccl_device_inline bool isnan_safe(float f) @@ -279,7 +278,6 @@ ccl_device_inline float ensure_finite(float v) return isfinite_safe(v) ? 
v : 0.0f; } -#ifndef __KERNEL_OPENCL__ ccl_device_inline int clamp(int a, int mn, int mx) { return min(max(a, mn), mx); @@ -309,8 +307,6 @@ ccl_device_inline float smoothstep(float edge0, float edge1, float x) return result; } -#endif /* __KERNEL_OPENCL__ */ - #ifndef __KERNEL_CUDA__ ccl_device_inline float saturate(float a) { @@ -451,7 +447,6 @@ CCL_NAMESPACE_END CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_OPENCL__ /* Interpolation */ template<class A, class B> A lerp(const A &a, const A &b, const B &t) @@ -459,15 +454,9 @@ template<class A, class B> A lerp(const A &a, const A &b, const B &t) return (A)(a * ((B)1 - t) + b * t); } -#endif /* __KERNEL_OPENCL__ */ - /* Triangle */ -#ifndef __KERNEL_OPENCL__ ccl_device_inline float triangle_area(const float3 &v1, const float3 &v2, const float3 &v3) -#else -ccl_device_inline float triangle_area(const float3 v1, const float3 v2, const float3 v3) -#endif { return len(cross(v3 - v2, v1 - v2)) * 0.5f; } @@ -665,11 +654,7 @@ ccl_device_inline float pow22(float a) ccl_device_inline float beta(float x, float y) { -#ifndef __KERNEL_OPENCL__ return expf(lgammaf(x) + lgammaf(y) - lgammaf(x + y)); -#else - return expf(lgamma(x) + lgamma(y) - lgamma(x + y)); -#endif } ccl_device_inline float xor_signmask(float x, int y) @@ -686,8 +671,6 @@ ccl_device_inline uint count_leading_zeros(uint x) { #if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) return __clz(x); -#elif defined(__KERNEL_OPENCL__) - return clz(x); #else assert(x != 0); # ifdef _MSC_VER @@ -704,8 +687,6 @@ ccl_device_inline uint count_trailing_zeros(uint x) { #if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) return (__ffs(x) - 1); -#elif defined(__KERNEL_OPENCL__) - return (31 - count_leading_zeros(x & -x)); #else assert(x != 0); # ifdef _MSC_VER @@ -722,8 +703,6 @@ ccl_device_inline uint find_first_set(uint x) { #if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) return __ffs(x); -#elif defined(__KERNEL_OPENCL__) - return (x != 0) ? 
(32 - count_leading_zeros(x & (-x))) : 0; #else # ifdef _MSC_VER return (x != 0) ? (32 - count_leading_zeros(x & (-x))) : 0; @@ -797,6 +776,52 @@ ccl_device_inline float precise_angle(float3 a, float3 b) return 2.0f * atan2f(len(a - b), len(a + b)); } +/* Return value which is greater than the given one and is a power of two. */ +ccl_device_inline uint next_power_of_two(uint x) +{ + return x == 0 ? 1 : 1 << (32 - count_leading_zeros(x)); +} + +/* Return value which is lower than the given one and is a power of two. */ +ccl_device_inline uint prev_power_of_two(uint x) +{ + return x < 2 ? x : 1 << (31 - count_leading_zeros(x - 1)); +} + +#ifndef __has_builtin +# define __has_builtin(v) 0 +#endif + +/* Reverses the bits of a 32 bit integer. */ +ccl_device_inline uint32_t reverse_integer_bits(uint32_t x) +{ + /* Use a native instruction if it exists. */ +#if defined(__arm__) || defined(__aarch64__) + __asm__("rbit %w0, %w1" : "=r"(x) : "r"(x)); + return x; +#elif defined(__KERNEL_CUDA__) + return __brev(x); +#elif __has_builtin(__builtin_bitreverse32) + return __builtin_bitreverse32(x); +#else + /* Flip pairwise. */ + x = ((x & 0x55555555) << 1) | ((x & 0xAAAAAAAA) >> 1); + /* Flip pairs. */ + x = ((x & 0x33333333) << 2) | ((x & 0xCCCCCCCC) >> 2); + /* Flip nibbles. */ + x = ((x & 0x0F0F0F0F) << 4) | ((x & 0xF0F0F0F0) >> 4); + /* Flip bytes. CPUs have an instruction for that, pretty fast one. */ +# ifdef _MSC_VER + return _byteswap_ulong(x); +# elif defined(__INTEL_COMPILER) + return (uint32_t)_bswap((int)x); +# else + /* Assuming gcc or clang. */ + return __builtin_bswap32(x); +# endif +#endif +} + CCL_NAMESPACE_END #endif /* __UTIL_MATH_H__ */ diff --git a/intern/cycles/util/util_math_float2.h b/intern/cycles/util/util_math_float2.h index 17f6f3c9382..70b80c33544 100644 --- a/intern/cycles/util/util_math_float2.h +++ b/intern/cycles/util/util_math_float2.h @@ -27,7 +27,6 @@ CCL_NAMESPACE_BEGIN * Declaration. 
*/ -#ifndef __KERNEL_OPENCL__ ccl_device_inline float2 operator-(const float2 &a); ccl_device_inline float2 operator*(const float2 &a, const float2 &b); ccl_device_inline float2 operator*(const float2 &a, float f); @@ -64,7 +63,6 @@ ccl_device_inline float2 fabs(const float2 &a); ccl_device_inline float2 as_float2(const float4 &a); ccl_device_inline float2 interp(const float2 &a, const float2 &b, float t); ccl_device_inline float2 floor(const float2 &a); -#endif /* !__KERNEL_OPENCL__ */ ccl_device_inline float2 safe_divide_float2_float(const float2 a, const float b); @@ -82,7 +80,6 @@ ccl_device_inline float2 one_float2() return make_float2(1.0f, 1.0f); } -#ifndef __KERNEL_OPENCL__ ccl_device_inline float2 operator-(const float2 &a) { return make_float2(-a.x, -a.y); @@ -262,8 +259,6 @@ ccl_device_inline float2 floor(const float2 &a) return make_float2(floorf(a.x), floorf(a.y)); } -#endif /* !__KERNEL_OPENCL__ */ - ccl_device_inline float2 safe_divide_float2_float(const float2 a, const float b) { return (b != 0.0f) ? a / b : zero_float2(); diff --git a/intern/cycles/util/util_math_float3.h b/intern/cycles/util/util_math_float3.h index 9673c043189..30a1b4c3f77 100644 --- a/intern/cycles/util/util_math_float3.h +++ b/intern/cycles/util/util_math_float3.h @@ -27,7 +27,6 @@ CCL_NAMESPACE_BEGIN * Declaration. 
*/ -#ifndef __KERNEL_OPENCL__ ccl_device_inline float3 operator-(const float3 &a); ccl_device_inline float3 operator*(const float3 &a, const float3 &b); ccl_device_inline float3 operator*(const float3 &a, const float f); @@ -63,7 +62,6 @@ ccl_device_inline float3 rcp(const float3 &a); ccl_device_inline float3 sqrt(const float3 &a); ccl_device_inline float3 floor(const float3 &a); ccl_device_inline float3 ceil(const float3 &a); -#endif /* !__KERNEL_OPENCL__ */ ccl_device_inline float min3(float3 a); ccl_device_inline float max3(float3 a); @@ -105,50 +103,49 @@ ccl_device_inline float3 one_float3() return make_float3(1.0f, 1.0f, 1.0f); } -#ifndef __KERNEL_OPENCL__ ccl_device_inline float3 operator-(const float3 &a) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return float3(_mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000)))); -# else +#else return make_float3(-a.x, -a.y, -a.z); -# endif +#endif } ccl_device_inline float3 operator*(const float3 &a, const float3 &b) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return float3(_mm_mul_ps(a.m128, b.m128)); -# else +#else return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); -# endif +#endif } ccl_device_inline float3 operator*(const float3 &a, const float f) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return float3(_mm_mul_ps(a.m128, _mm_set1_ps(f))); -# else +#else return make_float3(a.x * f, a.y * f, a.z * f); -# endif +#endif } ccl_device_inline float3 operator*(const float f, const float3 &a) { -# if defined(__KERNEL_SSE__) +#if defined(__KERNEL_SSE__) return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128)); -# else +#else return make_float3(a.x * f, a.y * f, a.z * f); -# endif +#endif } ccl_device_inline float3 operator/(const float f, const float3 &a) { -# if defined(__KERNEL_SSE__) +#if defined(__KERNEL_SSE__) return float3(_mm_div_ps(_mm_set1_ps(f), a.m128)); -# else +#else return make_float3(f / a.x, f / a.y, f / a.z); -# endif +#endif } ccl_device_inline float3 operator/(const float3 &a, const 
float f) @@ -159,11 +156,11 @@ ccl_device_inline float3 operator/(const float3 &a, const float f) ccl_device_inline float3 operator/(const float3 &a, const float3 &b) { -# if defined(__KERNEL_SSE__) +#if defined(__KERNEL_SSE__) return float3(_mm_div_ps(a.m128, b.m128)); -# else +#else return make_float3(a.x / b.x, a.y / b.y, a.z / b.z); -# endif +#endif } ccl_device_inline float3 operator+(const float3 &a, const float f) @@ -173,11 +170,11 @@ ccl_device_inline float3 operator+(const float3 &a, const float f) ccl_device_inline float3 operator+(const float3 &a, const float3 &b) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return float3(_mm_add_ps(a.m128, b.m128)); -# else +#else return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); -# endif +#endif } ccl_device_inline float3 operator-(const float3 &a, const float f) @@ -187,11 +184,11 @@ ccl_device_inline float3 operator-(const float3 &a, const float f) ccl_device_inline float3 operator-(const float3 &a, const float3 &b) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return float3(_mm_sub_ps(a.m128, b.m128)); -# else +#else return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); -# endif +#endif } ccl_device_inline float3 operator+=(float3 &a, const float3 &b) @@ -227,11 +224,11 @@ ccl_device_inline float3 operator/=(float3 &a, float f) ccl_device_inline bool operator==(const float3 &a, const float3 &b) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 7) == 7; -# else +#else return (a.x == b.x && a.y == b.y && a.z == b.z); -# endif +#endif } ccl_device_inline bool operator!=(const float3 &a, const float3 &b) @@ -246,20 +243,20 @@ ccl_device_inline float distance(const float3 &a, const float3 &b) ccl_device_inline float dot(const float3 &a, const float3 &b) { -# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) +#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F)); -# else +#else return a.x * b.x + a.y * b.y 
+ a.z * b.z; -# endif +#endif } ccl_device_inline float dot_xy(const float3 &a, const float3 &b) { -# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) +#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a, b), b)); -# else +#else return a.x * b.x + a.y * b.y; -# endif +#endif } ccl_device_inline float3 cross(const float3 &a, const float3 &b) @@ -270,30 +267,30 @@ ccl_device_inline float3 cross(const float3 &a, const float3 &b) ccl_device_inline float3 normalize(const float3 &a) { -# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) +#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) __m128 norm = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F)); return float3(_mm_div_ps(a.m128, norm)); -# else +#else return a / len(a); -# endif +#endif } ccl_device_inline float3 min(const float3 &a, const float3 &b) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return float3(_mm_min_ps(a.m128, b.m128)); -# else +#else return make_float3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); -# endif +#endif } ccl_device_inline float3 max(const float3 &a, const float3 &b) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return float3(_mm_max_ps(a.m128, b.m128)); -# else +#else return make_float3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); -# endif +#endif } ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 &mx) @@ -303,43 +300,43 @@ ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 & ccl_device_inline float3 fabs(const float3 &a) { -# ifdef __KERNEL_SSE__ -# ifdef __KERNEL_NEON__ +#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_NEON__ return float3(vabsq_f32(a.m128)); -# else +# else __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); return float3(_mm_and_ps(a.m128, mask)); -# endif -# else - return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z)); # endif +#else + return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z)); +#endif } ccl_device_inline float3 sqrt(const 
float3 &a) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return float3(_mm_sqrt_ps(a)); -# else +#else return make_float3(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z)); -# endif +#endif } ccl_device_inline float3 floor(const float3 &a) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return float3(_mm_floor_ps(a)); -# else +#else return make_float3(floorf(a.x), floorf(a.y), floorf(a.z)); -# endif +#endif } ccl_device_inline float3 ceil(const float3 &a) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return float3(_mm_ceil_ps(a)); -# else +#else return make_float3(ceilf(a.x), ceilf(a.y), ceilf(a.z)); -# endif +#endif } ccl_device_inline float3 mix(const float3 &a, const float3 &b, float t) @@ -349,14 +346,13 @@ ccl_device_inline float3 mix(const float3 &a, const float3 &b, float t) ccl_device_inline float3 rcp(const float3 &a) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ /* Don't use _mm_rcp_ps due to poor precision. */ return float3(_mm_div_ps(_mm_set_ps1(1.0f), a.m128)); -# else +#else return make_float3(1.0f / a.x, 1.0f / a.y, 1.0f / a.z); -# endif +#endif } -#endif /* !__KERNEL_OPENCL__ */ ccl_device_inline float min3(float3 a) { @@ -483,11 +479,7 @@ ccl_device_inline float average(const float3 a) ccl_device_inline bool isequal_float3(const float3 a, const float3 b) { -#ifdef __KERNEL_OPENCL__ - return all(a == b); -#else return a == b; -#endif } ccl_device_inline float3 pow3(float3 v, float e) diff --git a/intern/cycles/util/util_math_float4.h b/intern/cycles/util/util_math_float4.h index 0ba2bafa2f0..19af5c8c638 100644 --- a/intern/cycles/util/util_math_float4.h +++ b/intern/cycles/util/util_math_float4.h @@ -27,7 +27,6 @@ CCL_NAMESPACE_BEGIN * Declaration. 
*/ -#ifndef __KERNEL_OPENCL__ ccl_device_inline float4 operator-(const float4 &a); ccl_device_inline float4 operator*(const float4 &a, const float4 &b); ccl_device_inline float4 operator*(const float4 &a, float f); @@ -66,7 +65,6 @@ ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 & ccl_device_inline float4 fabs(const float4 &a); ccl_device_inline float4 floor(const float4 &a); ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t); -#endif /* !__KERNEL_OPENCL__*/ ccl_device_inline float4 safe_divide_float4_float(const float4 a, const float b); @@ -112,33 +110,32 @@ ccl_device_inline float4 one_float4() return make_float4(1.0f, 1.0f, 1.0f, 1.0f); } -#ifndef __KERNEL_OPENCL__ ccl_device_inline float4 operator-(const float4 &a) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); return float4(_mm_xor_ps(a.m128, mask)); -# else +#else return make_float4(-a.x, -a.y, -a.z, -a.w); -# endif +#endif } ccl_device_inline float4 operator*(const float4 &a, const float4 &b) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return float4(_mm_mul_ps(a.m128, b.m128)); -# else +#else return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); -# endif +#endif } ccl_device_inline float4 operator*(const float4 &a, float f) { -# if defined(__KERNEL_SSE__) +#if defined(__KERNEL_SSE__) return a * make_float4(f); -# else +#else return make_float4(a.x * f, a.y * f, a.z * f, a.w * f); -# endif +#endif } ccl_device_inline float4 operator*(float f, const float4 &a) @@ -153,11 +150,11 @@ ccl_device_inline float4 operator/(const float4 &a, float f) ccl_device_inline float4 operator/(const float4 &a, const float4 &b) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return float4(_mm_div_ps(a.m128, b.m128)); -# else +#else return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); -# endif +#endif } ccl_device_inline float4 operator+(const float4 &a, const float f) @@ -167,11 +164,11 
@@ ccl_device_inline float4 operator+(const float4 &a, const float f) ccl_device_inline float4 operator+(const float4 &a, const float4 &b) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return float4(_mm_add_ps(a.m128, b.m128)); -# else +#else return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); -# endif +#endif } ccl_device_inline float4 operator-(const float4 &a, const float f) @@ -181,11 +178,11 @@ ccl_device_inline float4 operator-(const float4 &a, const float f) ccl_device_inline float4 operator-(const float4 &a, const float4 &b) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return float4(_mm_sub_ps(a.m128, b.m128)); -# else +#else return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); -# endif +#endif } ccl_device_inline float4 operator+=(float4 &a, const float4 &b) @@ -215,38 +212,38 @@ ccl_device_inline float4 operator/=(float4 &a, float f) ccl_device_inline int4 operator<(const float4 &a, const float4 &b) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return int4(_mm_castps_si128(_mm_cmplt_ps(a.m128, b.m128))); -# else +#else return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w); -# endif +#endif } ccl_device_inline int4 operator>=(const float4 &a, const float4 &b) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return int4(_mm_castps_si128(_mm_cmpge_ps(a.m128, b.m128))); -# else +#else return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w); -# endif +#endif } ccl_device_inline int4 operator<=(const float4 &a, const float4 &b) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return int4(_mm_castps_si128(_mm_cmple_ps(a.m128, b.m128))); -# else +#else return make_int4(a.x <= b.x, a.y <= b.y, a.z <= b.z, a.w <= b.w); -# endif +#endif } ccl_device_inline bool operator==(const float4 &a, const float4 &b) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15; -# else +#else return (a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w); -# endif +#endif } 
ccl_device_inline float distance(const float4 &a, const float4 &b) @@ -256,16 +253,16 @@ ccl_device_inline float distance(const float4 &a, const float4 &b) ccl_device_inline float dot(const float4 &a, const float4 &b) { -# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) -# if defined(__KERNEL_NEON__) +#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) +# if defined(__KERNEL_NEON__) __m128 t = vmulq_f32(a, b); return vaddvq_f32(t); -# else - return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF)); -# endif # else - return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w); + return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF)); # endif +#else + return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w); +#endif } ccl_device_inline float len_squared(const float4 &a) @@ -275,21 +272,21 @@ ccl_device_inline float len_squared(const float4 &a) ccl_device_inline float4 rcp(const float4 &a) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ /* Don't use _mm_rcp_ps due to poor precision. */ return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128)); -# else +#else return make_float4(1.0f / a.x, 1.0f / a.y, 1.0f / a.z, 1.0f / a.w); -# endif +#endif } ccl_device_inline float4 sqrt(const float4 &a) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return float4(_mm_sqrt_ps(a.m128)); -# else +#else return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w)); -# endif +#endif } ccl_device_inline float4 sqr(const float4 &a) @@ -299,39 +296,39 @@ ccl_device_inline float4 sqr(const float4 &a) ccl_device_inline float4 cross(const float4 &a, const float4 &b) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return (shuffle<1, 2, 0, 0>(a) * shuffle<2, 0, 1, 0>(b)) - (shuffle<2, 0, 1, 0>(a) * shuffle<1, 2, 0, 0>(b)); -# else +#else return make_float4(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x, 0.0f); -# endif +#endif } ccl_device_inline bool is_zero(const float4 &a) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return a == make_float4(0.0f); -# else +#else return (a.x == 
0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f); -# endif +#endif } ccl_device_inline float4 reduce_add(const float4 &a) { -# if defined(__KERNEL_SSE__) -# if defined(__KERNEL_NEON__) +#if defined(__KERNEL_SSE__) +# if defined(__KERNEL_NEON__) return float4(vdupq_n_f32(vaddvq_f32(a))); -# elif defined(__KERNEL_SSE3__) +# elif defined(__KERNEL_SSE3__) float4 h(_mm_hadd_ps(a.m128, a.m128)); return float4(_mm_hadd_ps(h.m128, h.m128)); -# else +# else float4 h(shuffle<1, 0, 3, 2>(a) + a); return shuffle<2, 3, 0, 1>(h) + h; -# endif -# else +# endif +#else float sum = (a.x + a.y) + (a.z + a.w); return make_float4(sum, sum, sum, sum); -# endif +#endif } ccl_device_inline float average(const float4 &a) @@ -357,20 +354,20 @@ ccl_device_inline float4 safe_normalize(const float4 &a) ccl_device_inline float4 min(const float4 &a, const float4 &b) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return float4(_mm_min_ps(a.m128, b.m128)); -# else +#else return make_float4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); -# endif +#endif } ccl_device_inline float4 max(const float4 &a, const float4 &b) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return float4(_mm_max_ps(a.m128, b.m128)); -# else +#else return make_float4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); -# endif +#endif } ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &mx) @@ -380,24 +377,24 @@ ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 & ccl_device_inline float4 fabs(const float4 &a) { -# if defined(__KERNEL_SSE__) -# if defined(__KERNEL_NEON__) +#if defined(__KERNEL_SSE__) +# if defined(__KERNEL_NEON__) return float4(vabsq_f32(a)); -# else - return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); -# endif # else - return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w)); + return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); # endif +#else + return 
make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w)); +#endif } ccl_device_inline float4 floor(const float4 &a) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return float4(_mm_floor_ps(a)); -# else +#else return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w)); -# endif +#endif } ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t) @@ -405,8 +402,6 @@ ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t) return a + t * (b - a); } -#endif /* !__KERNEL_OPENCL__*/ - #ifdef __KERNEL_SSE__ template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> __forceinline const float4 shuffle(const float4 &b) diff --git a/intern/cycles/util/util_math_int2.h b/intern/cycles/util/util_math_int2.h index 0295cd51f7e..5782b878801 100644 --- a/intern/cycles/util/util_math_int2.h +++ b/intern/cycles/util/util_math_int2.h @@ -27,20 +27,17 @@ CCL_NAMESPACE_BEGIN * Declaration. */ -#ifndef __KERNEL_OPENCL__ ccl_device_inline bool operator==(const int2 a, const int2 b); ccl_device_inline int2 operator+(const int2 &a, const int2 &b); ccl_device_inline int2 operator+=(int2 &a, const int2 &b); ccl_device_inline int2 operator-(const int2 &a, const int2 &b); ccl_device_inline int2 operator*(const int2 &a, const int2 &b); ccl_device_inline int2 operator/(const int2 &a, const int2 &b); -#endif /* !__KERNEL_OPENCL__ */ /******************************************************************************* * Definition. 
*/ -#ifndef __KERNEL_OPENCL__ ccl_device_inline bool operator==(const int2 a, const int2 b) { return (a.x == b.x && a.y == b.y); @@ -70,7 +67,6 @@ ccl_device_inline int2 operator/(const int2 &a, const int2 &b) { return make_int2(a.x / b.x, a.y / b.y); } -#endif /* !__KERNEL_OPENCL__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_math_int3.h b/intern/cycles/util/util_math_int3.h index d92ed895dc2..e0dfae7c015 100644 --- a/intern/cycles/util/util_math_int3.h +++ b/intern/cycles/util/util_math_int3.h @@ -27,52 +27,49 @@ CCL_NAMESPACE_BEGIN * Declaration. */ -#ifndef __KERNEL_OPENCL__ ccl_device_inline int3 min(int3 a, int3 b); ccl_device_inline int3 max(int3 a, int3 b); ccl_device_inline int3 clamp(const int3 &a, int mn, int mx); ccl_device_inline int3 clamp(const int3 &a, int3 &mn, int mx); -#endif /* !__KERNEL_OPENCL__ */ /******************************************************************************* * Definition. */ -#ifndef __KERNEL_OPENCL__ ccl_device_inline int3 min(int3 a, int3 b) { -# if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) +#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) return int3(_mm_min_epi32(a.m128, b.m128)); -# else +#else return make_int3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); -# endif +#endif } ccl_device_inline int3 max(int3 a, int3 b) { -# if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) +#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) return int3(_mm_max_epi32(a.m128, b.m128)); -# else +#else return make_int3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); -# endif +#endif } ccl_device_inline int3 clamp(const int3 &a, int mn, int mx) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return min(max(a, make_int3(mn)), make_int3(mx)); -# else +#else return make_int3(clamp(a.x, mn, mx), clamp(a.y, mn, mx), clamp(a.z, mn, mx)); -# endif +#endif } ccl_device_inline int3 clamp(const int3 &a, int3 &mn, int mx) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return min(max(a, mn), make_int3(mx)); -# 
else +#else return make_int3(clamp(a.x, mn.x, mx), clamp(a.y, mn.y, mx), clamp(a.z, mn.z, mx)); -# endif +#endif } ccl_device_inline bool operator==(const int3 &a, const int3 &b) @@ -92,22 +89,21 @@ ccl_device_inline bool operator<(const int3 &a, const int3 &b) ccl_device_inline int3 operator+(const int3 &a, const int3 &b) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return int3(_mm_add_epi32(a.m128, b.m128)); -# else +#else return make_int3(a.x + b.x, a.y + b.y, a.z + b.z); -# endif +#endif } ccl_device_inline int3 operator-(const int3 &a, const int3 &b) { -# ifdef __KERNEL_SSE__ +#ifdef __KERNEL_SSE__ return int3(_mm_sub_epi32(a.m128, b.m128)); -# else +#else return make_int3(a.x - b.x, a.y - b.y, a.z - b.z); -# endif +#endif } -#endif /* !__KERNEL_OPENCL__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_path.cpp b/intern/cycles/util/util_path.cpp index 8905c8bc7f0..c78f4615013 100644 --- a/intern/cycles/util/util_path.cpp +++ b/intern/cycles/util/util_path.cpp @@ -66,6 +66,7 @@ typedef struct stat path_stat_t; static string cached_path = ""; static string cached_user_path = ""; +static string cached_temp_path = ""; static string cached_xdg_cache_path = ""; namespace { @@ -335,10 +336,11 @@ static string path_xdg_cache_get() } #endif -void path_init(const string &path, const string &user_path) +void path_init(const string &path, const string &user_path, const string &temp_path) { cached_path = path; cached_user_path = user_path; + cached_temp_path = temp_path; #ifdef _MSC_VER // workaround for https://svn.boost.org/trac/boost/ticket/6320 @@ -382,6 +384,15 @@ string path_cache_get(const string &sub) #endif } +string path_temp_get(const string &sub) +{ + if (cached_temp_path == "") { + cached_temp_path = Filesystem::temp_directory_path(); + } + + return path_join(cached_temp_path, sub); +} + #if defined(__linux__) || defined(__APPLE__) string path_xdg_home_get(const string &sub = ""); #endif @@ -739,177 +750,6 @@ bool path_remove(const string &path) 
return remove(path.c_str()) == 0; } -struct SourceReplaceState { - typedef map<string, string> ProcessedMapping; - /* Base director for all relative include headers. */ - string base; - /* Result of processed files. */ - ProcessedMapping processed_files; - /* Set of files which are considered "precompiled" and which are replaced - * with and empty string on a subsequent occurrence in include statement. - */ - set<string> precompiled_headers; -}; - -static string path_source_replace_includes_recursive(const string &source, - const string &source_filepath, - SourceReplaceState *state); - -static string line_directive(const SourceReplaceState &state, const string &path, const int line) -{ - string unescaped_path = path; - /* First we make path relative. */ - if (string_startswith(unescaped_path, state.base.c_str())) { - const string base_file = path_filename(state.base); - const size_t base_len = state.base.length(); - unescaped_path = base_file + - unescaped_path.substr(base_len, unescaped_path.length() - base_len); - } - /* Second, we replace all unsafe characters. */ - const size_t length = unescaped_path.length(); - string escaped_path = ""; - for (size_t i = 0; i < length; ++i) { - const char ch = unescaped_path[i]; - if (strchr("\"\'\?\\", ch) != NULL) { - escaped_path += "\\"; - } - escaped_path += ch; - } - /* TODO(sergey): Check whether using std::to_string combined with several - * concatenation operations is any faster. 
- */ - return string_printf("#line %d \"%s\"", line, escaped_path.c_str()); -} - -static string path_source_handle_preprocessor(const string &preprocessor_line, - const string &source_filepath, - const size_t line_number, - SourceReplaceState *state) -{ - string result = preprocessor_line; - string token = string_strip(preprocessor_line.substr(1, preprocessor_line.size() - 1)); - if (string_startswith(token, "include")) { - token = string_strip(token.substr(7, token.size() - 7)); - if (token[0] == '"') { - const size_t n_start = 1; - const size_t n_end = token.find("\"", n_start); - const string filename = token.substr(n_start, n_end - n_start); - const bool is_precompiled = string_endswith(token, "// PRECOMPILED"); - string filepath = path_join(state->base, filename); - if (!path_exists(filepath)) { - filepath = path_join(path_dirname(source_filepath), filename); - } - if (is_precompiled) { - state->precompiled_headers.insert(filepath); - } - string text; - if (path_read_text(filepath, text)) { - text = path_source_replace_includes_recursive(text, filepath, state); - /* Use line directives for better error messages. */ - result = line_directive(*state, filepath, 1) + "\n" + text + "\n" + - line_directive(*state, source_filepath, line_number + 1); - } - } - } - return result; -} - -/* Our own little c preprocessor that replaces #includes with the file - * contents, to work around issue of OpenCL drivers not supporting - * include paths with spaces in them. - */ -static string path_source_replace_includes_recursive(const string &source, - const string &source_filepath, - SourceReplaceState *state) -{ - /* Try to re-use processed file without spending time on replacing all - * include directives again. 
- */ - SourceReplaceState::ProcessedMapping::iterator replaced_file = state->processed_files.find( - source_filepath); - if (replaced_file != state->processed_files.end()) { - if (state->precompiled_headers.find(source_filepath) != state->precompiled_headers.end()) { - return ""; - } - return replaced_file->second; - } - /* Perform full file processing. */ - string result = ""; - const size_t source_length = source.length(); - size_t index = 0; - /* Information about where we are in the source. */ - size_t line_number = 0, column_number = 1; - /* Currently gathered non-preprocessor token. - * Store as start/length rather than token itself to avoid overhead of - * memory re-allocations on each character concatenation. - */ - size_t token_start = 0, token_length = 0; - /* Denotes whether we're inside of preprocessor line, together with - * preprocessor line itself. - * - * TODO(sergey): Investigate whether using token start/end position - * gives measurable speedup. - */ - bool inside_preprocessor = false; - string preprocessor_line = ""; - /* Actual loop over the whole source. */ - while (index < source_length) { - const char ch = source[index]; - if (ch == '\n') { - if (inside_preprocessor) { - result += path_source_handle_preprocessor( - preprocessor_line, source_filepath, line_number, state); - /* Start gathering net part of the token. */ - token_start = index; - token_length = 0; - } - inside_preprocessor = false; - preprocessor_line = ""; - column_number = 0; - ++line_number; - } - else if (ch == '#' && column_number == 1 && !inside_preprocessor) { - /* Append all possible non-preprocessor token to the result. 
*/ - if (token_length != 0) { - result.append(source, token_start, token_length); - token_start = index; - token_length = 0; - } - inside_preprocessor = true; - } - if (inside_preprocessor) { - preprocessor_line += ch; - } - else { - ++token_length; - } - ++index; - ++column_number; - } - /* Append possible tokens which happened before special events handled - * above. - */ - if (token_length != 0) { - result.append(source, token_start, token_length); - } - if (inside_preprocessor) { - result += path_source_handle_preprocessor( - preprocessor_line, source_filepath, line_number, state); - } - /* Store result for further reuse. */ - state->processed_files[source_filepath] = result; - return result; -} - -string path_source_replace_includes(const string &source, - const string &path, - const string &source_filename) -{ - SourceReplaceState state; - state.base = path; - return path_source_replace_includes_recursive(source, path_join(path, source_filename), &state); -} - FILE *path_fopen(const string &path, const string &mode) { #ifdef _WIN32 diff --git a/intern/cycles/util/util_path.h b/intern/cycles/util/util_path.h index 7a83c2135a4..f899bc2e01c 100644 --- a/intern/cycles/util/util_path.h +++ b/intern/cycles/util/util_path.h @@ -32,9 +32,10 @@ CCL_NAMESPACE_BEGIN /* program paths */ -void path_init(const string &path = "", const string &user_path = ""); +void path_init(const string &path = "", const string &user_path = "", const string &tmp_path = ""); string path_get(const string &sub = ""); string path_user_get(const string &sub = ""); +string path_temp_get(const string &sub = ""); string path_cache_get(const string &sub = ""); /* path string manipulation */ @@ -65,11 +66,6 @@ bool path_read_text(const string &path, string &text); /* File manipulation. 
*/ bool path_remove(const string &path); -/* source code utility */ -string path_source_replace_includes(const string &source, - const string &path, - const string &source_filename = ""); - /* cache utility */ void path_cache_clear_except(const string &name, const set<string> &except); diff --git a/intern/cycles/util/util_profiling.cpp b/intern/cycles/util/util_profiling.cpp index 073b09f719f..5343f076e22 100644 --- a/intern/cycles/util/util_profiling.cpp +++ b/intern/cycles/util/util_profiling.cpp @@ -48,13 +48,7 @@ void Profiler::run() } if (cur_shader >= 0 && cur_shader < shader_samples.size()) { - /* Only consider the active shader during events whose runtime significantly depends on it. - */ - if (((cur_event >= PROFILING_SHADER_EVAL) && (cur_event <= PROFILING_SUBSURFACE)) || - ((cur_event >= PROFILING_CLOSURE_EVAL) && - (cur_event <= PROFILING_CLOSURE_VOLUME_SAMPLE))) { - shader_samples[cur_shader]++; - } + shader_samples[cur_shader]++; } if (cur_object >= 0 && cur_object < object_samples.size()) { diff --git a/intern/cycles/util/util_profiling.h b/intern/cycles/util/util_profiling.h index ceec08ed894..96bb682c50e 100644 --- a/intern/cycles/util/util_profiling.h +++ b/intern/cycles/util/util_profiling.h @@ -28,38 +28,30 @@ CCL_NAMESPACE_BEGIN enum ProfilingEvent : uint32_t { PROFILING_UNKNOWN, PROFILING_RAY_SETUP, - PROFILING_PATH_INTEGRATE, - PROFILING_SCENE_INTERSECT, - PROFILING_INDIRECT_EMISSION, - PROFILING_VOLUME, - PROFILING_SHADER_SETUP, - PROFILING_SHADER_EVAL, - PROFILING_SHADER_APPLY, - PROFILING_AO, - PROFILING_SUBSURFACE, - PROFILING_CONNECT_LIGHT, - PROFILING_SURFACE_BOUNCE, - PROFILING_WRITE_RESULT, - - PROFILING_INTERSECT, - PROFILING_INTERSECT_LOCAL, - PROFILING_INTERSECT_SHADOW_ALL, - PROFILING_INTERSECT_VOLUME, - PROFILING_INTERSECT_VOLUME_ALL, - - PROFILING_CLOSURE_EVAL, - PROFILING_CLOSURE_SAMPLE, - PROFILING_CLOSURE_VOLUME_EVAL, - PROFILING_CLOSURE_VOLUME_SAMPLE, - - PROFILING_DENOISING, - PROFILING_DENOISING_CONSTRUCT_TRANSFORM, - 
PROFILING_DENOISING_RECONSTRUCT, - PROFILING_DENOISING_DIVIDE_SHADOW, - PROFILING_DENOISING_NON_LOCAL_MEANS, - PROFILING_DENOISING_COMBINE_HALVES, - PROFILING_DENOISING_GET_FEATURE, - PROFILING_DENOISING_DETECT_OUTLIERS, + + PROFILING_INTERSECT_CLOSEST, + PROFILING_INTERSECT_SUBSURFACE, + PROFILING_INTERSECT_SHADOW, + PROFILING_INTERSECT_VOLUME_STACK, + + PROFILING_SHADE_SURFACE_SETUP, + PROFILING_SHADE_SURFACE_EVAL, + PROFILING_SHADE_SURFACE_DIRECT_LIGHT, + PROFILING_SHADE_SURFACE_INDIRECT_LIGHT, + PROFILING_SHADE_SURFACE_AO, + PROFILING_SHADE_SURFACE_PASSES, + + PROFILING_SHADE_VOLUME_SETUP, + PROFILING_SHADE_VOLUME_INTEGRATE, + PROFILING_SHADE_VOLUME_DIRECT_LIGHT, + PROFILING_SHADE_VOLUME_INDIRECT_LIGHT, + + PROFILING_SHADE_SHADOW_SETUP, + PROFILING_SHADE_SHADOW_SURFACE, + PROFILING_SHADE_SHADOW_VOLUME, + + PROFILING_SHADE_LIGHT_SETUP, + PROFILING_SHADE_LIGHT_EVAL, PROFILING_NUM_EVENTS, }; @@ -136,37 +128,51 @@ class ProfilingHelper { state->event = event; } + ~ProfilingHelper() + { + state->event = previous_event; + } + inline void set_event(ProfilingEvent event) { state->event = event; } - inline void set_shader(int shader) + protected: + ProfilingState *state; + uint32_t previous_event; +}; + +class ProfilingWithShaderHelper : public ProfilingHelper { + public: + ProfilingWithShaderHelper(ProfilingState *state, ProfilingEvent event) + : ProfilingHelper(state, event) { - state->shader = shader; - if (state->active) { - assert(shader < state->shader_hits.size()); - state->shader_hits[shader]++; - } } - inline void set_object(int object) + ~ProfilingWithShaderHelper() { - state->object = object; - if (state->active) { - assert(object < state->object_hits.size()); - state->object_hits[object]++; - } + state->object = -1; + state->shader = -1; } - ~ProfilingHelper() + inline void set_shader(int object, int shader) { - state->event = previous_event; + if (state->active) { + state->shader = shader; + state->object = object; + + if (shader >= 0) { + assert(shader < 
state->shader_hits.size()); + state->shader_hits[shader]++; + } + + if (object >= 0) { + assert(object < state->object_hits.size()); + state->object_hits[object]++; + } + } } - - private: - ProfilingState *state; - uint32_t previous_event; }; CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_progress.h b/intern/cycles/util/util_progress.h index 26534a29dfe..dca8d3d0ab5 100644 --- a/intern/cycles/util/util_progress.h +++ b/intern/cycles/util/util_progress.h @@ -46,7 +46,6 @@ class Progress { substatus = ""; sync_status = ""; sync_substatus = ""; - kernel_status = ""; update_cb = function_null; cancel = false; cancel_message = ""; @@ -87,7 +86,6 @@ class Progress { substatus = ""; sync_status = ""; sync_substatus = ""; - kernel_status = ""; cancel = false; cancel_message = ""; error = false; @@ -316,24 +314,6 @@ class Progress { } } - /* kernel status */ - - void set_kernel_status(const string &kernel_status_) - { - { - thread_scoped_lock lock(progress_mutex); - kernel_status = kernel_status_; - } - - set_update(); - } - - void get_kernel_status(string &kernel_status_) - { - thread_scoped_lock lock(progress_mutex); - kernel_status_ = kernel_status; - } - /* callback */ void set_update() @@ -378,8 +358,6 @@ class Progress { string sync_status; string sync_substatus; - string kernel_status; - volatile bool cancel; string cancel_message; diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h index 8e8caa98a1b..b4a153c329f 100644 --- a/intern/cycles/util/util_simd.h +++ b/intern/cycles/util/util_simd.h @@ -61,14 +61,14 @@ static struct TrueTy { { return true; } -} True ccl_maybe_unused; +} True ccl_attr_maybe_unused; static struct FalseTy { __forceinline operator bool() const { return false; } -} False ccl_maybe_unused; +} False ccl_attr_maybe_unused; static struct ZeroTy { __forceinline operator float() const @@ -79,7 +79,7 @@ static struct ZeroTy { { return 0; } -} zero ccl_maybe_unused; +} zero ccl_attr_maybe_unused; static struct OneTy { 
__forceinline operator float() const @@ -90,7 +90,7 @@ static struct OneTy { { return 1; } -} one ccl_maybe_unused; +} one ccl_attr_maybe_unused; static struct NegInfTy { __forceinline operator float() const @@ -101,7 +101,7 @@ static struct NegInfTy { { return std::numeric_limits<int>::min(); } -} neg_inf ccl_maybe_unused; +} neg_inf ccl_attr_maybe_unused; static struct PosInfTy { __forceinline operator float() const @@ -112,10 +112,10 @@ static struct PosInfTy { { return std::numeric_limits<int>::max(); } -} inf ccl_maybe_unused, pos_inf ccl_maybe_unused; +} inf ccl_attr_maybe_unused, pos_inf ccl_attr_maybe_unused; static struct StepTy { -} step ccl_maybe_unused; +} step ccl_attr_maybe_unused; #endif diff --git a/intern/cycles/util/util_static_assert.h b/intern/cycles/util/util_static_assert.h index d809f2e06d7..7df52d462b7 100644 --- a/intern/cycles/util/util_static_assert.h +++ b/intern/cycles/util/util_static_assert.h @@ -24,9 +24,9 @@ CCL_NAMESPACE_BEGIN -#if defined(__KERNEL_OPENCL__) || defined(CYCLES_CUBIN_CC) +#if defined(CYCLES_CUBIN_CC) # define static_assert(statement, message) -#endif /* __KERNEL_OPENCL__ */ +#endif #define static_assert_align(st, align) \ static_assert((sizeof(st) % (align) == 0), "Structure must be strictly aligned") // NOLINT diff --git a/intern/cycles/util/util_string.cpp b/intern/cycles/util/util_string.cpp index 4dfebf14923..9c0b2ca50bb 100644 --- a/intern/cycles/util/util_string.cpp +++ b/intern/cycles/util/util_string.cpp @@ -17,6 +17,9 @@ #include <stdarg.h> #include <stdio.h> +#include <algorithm> +#include <cctype> + #include "util/util_foreach.h" #include "util/util_string.h" #include "util/util_windows.h" @@ -107,24 +110,26 @@ void string_split(vector<string> &tokens, } } -bool string_startswith(const string &s, const char *start) +bool string_startswith(const string_view s, const string_view start) { - size_t len = strlen(start); + const size_t len = start.size(); - if (len > s.size()) - return 0; - else - return 
strncmp(s.c_str(), start, len) == 0; + if (len > s.size()) { + return false; + } + + return strncmp(s.c_str(), start.data(), len) == 0; } -bool string_endswith(const string &s, const string &end) +bool string_endswith(const string_view s, const string_view end) { - size_t len = end.length(); + const size_t len = end.size(); - if (len > s.size()) - return 0; - else - return s.compare(s.length() - len, len, end) == 0; + if (len > s.size()) { + return false; + } + + return strncmp(s.c_str() + s.size() - len, end.data(), len) == 0; } string string_strip(const string &s) @@ -172,6 +177,13 @@ string to_string(const char *str) return string(str); } +string string_to_lower(const string &s) +{ + string r = s; + std::transform(r.begin(), r.end(), r.begin(), [](char c) { return std::tolower(c); }); + return r; +} + /* Wide char strings helpers for Windows. */ #ifdef _WIN32 diff --git a/intern/cycles/util/util_string.h b/intern/cycles/util/util_string.h index f2272819b2f..55462cfd8b8 100644 --- a/intern/cycles/util/util_string.h +++ b/intern/cycles/util/util_string.h @@ -21,6 +21,11 @@ #include <string.h> #include <string> +/* Use string view implementation from OIIO. + * Ideally, need to switch to `std::string_view`, but this first requires getting rid of using + * namespace OIIO as it causes symbol collision. 
*/ +#include <OpenImageIO/string_view.h> + #include "util/util_vector.h" CCL_NAMESPACE_BEGIN @@ -31,6 +36,8 @@ using std::string; using std::stringstream; using std::to_string; +using OIIO::string_view; + #ifdef __GNUC__ # define PRINTF_ATTRIBUTE __attribute__((format(printf, 1, 2))) #else @@ -45,12 +52,13 @@ void string_split(vector<string> &tokens, const string &separators = "\t ", bool skip_empty_tokens = true); void string_replace(string &haystack, const string &needle, const string &other); -bool string_startswith(const string &s, const char *start); -bool string_endswith(const string &s, const string &end); +bool string_startswith(string_view s, string_view start); +bool string_endswith(string_view s, string_view end); string string_strip(const string &s); string string_remove_trademark(const string &s); string string_from_bool(const bool var); string to_string(const char *str); +string string_to_lower(const string &s); /* Wide char strings are only used on Windows to deal with non-ASCII * characters in file names and such. No reason to use such strings diff --git a/intern/cycles/util/util_system.cpp b/intern/cycles/util/util_system.cpp index b010881058b..be8c2fb505a 100644 --- a/intern/cycles/util/util_system.cpp +++ b/intern/cycles/util/util_system.cpp @@ -403,4 +403,13 @@ size_t system_physical_ram() #endif } +uint64_t system_self_process_id() +{ +#ifdef _WIN32 + return GetCurrentProcessId(); +#else + return getpid(); +#endif +} + CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_system.h b/intern/cycles/util/util_system.h index c4db8b74339..a1797e6ca44 100644 --- a/intern/cycles/util/util_system.h +++ b/intern/cycles/util/util_system.h @@ -65,6 +65,9 @@ size_t system_physical_ram(); /* Start a new process of the current application with the given arguments. */ bool system_call_self(const vector<string> &args); +/* Get identifier of the currently running process. 
*/ +uint64_t system_self_process_id(); + CCL_NAMESPACE_END #endif /* __UTIL_SYSTEM_H__ */ diff --git a/intern/cycles/util/util_tbb.h b/intern/cycles/util/util_tbb.h index 73e0f92d19c..8f84377ac8c 100644 --- a/intern/cycles/util/util_tbb.h +++ b/intern/cycles/util/util_tbb.h @@ -23,6 +23,7 @@ #include <tbb/enumerable_thread_specific.h> #include <tbb/parallel_for.h> +#include <tbb/parallel_for_each.h> #include <tbb/task_arena.h> #include <tbb/task_group.h> diff --git a/intern/cycles/util/util_texture.h b/intern/cycles/util/util_texture.h index 71bf9c65911..4de66bf5f46 100644 --- a/intern/cycles/util/util_texture.h +++ b/intern/cycles/util/util_texture.h @@ -85,8 +85,6 @@ typedef struct TextureInfo { uint64_t data; /* Data Type */ uint data_type; - /* Buffer number for OpenCL. */ - uint cl_buffer; /* Interpolation and extension type. */ uint interpolation, extension; /* Dimensions. */ diff --git a/intern/cycles/util/util_transform.h b/intern/cycles/util/util_transform.h index f79eac4cbcf..e9cd3b0b483 100644 --- a/intern/cycles/util/util_transform.h +++ b/intern/cycles/util/util_transform.h @@ -498,36 +498,12 @@ Transform transform_from_viewplane(BoundBox2D &viewplane); #endif -/* TODO(sergey): This is only for until we've got OpenCL 2.0 - * on all devices we consider supported. It'll be replaced with - * generic address space. - */ +/* TODO: This can be removed when we know if no devices will require explicit + * address space qualifiers for this case. 
*/ -#ifdef __KERNEL_OPENCL__ - -# define OPENCL_TRANSFORM_ADDRSPACE_GLUE(a, b) a##b -# define OPENCL_TRANSFORM_ADDRSPACE_DECLARE(function) \ - ccl_device_inline float3 OPENCL_TRANSFORM_ADDRSPACE_GLUE(function, _addrspace)( \ - ccl_addr_space const Transform *t, const float3 a) \ - { \ - Transform private_tfm = *t; \ - return function(&private_tfm, a); \ - } - -OPENCL_TRANSFORM_ADDRSPACE_DECLARE(transform_point) -OPENCL_TRANSFORM_ADDRSPACE_DECLARE(transform_direction) -OPENCL_TRANSFORM_ADDRSPACE_DECLARE(transform_direction_transposed) - -# undef OPENCL_TRANSFORM_ADDRSPACE_DECLARE -# undef OPENCL_TRANSFORM_ADDRSPACE_GLUE -# define transform_point_auto transform_point_addrspace -# define transform_direction_auto transform_direction_addrspace -# define transform_direction_transposed_auto transform_direction_transposed_addrspace -#else -# define transform_point_auto transform_point -# define transform_direction_auto transform_direction -# define transform_direction_transposed_auto transform_direction_transposed -#endif +#define transform_point_auto transform_point +#define transform_direction_auto transform_direction +#define transform_direction_transposed_auto transform_direction_transposed CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h index 87358877e3c..442c32b3a3d 100644 --- a/intern/cycles/util/util_types.h +++ b/intern/cycles/util/util_types.h @@ -17,9 +17,7 @@ #ifndef __UTIL_TYPES_H__ #define __UTIL_TYPES_H__ -#ifndef __KERNEL_OPENCL__ -# include <stdlib.h> -#endif +#include <stdlib.h> /* Standard Integer Types */ @@ -44,18 +42,12 @@ CCL_NAMESPACE_BEGIN /* Shorter Unsigned Names */ -#ifndef __KERNEL_OPENCL__ typedef unsigned char uchar; typedef unsigned int uint; typedef unsigned short ushort; -#endif /* Fixed Bits Types */ -#ifdef __KERNEL_OPENCL__ -typedef unsigned long uint64_t; -#endif - #ifndef __KERNEL_GPU__ /* Generic Memory Pointer */ diff --git a/intern/cycles/util/util_unique_ptr.h 
b/intern/cycles/util/util_unique_ptr.h index 3aaaf083eff..3181eafd43d 100644 --- a/intern/cycles/util/util_unique_ptr.h +++ b/intern/cycles/util/util_unique_ptr.h @@ -21,6 +21,7 @@ CCL_NAMESPACE_BEGIN +using std::make_unique; using std::unique_ptr; CCL_NAMESPACE_END |