git.blender.org/blender.git
Diffstat (limited to 'intern/cycles')
-rw-r--r--intern/cycles/CMakeLists.txt24
-rw-r--r--intern/cycles/app/CMakeLists.txt18
-rw-r--r--intern/cycles/app/cycles_standalone.cpp8
-rw-r--r--intern/cycles/app/cycles_xml.cpp2
-rw-r--r--intern/cycles/blender/CMakeLists.txt10
-rw-r--r--intern/cycles/blender/addon/__init__.py9
-rw-r--r--intern/cycles/blender/addon/engine.py132
-rw-r--r--intern/cycles/blender/addon/presets.py50
-rw-r--r--intern/cycles/blender/addon/properties.py499
-rw-r--r--intern/cycles/blender/addon/ui.py480
-rw-r--r--intern/cycles/blender/addon/version_update.py44
-rw-r--r--intern/cycles/blender/blender_camera.cpp13
-rw-r--r--intern/cycles/blender/blender_device.cpp11
-rw-r--r--intern/cycles/blender/blender_geometry.cpp6
-rw-r--r--intern/cycles/blender/blender_gpu_display.cpp787
-rw-r--r--intern/cycles/blender/blender_gpu_display.h215
-rw-r--r--intern/cycles/blender/blender_light.cpp18
-rw-r--r--intern/cycles/blender/blender_object.cpp2
-rw-r--r--intern/cycles/blender/blender_python.cpp262
-rw-r--r--intern/cycles/blender/blender_session.cpp702
-rw-r--r--intern/cycles/blender/blender_session.h59
-rw-r--r--intern/cycles/blender/blender_shader.cpp33
-rw-r--r--intern/cycles/blender/blender_sync.cpp468
-rw-r--r--intern/cycles/blender/blender_sync.h35
-rw-r--r--intern/cycles/blender/blender_viewport.cpp43
-rw-r--r--intern/cycles/blender/blender_viewport.h5
-rw-r--r--intern/cycles/bvh/bvh_build.cpp18
-rw-r--r--intern/cycles/bvh/bvh_embree.cpp89
-rw-r--r--intern/cycles/bvh/bvh_params.h21
-rw-r--r--intern/cycles/cmake/external_libs.cmake3
-rw-r--r--intern/cycles/device/CMakeLists.txt110
-rw-r--r--intern/cycles/device/cpu/device.cpp64
-rw-r--r--intern/cycles/device/cpu/device.h (renamed from intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl)27
-rw-r--r--intern/cycles/device/cpu/device_impl.cpp481
-rw-r--r--intern/cycles/device/cpu/device_impl.h99
-rw-r--r--intern/cycles/device/cpu/kernel.cpp61
-rw-r--r--intern/cycles/device/cpu/kernel.h111
-rw-r--r--intern/cycles/device/cpu/kernel_function.h124
-rw-r--r--intern/cycles/device/cpu/kernel_thread_globals.cpp85
-rw-r--r--intern/cycles/device/cpu/kernel_thread_globals.h57
-rw-r--r--intern/cycles/device/cuda/device.cpp (renamed from intern/cycles/device/device_cuda.cpp)51
-rw-r--r--intern/cycles/device/cuda/device.h (renamed from intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl)29
-rw-r--r--intern/cycles/device/cuda/device_cuda.h270
-rw-r--r--intern/cycles/device/cuda/device_cuda_impl.cpp2714
-rw-r--r--intern/cycles/device/cuda/device_impl.cpp1370
-rw-r--r--intern/cycles/device/cuda/device_impl.h155
-rw-r--r--intern/cycles/device/cuda/graphics_interop.cpp102
-rw-r--r--intern/cycles/device/cuda/graphics_interop.h66
-rw-r--r--intern/cycles/device/cuda/kernel.cpp69
-rw-r--r--intern/cycles/device/cuda/kernel.h56
-rw-r--r--intern/cycles/device/cuda/queue.cpp220
-rw-r--r--intern/cycles/device/cuda/queue.h67
-rw-r--r--intern/cycles/device/cuda/util.cpp61
-rw-r--r--intern/cycles/device/cuda/util.h65
-rw-r--r--intern/cycles/device/device.cpp476
-rw-r--r--intern/cycles/device/device.h366
-rw-r--r--intern/cycles/device/device_cpu.cpp1680
-rw-r--r--intern/cycles/device/device_denoise.cpp88
-rw-r--r--intern/cycles/device/device_denoise.h110
-rw-r--r--intern/cycles/device/device_denoising.cpp353
-rw-r--r--intern/cycles/device/device_denoising.h197
-rw-r--r--intern/cycles/device/device_graphics_interop.cpp (renamed from intern/cycles/kernel/kernels/opencl/kernel_path_init.cl)11
-rw-r--r--intern/cycles/device/device_graphics_interop.h55
-rw-r--r--intern/cycles/device/device_intern.h58
-rw-r--r--intern/cycles/device/device_kernel.cpp157
-rw-r--r--intern/cycles/device/device_kernel.h (renamed from intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl)25
-rw-r--r--intern/cycles/device/device_memory.cpp7
-rw-r--r--intern/cycles/device/device_memory.h136
-rw-r--r--intern/cycles/device/device_multi.cpp826
-rw-r--r--intern/cycles/device/device_network.cpp812
-rw-r--r--intern/cycles/device/device_network.h490
-rw-r--r--intern/cycles/device/device_opencl.cpp245
-rw-r--r--intern/cycles/device/device_optix.cpp1936
-rw-r--r--intern/cycles/device/device_queue.cpp87
-rw-r--r--intern/cycles/device/device_queue.h113
-rw-r--r--intern/cycles/device/device_split_kernel.cpp389
-rw-r--r--intern/cycles/device/device_split_kernel.h145
-rw-r--r--intern/cycles/device/device_task.cpp182
-rw-r--r--intern/cycles/device/device_task.h188
-rw-r--r--intern/cycles/device/dummy/device.cpp (renamed from intern/cycles/device/device_dummy.cpp)24
-rw-r--r--intern/cycles/device/dummy/device.h (renamed from intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl)21
-rw-r--r--intern/cycles/device/multi/device.cpp423
-rw-r--r--intern/cycles/device/multi/device.h (renamed from intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl)21
-rw-r--r--intern/cycles/device/opencl/device_opencl.h658
-rw-r--r--intern/cycles/device/opencl/device_opencl_impl.cpp2113
-rw-r--r--intern/cycles/device/opencl/memory_manager.cpp264
-rw-r--r--intern/cycles/device/opencl/memory_manager.h105
-rw-r--r--intern/cycles/device/opencl/opencl_util.cpp1326
-rw-r--r--intern/cycles/device/optix/device.cpp105
-rw-r--r--intern/cycles/device/optix/device.h35
-rw-r--r--intern/cycles/device/optix/device_impl.cpp1573
-rw-r--r--intern/cycles/device/optix/device_impl.h186
-rw-r--r--intern/cycles/device/optix/queue.cpp144
-rw-r--r--intern/cycles/device/optix/queue.h39
-rw-r--r--intern/cycles/device/optix/util.h45
-rw-r--r--intern/cycles/graph/node.cpp2
-rw-r--r--intern/cycles/graph/node.h18
-rw-r--r--intern/cycles/integrator/CMakeLists.txt76
-rw-r--r--intern/cycles/integrator/adaptive_sampling.cpp71
-rw-r--r--intern/cycles/integrator/adaptive_sampling.h55
-rw-r--r--intern/cycles/integrator/denoiser.cpp204
-rw-r--r--intern/cycles/integrator/denoiser.h135
-rw-r--r--intern/cycles/integrator/denoiser_device.cpp106
-rw-r--r--intern/cycles/integrator/denoiser_device.h40
-rw-r--r--intern/cycles/integrator/denoiser_oidn.cpp628
-rw-r--r--intern/cycles/integrator/denoiser_oidn.h47
-rw-r--r--intern/cycles/integrator/denoiser_optix.cpp (renamed from intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl)26
-rw-r--r--intern/cycles/integrator/denoiser_optix.h (renamed from intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl)21
-rw-r--r--intern/cycles/integrator/pass_accessor.cpp318
-rw-r--r--intern/cycles/integrator/pass_accessor.h160
-rw-r--r--intern/cycles/integrator/pass_accessor_cpu.cpp183
-rw-r--r--intern/cycles/integrator/pass_accessor_cpu.h77
-rw-r--r--intern/cycles/integrator/pass_accessor_gpu.cpp118
-rw-r--r--intern/cycles/integrator/pass_accessor_gpu.h68
-rw-r--r--intern/cycles/integrator/path_trace.cpp1144
-rw-r--r--intern/cycles/integrator/path_trace.h324
-rw-r--r--intern/cycles/integrator/path_trace_work.cpp203
-rw-r--r--intern/cycles/integrator/path_trace_work.h194
-rw-r--r--intern/cycles/integrator/path_trace_work_cpu.cpp281
-rw-r--r--intern/cycles/integrator/path_trace_work_cpu.h82
-rw-r--r--intern/cycles/integrator/path_trace_work_gpu.cpp933
-rw-r--r--intern/cycles/integrator/path_trace_work_gpu.h165
-rw-r--r--intern/cycles/integrator/render_scheduler.cpp1187
-rw-r--r--intern/cycles/integrator/render_scheduler.h466
-rw-r--r--intern/cycles/integrator/shader_eval.cpp173
-rw-r--r--intern/cycles/integrator/shader_eval.h61
-rw-r--r--intern/cycles/integrator/tile.cpp108
-rw-r--r--intern/cycles/integrator/tile.h56
-rw-r--r--intern/cycles/integrator/work_balancer.cpp99
-rw-r--r--intern/cycles/integrator/work_balancer.h42
-rw-r--r--intern/cycles/integrator/work_tile_scheduler.cpp138
-rw-r--r--intern/cycles/integrator/work_tile_scheduler.h98
-rw-r--r--intern/cycles/kernel/CMakeLists.txt314
-rw-r--r--intern/cycles/kernel/bvh/bvh.h32
-rw-r--r--intern/cycles/kernel/bvh/bvh_embree.h21
-rw-r--r--intern/cycles/kernel/bvh/bvh_local.h8
-rw-r--r--intern/cycles/kernel/bvh/bvh_nodes.h10
-rw-r--r--intern/cycles/kernel/bvh/bvh_shadow_all.h105
-rw-r--r--intern/cycles/kernel/bvh/bvh_traversal.h26
-rw-r--r--intern/cycles/kernel/bvh/bvh_types.h5
-rw-r--r--intern/cycles/kernel/bvh/bvh_util.h110
-rw-r--r--intern/cycles/kernel/bvh/bvh_volume.h13
-rw-r--r--intern/cycles/kernel/bvh/bvh_volume_all.h14
-rw-r--r--intern/cycles/kernel/closure/alloc.h2
-rw-r--r--intern/cycles/kernel/closure/bsdf.h91
-rw-r--r--intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h25
-rw-r--r--intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h15
-rw-r--r--intern/cycles/kernel/closure/bsdf_diffuse.h13
-rw-r--r--intern/cycles/kernel/closure/bsdf_diffuse_ramp.h5
-rw-r--r--intern/cycles/kernel/closure/bsdf_hair.h14
-rw-r--r--intern/cycles/kernel/closure/bsdf_hair_principled.h25
-rw-r--r--intern/cycles/kernel/closure/bsdf_microfacet.h31
-rw-r--r--intern/cycles/kernel/closure/bsdf_microfacet_multi.h6
-rw-r--r--intern/cycles/kernel/closure/bsdf_oren_nayar.h13
-rw-r--r--intern/cycles/kernel/closure/bsdf_phong_ramp.h5
-rw-r--r--intern/cycles/kernel/closure/bsdf_principled_diffuse.h15
-rw-r--r--intern/cycles/kernel/closure/bsdf_principled_sheen.h7
-rw-r--r--intern/cycles/kernel/closure/bsdf_reflection.h5
-rw-r--r--intern/cycles/kernel/closure/bsdf_refraction.h5
-rw-r--r--intern/cycles/kernel/closure/bsdf_toon.h14
-rw-r--r--intern/cycles/kernel/closure/bsdf_transparent.h5
-rw-r--r--intern/cycles/kernel/closure/bsdf_util.h5
-rw-r--r--intern/cycles/kernel/closure/bssrdf.h406
-rw-r--r--intern/cycles/kernel/closure/emissive.h2
-rw-r--r--intern/cycles/kernel/closure/volume.h109
-rw-r--r--intern/cycles/kernel/device/cpu/compat.h (renamed from intern/cycles/kernel/kernel_compat_cpu.h)59
-rw-r--r--intern/cycles/kernel/device/cpu/globals.h61
-rw-r--r--intern/cycles/kernel/device/cpu/image.h (renamed from intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h)9
-rw-r--r--intern/cycles/kernel/device/cpu/kernel.cpp (renamed from intern/cycles/kernel/kernels/cpu/kernel.cpp)4
-rw-r--r--intern/cycles/kernel/device/cpu/kernel.h (renamed from intern/cycles/kernel/kernel.h)25
-rw-r--r--intern/cycles/kernel/device/cpu/kernel_arch.h113
-rw-r--r--intern/cycles/kernel/device/cpu/kernel_arch_impl.h235
-rw-r--r--intern/cycles/kernel/device/cpu/kernel_avx.cpp (renamed from intern/cycles/kernel/kernels/cpu/kernel_avx.cpp)4
-rw-r--r--intern/cycles/kernel/device/cpu/kernel_avx2.cpp (renamed from intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp)4
-rw-r--r--intern/cycles/kernel/device/cpu/kernel_sse2.cpp (renamed from intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp)4
-rw-r--r--intern/cycles/kernel/device/cpu/kernel_sse3.cpp (renamed from intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp)4
-rw-r--r--intern/cycles/kernel/device/cpu/kernel_sse41.cpp (renamed from intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp)4
-rw-r--r--intern/cycles/kernel/device/cuda/compat.h (renamed from intern/cycles/kernel/kernel_compat_cuda.h)139
-rw-r--r--intern/cycles/kernel/device/cuda/config.h114
-rw-r--r--intern/cycles/kernel/device/cuda/globals.h48
-rw-r--r--intern/cycles/kernel/device/cuda/kernel.cu (renamed from intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl)18
-rw-r--r--intern/cycles/kernel/device/gpu/image.h (renamed from intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h)55
-rw-r--r--intern/cycles/kernel/device/gpu/kernel.h843
-rw-r--r--intern/cycles/kernel/device/gpu/parallel_active_index.h83
-rw-r--r--intern/cycles/kernel/device/gpu/parallel_prefix_sum.h46
-rw-r--r--intern/cycles/kernel/device/gpu/parallel_reduce.h83
-rw-r--r--intern/cycles/kernel/device/gpu/parallel_sorted_index.h49
-rw-r--r--intern/cycles/kernel/device/optix/compat.h (renamed from intern/cycles/kernel/kernel_compat_optix.h)90
-rw-r--r--intern/cycles/kernel/device/optix/globals.h59
-rw-r--r--intern/cycles/kernel/device/optix/kernel.cu (renamed from intern/cycles/kernel/kernels/optix/kernel_optix.cu)168
-rw-r--r--intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu29
-rw-r--r--intern/cycles/kernel/filter/filter.h52
-rw-r--r--intern/cycles/kernel/filter/filter_defines.h72
-rw-r--r--intern/cycles/kernel/filter/filter_features.h156
-rw-r--r--intern/cycles/kernel/filter/filter_features_sse.h118
-rw-r--r--intern/cycles/kernel/filter/filter_kernel.h50
-rw-r--r--intern/cycles/kernel/filter/filter_nlm_cpu.h254
-rw-r--r--intern/cycles/kernel/filter/filter_nlm_gpu.h255
-rw-r--r--intern/cycles/kernel/filter/filter_prefilter.h303
-rw-r--r--intern/cycles/kernel/filter/filter_reconstruction.h140
-rw-r--r--intern/cycles/kernel/filter/filter_transform.h120
-rw-r--r--intern/cycles/kernel/filter/filter_transform_gpu.h129
-rw-r--r--intern/cycles/kernel/filter/filter_transform_sse.h129
-rw-r--r--intern/cycles/kernel/geom/geom.h3
-rw-r--r--intern/cycles/kernel/geom/geom_attribute.h12
-rw-r--r--intern/cycles/kernel/geom/geom_curve.h21
-rw-r--r--intern/cycles/kernel/geom/geom_curve_intersect.h68
-rw-r--r--intern/cycles/kernel/geom/geom_motion_curve.h12
-rw-r--r--intern/cycles/kernel/geom/geom_motion_triangle.h12
-rw-r--r--intern/cycles/kernel/geom/geom_motion_triangle_intersect.h76
-rw-r--r--intern/cycles/kernel/geom/geom_motion_triangle_shader.h16
-rw-r--r--intern/cycles/kernel/geom/geom_object.h243
-rw-r--r--intern/cycles/kernel/geom/geom_patch.h20
-rw-r--r--intern/cycles/kernel/geom/geom_primitive.h39
-rw-r--r--intern/cycles/kernel/geom/geom_shader_data.h373
-rw-r--r--intern/cycles/kernel/geom/geom_subd_triangle.h29
-rw-r--r--intern/cycles/kernel/geom/geom_triangle.h37
-rw-r--r--intern/cycles/kernel/geom/geom_triangle_intersect.h81
-rw-r--r--intern/cycles/kernel/geom/geom_volume.h6
-rw-r--r--intern/cycles/kernel/integrator/integrator_init_from_bake.h182
-rw-r--r--intern/cycles/kernel/integrator/integrator_init_from_camera.h120
-rw-r--r--intern/cycles/kernel/integrator/integrator_intersect_closest.h248
-rw-r--r--intern/cycles/kernel/integrator/integrator_intersect_shadow.h144
-rw-r--r--intern/cycles/kernel/integrator/integrator_intersect_subsurface.h (renamed from intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl)27
-rw-r--r--intern/cycles/kernel/integrator/integrator_intersect_volume_stack.h198
-rw-r--r--intern/cycles/kernel/integrator/integrator_megakernel.h93
-rw-r--r--intern/cycles/kernel/integrator/integrator_shade_background.h215
-rw-r--r--intern/cycles/kernel/integrator/integrator_shade_light.h126
-rw-r--r--intern/cycles/kernel/integrator/integrator_shade_shadow.h182
-rw-r--r--intern/cycles/kernel/integrator/integrator_shade_surface.h502
-rw-r--r--intern/cycles/kernel/integrator/integrator_shade_volume.h1019
-rw-r--r--intern/cycles/kernel/integrator/integrator_state.h185
-rw-r--r--intern/cycles/kernel/integrator/integrator_state_flow.h144
-rw-r--r--intern/cycles/kernel/integrator/integrator_state_template.h163
-rw-r--r--intern/cycles/kernel/integrator/integrator_state_util.h273
-rw-r--r--intern/cycles/kernel/integrator/integrator_subsurface.h623
-rw-r--r--intern/cycles/kernel/integrator/integrator_volume_stack.h223
-rw-r--r--intern/cycles/kernel/kernel_accumulate.h972
-rw-r--r--intern/cycles/kernel/kernel_adaptive_sampling.h274
-rw-r--r--intern/cycles/kernel/kernel_bake.h514
-rw-r--r--intern/cycles/kernel/kernel_camera.h72
-rw-r--r--intern/cycles/kernel/kernel_color.h9
-rw-r--r--intern/cycles/kernel/kernel_compat_opencl.h177
-rw-r--r--intern/cycles/kernel/kernel_differential.h73
-rw-r--r--intern/cycles/kernel/kernel_emission.h374
-rw-r--r--intern/cycles/kernel/kernel_film.h567
-rw-r--r--intern/cycles/kernel/kernel_globals.h248
-rw-r--r--intern/cycles/kernel/kernel_id_passes.h35
-rw-r--r--intern/cycles/kernel/kernel_jitter.h252
-rw-r--r--intern/cycles/kernel/kernel_light.h406
-rw-r--r--intern/cycles/kernel/kernel_light_background.h25
-rw-r--r--intern/cycles/kernel/kernel_light_common.h6
-rw-r--r--intern/cycles/kernel/kernel_lookup_table.h56
-rw-r--r--intern/cycles/kernel/kernel_math.h5
-rw-r--r--intern/cycles/kernel/kernel_montecarlo.h5
-rw-r--r--intern/cycles/kernel/kernel_passes.h414
-rw-r--r--intern/cycles/kernel/kernel_path.h709
-rw-r--r--intern/cycles/kernel/kernel_path_branched.h556
-rw-r--r--intern/cycles/kernel/kernel_path_common.h48
-rw-r--r--intern/cycles/kernel/kernel_path_state.h383
-rw-r--r--intern/cycles/kernel/kernel_path_subsurface.h139
-rw-r--r--intern/cycles/kernel/kernel_path_surface.h360
-rw-r--r--intern/cycles/kernel/kernel_path_volume.h260
-rw-r--r--intern/cycles/kernel/kernel_profiling.h24
-rw-r--r--intern/cycles/kernel/kernel_projection.h5
-rw-r--r--intern/cycles/kernel/kernel_queues.h147
-rw-r--r--intern/cycles/kernel/kernel_random.h228
-rw-r--r--intern/cycles/kernel/kernel_shader.h1043
-rw-r--r--intern/cycles/kernel/kernel_shadow.h466
-rw-r--r--intern/cycles/kernel/kernel_shadow_catcher.h116
-rw-r--r--intern/cycles/kernel/kernel_subsurface.h724
-rw-r--r--intern/cycles/kernel/kernel_textures.h2
-rw-r--r--intern/cycles/kernel/kernel_types.h1030
-rw-r--r--intern/cycles/kernel/kernel_volume.h1440
-rw-r--r--intern/cycles/kernel/kernel_work_stealing.h87
-rw-r--r--intern/cycles/kernel/kernel_write_passes.h53
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter.cpp61
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_avx.cpp39
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_avx2.cpp40
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_cpu.h143
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h331
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_sse2.cpp34
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_sse3.cpp36
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_sse41.cpp38
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_cpu.h100
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h232
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split.cpp62
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp41
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp42
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp36
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp38
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp39
-rw-r--r--intern/cycles/kernel/kernels/cuda/filter.cu413
-rw-r--r--intern/cycles/kernel/kernels/cuda/kernel.cu232
-rw-r--r--intern/cycles/kernel/kernels/cuda/kernel_config.h121
-rw-r--r--intern/cycles/kernel/kernels/cuda/kernel_split.cu156
-rw-r--r--intern/cycles/kernel/kernels/opencl/filter.cl321
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl23
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl23
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl23
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl23
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_background.cl35
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_bake.cl36
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_base.cl88
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_data_init.cl53
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_displace.cl36
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl26
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h358
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl26
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl24
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl24
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl26
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl27
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl24
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl24
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl34
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_split_function.h67
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl24
-rw-r--r--intern/cycles/kernel/osl/background.cpp2
-rw-r--r--intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp2
-rw-r--r--intern/cycles/kernel/osl/bsdf_phong_ramp.cpp2
-rw-r--r--intern/cycles/kernel/osl/emissive.cpp2
-rw-r--r--intern/cycles/kernel/osl/osl_bssrdf.cpp40
-rw-r--r--intern/cycles/kernel/osl/osl_closures.cpp8
-rw-r--r--intern/cycles/kernel/osl/osl_services.cpp158
-rw-r--r--intern/cycles/kernel/osl/osl_services.h16
-rw-r--r--intern/cycles/kernel/osl/osl_shader.cpp40
-rw-r--r--intern/cycles/kernel/osl/osl_shader.h26
-rw-r--r--intern/cycles/kernel/shaders/node_principled_bsdf.osl31
-rw-r--r--intern/cycles/kernel/shaders/node_subsurface_scattering.osl25
-rw-r--r--intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h43
-rw-r--r--intern/cycles/kernel/split/kernel_adaptive_filter_x.h30
-rw-r--r--intern/cycles/kernel/split/kernel_adaptive_filter_y.h29
-rw-r--r--intern/cycles/kernel/split/kernel_adaptive_stopping.h37
-rw-r--r--intern/cycles/kernel/split/kernel_branched.h231
-rw-r--r--intern/cycles/kernel/split/kernel_buffer_update.h154
-rw-r--r--intern/cycles/kernel/split/kernel_data_init.h115
-rw-r--r--intern/cycles/kernel/split/kernel_direct_lighting.h152
-rw-r--r--intern/cycles/kernel/split/kernel_do_volume.h227
-rw-r--r--intern/cycles/kernel/split/kernel_enqueue_inactive.h46
-rw-r--r--intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h149
-rw-r--r--intern/cycles/kernel/split/kernel_indirect_background.h69
-rw-r--r--intern/cycles/kernel/split/kernel_indirect_subsurface.h67
-rw-r--r--intern/cycles/kernel/split/kernel_lamp_emission.h67
-rw-r--r--intern/cycles/kernel/split/kernel_next_iteration_setup.h258
-rw-r--r--intern/cycles/kernel/split/kernel_path_init.h78
-rw-r--r--intern/cycles/kernel/split/kernel_queue_enqueue.h87
-rw-r--r--intern/cycles/kernel/split/kernel_scene_intersect.h83
-rw-r--r--intern/cycles/kernel/split/kernel_shader_eval.h69
-rw-r--r--intern/cycles/kernel/split/kernel_shader_setup.h74
-rw-r--r--intern/cycles/kernel/split/kernel_shader_sort.h97
-rw-r--r--intern/cycles/kernel/split/kernel_shadow_blocked_ao.h59
-rw-r--r--intern/cycles/kernel/split/kernel_shadow_blocked_dl.h98
-rw-r--r--intern/cycles/kernel/split/kernel_split_common.h106
-rw-r--r--intern/cycles/kernel/split/kernel_split_data.h77
-rw-r--r--intern/cycles/kernel/split/kernel_split_data_types.h180
-rw-r--r--intern/cycles/kernel/split/kernel_subsurface_scatter.h264
-rw-r--r--intern/cycles/kernel/svm/svm.h227
-rw-r--r--intern/cycles/kernel/svm/svm_ao.h53
-rw-r--r--intern/cycles/kernel/svm/svm_aov.h42
-rw-r--r--intern/cycles/kernel/svm/svm_attribute.h57
-rw-r--r--intern/cycles/kernel/svm/svm_bevel.h145
-rw-r--r--intern/cycles/kernel/svm/svm_blackbody.h7
-rw-r--r--intern/cycles/kernel/svm/svm_brick.h11
-rw-r--r--intern/cycles/kernel/svm/svm_brightness.h2
-rw-r--r--intern/cycles/kernel/svm/svm_bump.h16
-rw-r--r--intern/cycles/kernel/svm/svm_camera.h12
-rw-r--r--intern/cycles/kernel/svm/svm_checker.h5
-rw-r--r--intern/cycles/kernel/svm/svm_clamp.h17
-rw-r--r--intern/cycles/kernel/svm/svm_closure.h121
-rw-r--r--intern/cycles/kernel/svm/svm_convert.h4
-rw-r--r--intern/cycles/kernel/svm/svm_displace.h21
-rw-r--r--intern/cycles/kernel/svm/svm_fresnel.h4
-rw-r--r--intern/cycles/kernel/svm/svm_gamma.h2
-rw-r--r--intern/cycles/kernel/svm/svm_geometry.h24
-rw-r--r--intern/cycles/kernel/svm/svm_gradient.h2
-rw-r--r--intern/cycles/kernel/svm/svm_hsv.h6
-rw-r--r--intern/cycles/kernel/svm/svm_ies.h10
-rw-r--r--intern/cycles/kernel/svm/svm_image.h26
-rw-r--r--intern/cycles/kernel/svm/svm_invert.h2
-rw-r--r--intern/cycles/kernel/svm/svm_light_path.h50
-rw-r--r--intern/cycles/kernel/svm/svm_magic.h7
-rw-r--r--intern/cycles/kernel/svm/svm_map_range.h19
-rw-r--r--intern/cycles/kernel/svm/svm_mapping.h41
-rw-r--r--intern/cycles/kernel/svm/svm_math.h30
-rw-r--r--intern/cycles/kernel/svm/svm_mix.h17
-rw-r--r--intern/cycles/kernel/svm/svm_musgrave.h19
-rw-r--r--intern/cycles/kernel/svm/svm_noise.h10
-rw-r--r--intern/cycles/kernel/svm/svm_noisetex.h19
-rw-r--r--intern/cycles/kernel/svm/svm_normal.h17
-rw-r--r--intern/cycles/kernel/svm/svm_ramp.h34
-rw-r--r--intern/cycles/kernel/svm/svm_sepcomb_hsv.h34
-rw-r--r--intern/cycles/kernel/svm/svm_sky.h33
-rw-r--r--intern/cycles/kernel/svm/svm_tex_coord.h55
-rw-r--r--intern/cycles/kernel/svm/svm_types.h43
-rw-r--r--intern/cycles/kernel/svm/svm_value.h9
-rw-r--r--intern/cycles/kernel/svm/svm_vector_rotate.h10
-rw-r--r--intern/cycles/kernel/svm/svm_vector_transform.h8
-rw-r--r--intern/cycles/kernel/svm/svm_vertex_color.h48
-rw-r--r--intern/cycles/kernel/svm/svm_voronoi.h148
-rw-r--r--intern/cycles/kernel/svm/svm_voxel.h11
-rw-r--r--intern/cycles/kernel/svm/svm_wave.h9
-rw-r--r--intern/cycles/kernel/svm/svm_wavelength.h4
-rw-r--r--intern/cycles/kernel/svm/svm_white_noise.h13
-rw-r--r--intern/cycles/kernel/svm/svm_wireframe.h18
-rw-r--r--intern/cycles/render/CMakeLists.txt7
-rw-r--r--intern/cycles/render/background.cpp12
-rw-r--r--intern/cycles/render/background.h4
-rw-r--r--intern/cycles/render/bake.cpp112
-rw-r--r--intern/cycles/render/bake.h6
-rw-r--r--intern/cycles/render/buffers.cpp674
-rw-r--r--intern/cycles/render/buffers.h257
-rw-r--r--intern/cycles/render/camera.cpp19
-rw-r--r--intern/cycles/render/camera.h3
-rw-r--r--intern/cycles/render/coverage.cpp155
-rw-r--r--intern/cycles/render/coverage.h52
-rw-r--r--intern/cycles/render/denoising.cpp31
-rw-r--r--intern/cycles/render/denoising.h35
-rw-r--r--intern/cycles/render/film.cpp726
-rw-r--r--intern/cycles/render/film.h55
-rw-r--r--intern/cycles/render/geometry.cpp14
-rw-r--r--intern/cycles/render/gpu_display.cpp227
-rw-r--r--intern/cycles/render/gpu_display.h247
-rw-r--r--intern/cycles/render/graph.h15
-rw-r--r--intern/cycles/render/integrator.cpp214
-rw-r--r--intern/cycles/render/integrator.h36
-rw-r--r--intern/cycles/render/jitter.cpp6
-rw-r--r--intern/cycles/render/light.cpp140
-rw-r--r--intern/cycles/render/light.h5
-rw-r--r--intern/cycles/render/mesh_displace.cpp165
-rw-r--r--intern/cycles/render/nodes.cpp80
-rw-r--r--intern/cycles/render/nodes.h267
-rw-r--r--intern/cycles/render/object.cpp20
-rw-r--r--intern/cycles/render/osl.cpp58
-rw-r--r--intern/cycles/render/pass.cpp427
-rw-r--r--intern/cycles/render/pass.h106
-rw-r--r--intern/cycles/render/scene.cpp189
-rw-r--r--intern/cycles/render/scene.h48
-rw-r--r--intern/cycles/render/session.cpp1294
-rw-r--r--intern/cycles/render/session.h225
-rw-r--r--intern/cycles/render/shader.cpp60
-rw-r--r--intern/cycles/render/shader.h7
-rw-r--r--intern/cycles/render/stats.cpp73
-rw-r--r--intern/cycles/render/svm.cpp17
-rw-r--r--intern/cycles/render/svm.h3
-rw-r--r--intern/cycles/render/tile.cpp934
-rw-r--r--intern/cycles/render/tile.h236
-rw-r--r--intern/cycles/test/CMakeLists.txt5
-rw-r--r--intern/cycles/test/integrator_adaptive_sampling_test.cpp116
-rw-r--r--intern/cycles/test/integrator_render_scheduler_test.cpp37
-rw-r--r--intern/cycles/test/integrator_tile_test.cpp47
-rw-r--r--intern/cycles/test/render_graph_finalize_test.cpp2
-rw-r--r--intern/cycles/test/util_math_test.cpp61
-rw-r--r--intern/cycles/test/util_string_test.cpp36
-rw-r--r--intern/cycles/util/util_atomic.h50
-rw-r--r--intern/cycles/util/util_debug.cpp83
-rw-r--r--intern/cycles/util/util_debug.h67
-rw-r--r--intern/cycles/util/util_defines.h4
-rw-r--r--intern/cycles/util/util_half.h46
-rw-r--r--intern/cycles/util/util_logging.h1
-rw-r--r--intern/cycles/util/util_math.h97
-rw-r--r--intern/cycles/util/util_math_float2.h5
-rw-r--r--intern/cycles/util/util_math_float3.h128
-rw-r--r--intern/cycles/util/util_math_float4.h145
-rw-r--r--intern/cycles/util/util_math_int2.h4
-rw-r--r--intern/cycles/util/util_math_int3.h40
-rw-r--r--intern/cycles/util/util_path.cpp184
-rw-r--r--intern/cycles/util/util_path.h8
-rw-r--r--intern/cycles/util/util_profiling.cpp8
-rw-r--r--intern/cycles/util/util_profiling.h106
-rw-r--r--intern/cycles/util/util_progress.h22
-rw-r--r--intern/cycles/util/util_simd.h14
-rw-r--r--intern/cycles/util/util_static_assert.h4
-rw-r--r--intern/cycles/util/util_string.cpp36
-rw-r--r--intern/cycles/util/util_string.h12
-rw-r--r--intern/cycles/util/util_system.cpp9
-rw-r--r--intern/cycles/util/util_system.h3
-rw-r--r--intern/cycles/util/util_tbb.h1
-rw-r--r--intern/cycles/util/util_texture.h2
-rw-r--r--intern/cycles/util/util_transform.h34
-rw-r--r--intern/cycles/util/util_types.h10
-rw-r--r--intern/cycles/util/util_unique_ptr.h1
482 files changed, 33247 insertions, 42365 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index 381248e9bf1..17096d441f0 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -247,7 +247,7 @@ if(WITH_CYCLES_OSL)
endif()
if(WITH_CYCLES_DEVICE_OPTIX)
- find_package(OptiX)
+ find_package(OptiX 7.3.0)
if(OPTIX_FOUND)
add_definitions(-DWITH_OPTIX)
@@ -286,11 +286,17 @@ if(WITH_OPENSUBDIV)
)
endif()
+if(WITH_OPENIMAGEDENOISE)
+ add_definitions(-DWITH_OPENIMAGEDENOISE)
+ add_definitions(-DOIDN_STATIC_LIB)
+ include_directories(
+ SYSTEM
+ ${OPENIMAGEDENOISE_INCLUDE_DIRS}
+ )
+endif()
+
if(WITH_CYCLES_STANDALONE)
- set(WITH_CYCLES_DEVICE_OPENCL TRUE)
set(WITH_CYCLES_DEVICE_CUDA TRUE)
- # Experimental and unfinished.
- set(WITH_CYCLES_NETWORK FALSE)
endif()
# TODO(sergey): Consider removing it, only causes confusion in interface.
set(WITH_CYCLES_DEVICE_MULTI TRUE)
@@ -386,18 +392,12 @@ if(WITH_CYCLES_BLENDER)
add_subdirectory(blender)
endif()
-if(WITH_CYCLES_NETWORK)
- add_definitions(-DWITH_NETWORK)
-endif()
-
-if(WITH_CYCLES_STANDALONE OR WITH_CYCLES_NETWORK OR WITH_CYCLES_CUBIN_COMPILER)
- add_subdirectory(app)
-endif()
-
+add_subdirectory(app)
add_subdirectory(bvh)
add_subdirectory(device)
add_subdirectory(doc)
add_subdirectory(graph)
+add_subdirectory(integrator)
add_subdirectory(kernel)
add_subdirectory(render)
add_subdirectory(subd)
diff --git a/intern/cycles/app/CMakeLists.txt b/intern/cycles/app/CMakeLists.txt
index 7a1e5d62dd2..f9dc5f00802 100644
--- a/intern/cycles/app/CMakeLists.txt
+++ b/intern/cycles/app/CMakeLists.txt
@@ -91,24 +91,6 @@ if(WITH_CYCLES_STANDALONE)
endif()
#####################################################################
-# Cycles network server executable
-#####################################################################
-
-if(WITH_CYCLES_NETWORK)
- set(SRC
- cycles_server.cpp
- )
- add_executable(cycles_server ${SRC})
- target_link_libraries(cycles_server ${LIBRARIES})
- cycles_target_link_libraries(cycles_server)
-
- if(UNIX AND NOT APPLE)
- set_target_properties(cycles_server PROPERTIES INSTALL_RPATH $ORIGIN/lib)
- endif()
- unset(SRC)
-endif()
-
-#####################################################################
# Cycles cubin compiler executable
#####################################################################
diff --git a/intern/cycles/app/cycles_standalone.cpp b/intern/cycles/app/cycles_standalone.cpp
index 6b3513b065a..270096d70b0 100644
--- a/intern/cycles/app/cycles_standalone.cpp
+++ b/intern/cycles/app/cycles_standalone.cpp
@@ -126,7 +126,7 @@ static BufferParams &session_buffer_params()
static void scene_init()
{
- options.scene = new Scene(options.scene_params, options.session->device);
+ options.scene = options.session->scene;
/* Read XML */
xml_read_file(options.scene, options.filepath.c_str());
@@ -148,7 +148,7 @@ static void scene_init()
static void session_init()
{
options.session_params.write_render_cb = write_render;
- options.session = new Session(options.session_params);
+ options.session = new Session(options.session_params, options.scene_params);
if (options.session_params.background && !options.quiet)
options.session->progress.set_update_callback(function_bind(&session_print_status));
@@ -159,7 +159,6 @@ static void session_init()
/* load scene */
scene_init();
- options.session->scene = options.scene;
options.session->reset(session_buffer_params(), options.session_params.samples);
options.session->start();
@@ -527,9 +526,6 @@ static void options_parse(int argc, const char **argv)
fprintf(stderr, "No file path specified\n");
exit(EXIT_FAILURE);
}
-
- /* For smoother Viewport */
- options.session_params.start_resolution = 64;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/app/cycles_xml.cpp b/intern/cycles/app/cycles_xml.cpp
index 276d850f1b3..54f97fddbd9 100644
--- a/intern/cycles/app/cycles_xml.cpp
+++ b/intern/cycles/app/cycles_xml.cpp
@@ -703,7 +703,7 @@ void xml_read_file(Scene *scene, const char *filepath)
xml_read_include(state, path_filename(filepath));
- scene->params.bvh_type = SceneParams::BVH_STATIC;
+ scene->params.bvh_type = BVH_TYPE_STATIC;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/blender/CMakeLists.txt b/intern/cycles/blender/CMakeLists.txt
index ee5c6157338..5bdcfd56a4d 100644
--- a/intern/cycles/blender/CMakeLists.txt
+++ b/intern/cycles/blender/CMakeLists.txt
@@ -33,6 +33,7 @@ set(SRC
blender_device.cpp
blender_image.cpp
blender_geometry.cpp
+ blender_gpu_display.cpp
blender_light.cpp
blender_mesh.cpp
blender_object.cpp
@@ -50,6 +51,7 @@ set(SRC
CCL_api.h
blender_device.h
+ blender_gpu_display.h
blender_id_map.h
blender_image.h
blender_object_cull.h
@@ -93,14 +95,6 @@ set(ADDON_FILES
add_definitions(${GL_DEFINITIONS})
-if(WITH_CYCLES_DEVICE_OPENCL)
- add_definitions(-DWITH_OPENCL)
-endif()
-
-if(WITH_CYCLES_NETWORK)
- add_definitions(-DWITH_NETWORK)
-endif()
-
if(WITH_MOD_FLUID)
add_definitions(-DWITH_FLUID)
endif()
diff --git a/intern/cycles/blender/addon/__init__.py b/intern/cycles/blender/addon/__init__.py
index f728050a3cf..1ce25a253f9 100644
--- a/intern/cycles/blender/addon/__init__.py
+++ b/intern/cycles/blender/addon/__init__.py
@@ -58,7 +58,6 @@ class CyclesRender(bpy.types.RenderEngine):
bl_use_eevee_viewport = True
bl_use_preview = True
bl_use_exclude_layers = True
- bl_use_save_buffers = True
bl_use_spherical_stereo = True
bl_use_custom_freestyle = True
bl_use_alembic_procedural = True
@@ -85,6 +84,12 @@ class CyclesRender(bpy.types.RenderEngine):
def render(self, depsgraph):
engine.render(self, depsgraph)
+ def render_frame_finish(self):
+ engine.render_frame_finish(self)
+
+ def draw(self, context, depsgraph):
+ engine.draw(self, depsgraph, context.space_data)
+
def bake(self, depsgraph, obj, pass_type, pass_filter, width, height):
engine.bake(self, depsgraph, obj, pass_type, pass_filter, width, height)
@@ -98,7 +103,7 @@ class CyclesRender(bpy.types.RenderEngine):
engine.sync(self, depsgraph, context.blend_data)
def view_draw(self, context, depsgraph):
- engine.draw(self, depsgraph, context.region, context.space_data, context.region_data)
+ engine.view_draw(self, depsgraph, context.region, context.space_data, context.region_data)
def update_script_node(self, node):
if engine.with_osl():
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py
index 489a883f098..e0e8ca10bef 100644
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -18,62 +18,17 @@
from __future__ import annotations
-def _is_using_buggy_driver():
- import gpu
- # We need to be conservative here because in multi-GPU systems display card
- # might be quite old, but others one might be just good.
- #
- # So We shouldn't disable possible good dedicated cards just because display
- # card seems weak. And instead we only blacklist configurations which are
- # proven to cause problems.
- if gpu.platform.vendor_get() == "ATI Technologies Inc.":
- import re
- version = gpu.platform.version_get()
- if version.endswith("Compatibility Profile Context"):
- # Old HD 4xxx and 5xxx series drivers did not have driver version
- # in the version string, but those cards do not quite work and
- # causing crashes.
- return True
- regex = re.compile(".*Compatibility Profile Context ([0-9]+(\\.[0-9]+)+)$")
- if not regex.match(version):
- # Skip cards like FireGL
- return False
- version = regex.sub("\\1", version).split('.')
- return int(version[0]) == 8
- return False
-
-
-def _workaround_buggy_drivers():
- if _is_using_buggy_driver():
- import _cycles
- if hasattr(_cycles, "opencl_disable"):
- print("Cycles: OpenGL driver known to be buggy, disabling OpenCL platform.")
- _cycles.opencl_disable()
-
-
def _configure_argument_parser():
import argparse
# No help because it conflicts with general Python scripts argument parsing
parser = argparse.ArgumentParser(description="Cycles Addon argument parser",
add_help=False)
- parser.add_argument("--cycles-resumable-num-chunks",
- help="Number of chunks to split sample range into",
- default=None)
- parser.add_argument("--cycles-resumable-current-chunk",
- help="Current chunk of samples range to render",
- default=None)
- parser.add_argument("--cycles-resumable-start-chunk",
- help="Start chunk to render",
- default=None)
- parser.add_argument("--cycles-resumable-end-chunk",
- help="End chunk to render",
- default=None)
parser.add_argument("--cycles-print-stats",
help="Print rendering statistics to stderr",
action='store_true')
parser.add_argument("--cycles-device",
help="Set the device to use for Cycles, overriding user preferences and the scene setting."
- "Valid options are 'CPU', 'CUDA', 'OPTIX' or 'OPENCL'."
+ "Valid options are 'CPU', 'CUDA' or 'OPTIX'."
"Additionally, you can append '+CPU' to any GPU type for hybrid rendering.",
default=None)
return parser
@@ -89,21 +44,6 @@ def _parse_command_line():
parser = _configure_argument_parser()
args, _ = parser.parse_known_args(argv[argv.index("--") + 1:])
- if args.cycles_resumable_num_chunks is not None:
- if args.cycles_resumable_current_chunk is not None:
- import _cycles
- _cycles.set_resumable_chunk(
- int(args.cycles_resumable_num_chunks),
- int(args.cycles_resumable_current_chunk),
- )
- elif args.cycles_resumable_start_chunk is not None and \
- args.cycles_resumable_end_chunk:
- import _cycles
- _cycles.set_resumable_chunk_range(
- int(args.cycles_resumable_num_chunks),
- int(args.cycles_resumable_start_chunk),
- int(args.cycles_resumable_end_chunk),
- )
if args.cycles_print_stats:
import _cycles
_cycles.enable_print_stats()
@@ -118,23 +58,11 @@ def init():
import _cycles
import os.path
- # Workaround possibly buggy legacy drivers which crashes on the OpenCL
- # device enumeration.
- #
- # This checks are not really correct because they might still fail
- # in the case of multiple GPUs. However, currently buggy drivers
- # are really old and likely to be used in single GPU systems only
- # anyway.
- #
- # Can't do it in the background mode, so we hope OpenCL is no enabled
- # in the user preferences.
- if not bpy.app.background:
- _workaround_buggy_drivers()
-
path = os.path.dirname(__file__)
user_path = os.path.dirname(os.path.abspath(bpy.utils.user_resource('CONFIG', path='')))
+ temp_path = bpy.app.tempdir
- _cycles.init(path, user_path, bpy.app.background)
+ _cycles.init(path, user_path, temp_path, bpy.app.background)
_parse_command_line()
@@ -177,6 +105,25 @@ def render(engine, depsgraph):
_cycles.render(engine.session, depsgraph.as_pointer())
+def render_frame_finish(engine):
+ if not engine.session:
+ return
+
+ import _cycles
+ _cycles.render_frame_finish(engine.session)
+
+def draw(engine, depsgraph, space_image):
+ if not engine.session:
+ return
+
+ depsgraph_ptr = depsgraph.as_pointer()
+ space_image_ptr = space_image.as_pointer()
+ screen_ptr = space_image.id_data.as_pointer()
+
+ import _cycles
+ _cycles.draw(engine.session, depsgraph_ptr, screen_ptr, space_image_ptr)
+
+
def bake(engine, depsgraph, obj, pass_type, pass_filter, width, height):
import _cycles
session = getattr(engine, "session", None)
@@ -204,14 +151,14 @@ def sync(engine, depsgraph, data):
_cycles.sync(engine.session, depsgraph.as_pointer())
-def draw(engine, depsgraph, region, v3d, rv3d):
+def view_draw(engine, depsgraph, region, v3d, rv3d):
import _cycles
depsgraph = depsgraph.as_pointer()
v3d = v3d.as_pointer()
rv3d = rv3d.as_pointer()
# draw render image
- _cycles.draw(engine.session, depsgraph, v3d, rv3d)
+ _cycles.view_draw(engine.session, depsgraph, v3d, rv3d)
def available_devices():
@@ -224,11 +171,6 @@ def with_osl():
return _cycles.with_osl
-def with_network():
- import _cycles
- return _cycles.with_network
-
-
def system_info():
import _cycles
return _cycles.system_info()
@@ -243,6 +185,7 @@ def list_render_passes(scene, srl):
# Data passes.
if srl.use_pass_z: yield ("Depth", "Z", 'VALUE')
if srl.use_pass_mist: yield ("Mist", "Z", 'VALUE')
+ if srl.use_pass_position: yield ("Position", "XYZ", 'VECTOR')
if srl.use_pass_normal: yield ("Normal", "XYZ", 'VECTOR')
if srl.use_pass_vector: yield ("Vector", "XYZW", 'VECTOR')
if srl.use_pass_uv: yield ("UV", "UVA", 'VECTOR')
@@ -265,6 +208,7 @@ def list_render_passes(scene, srl):
if srl.use_pass_environment: yield ("Env", "RGB", 'COLOR')
if srl.use_pass_shadow: yield ("Shadow", "RGB", 'COLOR')
if srl.use_pass_ambient_occlusion: yield ("AO", "RGB", 'COLOR')
+ if crl.use_pass_shadow_catcher: yield ("Shadow Catcher", "RGB", 'COLOR')
# Debug passes.
if crl.pass_debug_render_time: yield ("Debug Render Time", "X", 'VALUE')
@@ -283,30 +227,20 @@ def list_render_passes(scene, srl):
yield ("CryptoAsset" + '{:02d}'.format(i), "RGBA", 'COLOR')
# Denoising passes.
- if (scene.cycles.use_denoising and crl.use_denoising) or crl.denoising_store_passes:
+ if scene.cycles.use_denoising and crl.use_denoising:
yield ("Noisy Image", "RGBA", 'COLOR')
- if crl.denoising_store_passes:
- yield ("Denoising Normal", "XYZ", 'VECTOR')
- yield ("Denoising Albedo", "RGB", 'COLOR')
- yield ("Denoising Depth", "Z", 'VALUE')
-
- if scene.cycles.denoiser == 'NLM':
- yield ("Denoising Shadowing", "X", 'VALUE')
- yield ("Denoising Variance", "RGB", 'COLOR')
- yield ("Denoising Intensity", "X", 'VALUE')
-
- clean_options = ("denoising_diffuse_direct", "denoising_diffuse_indirect",
- "denoising_glossy_direct", "denoising_glossy_indirect",
- "denoising_transmission_direct", "denoising_transmission_indirect")
- if any(getattr(crl, option) for option in clean_options):
- yield ("Denoising Clean", "RGB", 'COLOR')
+ if crl.use_pass_shadow_catcher:
+ yield ("Noisy Shadow Catcher", "RGBA", 'COLOR')
+ if crl.denoising_store_passes:
+ yield ("Denoising Normal", "XYZ", 'VECTOR')
+ yield ("Denoising Albedo", "RGB", 'COLOR')
# Custom AOV passes.
for aov in srl.aovs:
if aov.type == 'VALUE':
yield (aov.name, "X", 'VALUE')
else:
- yield (aov.name, "RGBA", 'COLOR')
+ yield (aov.name, "RGB", 'COLOR')
def register_passes(engine, scene, view_layer):
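
For context on the tuples yielded above: list_render_passes() produces (name, channel ids, channel type) entries that the add-on is expected to forward to Blender's RenderEngine.register_pass() API. A minimal, illustrative sketch of that assumed consumer follows; engine, scene and view_layer are placeholder names, not taken from this patch:

    # Sketch only: forward each pass yielded by list_render_passes() to Blender.
    # RenderEngine.register_pass(scene, view_layer, name, channels, chanid, type)
    for name, channel_ids, channel_type in list_render_passes(scene, view_layer):
        engine.register_pass(scene, view_layer, name,
                             len(channel_ids), channel_ids, channel_type)
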
diff --git a/intern/cycles/blender/addon/presets.py b/intern/cycles/blender/addon/presets.py
index bf33e5dc010..37c39904e30 100644
--- a/intern/cycles/blender/addon/presets.py
+++ b/intern/cycles/blender/addon/presets.py
@@ -60,32 +60,48 @@ class AddPresetSampling(AddPresetBase, Operator):
]
preset_values = [
+ "cycles.use_adaptive_sampling",
"cycles.samples",
- "cycles.preview_samples",
- "cycles.aa_samples",
- "cycles.preview_aa_samples",
- "cycles.diffuse_samples",
- "cycles.glossy_samples",
- "cycles.transmission_samples",
- "cycles.ao_samples",
- "cycles.mesh_light_samples",
- "cycles.subsurface_samples",
- "cycles.volume_samples",
- "cycles.use_square_samples",
- "cycles.progressive",
- "cycles.seed",
- "cycles.sample_clamp_direct",
- "cycles.sample_clamp_indirect",
- "cycles.sample_all_lights_direct",
- "cycles.sample_all_lights_indirect",
+ "cycles.adaptive_threshold",
+ "cycles.adaptive_min_samples",
+ "cycles.time_limit",
+ "cycles.use_denoising",
+ "cycles.denoiser",
+ "cycles.denoising_input_passes",
+ "cycles.denoising_prefilter",
]
preset_subdir = "cycles/sampling"
+class AddPresetViewportSampling(AddPresetBase, Operator):
+ '''Add a Viewport Sampling Preset'''
+ bl_idname = "render.cycles_viewport_sampling_preset_add"
+ bl_label = "Add Viewport Sampling Preset"
+ preset_menu = "CYCLES_PT_viewport_sampling_presets"
+
+ preset_defines = [
+ "cycles = bpy.context.scene.cycles"
+ ]
+
+ preset_values = [
+ "cycles.use_preview_adaptive_sampling",
+ "cycles.preview_samples",
+ "cycles.preview_adaptive_threshold",
+ "cycles.preview_adaptive_min_samples",
+ "cycles.use_preview_denoising",
+ "cycles.preview_denoiser",
+ "cycles.preview_denoising_input_passes",
+ "cycles.preview_denoising_prefilter",
+ "cycles.preview_denoising_start_sample",
+ ]
+
+ preset_subdir = "cycles/viewport_sampling"
+
classes = (
AddPresetIntegrator,
AddPresetSampling,
+ AddPresetViewportSampling,
)
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 0c3af3fabeb..c2570e71efd 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -39,11 +39,6 @@ enum_devices = (
('GPU', "GPU Compute", "Use GPU compute device for rendering, configured in the system tab in the user preferences"),
)
-from _cycles import with_network
-if with_network:
- enum_devices += (('NETWORK', "Networked Device", "Use networked device for rendering"),)
-del with_network
-
enum_feature_set = (
('SUPPORTED', "Supported", "Only use finished and supported features"),
('EXPERIMENTAL', "Experimental", "Use experimental and incomplete features that might be broken or change in the future", 'ERROR', 1),
@@ -84,15 +79,6 @@ enum_curve_shape = (
('THICK', "3D Curves", "Render hair as 3D curve, for accurate results when viewing hair close up"),
)
-enum_tile_order = (
- ('CENTER', "Center", "Render from center to the edges"),
- ('RIGHT_TO_LEFT', "Right to Left", "Render from right to left"),
- ('LEFT_TO_RIGHT', "Left to Right", "Render from left to right"),
- ('TOP_TO_BOTTOM', "Top to Bottom", "Render from top to bottom"),
- ('BOTTOM_TO_TOP', "Bottom to Top", "Render from bottom to top"),
- ('HILBERT_SPIRAL', "Hilbert Spiral", "Render in a Hilbert Spiral"),
-)
-
enum_use_layer_samples = (
('USE', "Use", "Per render layer number of samples override scene samples"),
('BOUNDED', "Bounded", "Bound per render layer number of samples by global samples"),
@@ -101,15 +87,9 @@ enum_use_layer_samples = (
enum_sampling_pattern = (
('SOBOL', "Sobol", "Use Sobol random sampling pattern"),
- ('CORRELATED_MUTI_JITTER', "Correlated Multi-Jitter", "Use Correlated Multi-Jitter random sampling pattern"),
('PROGRESSIVE_MUTI_JITTER', "Progressive Multi-Jitter", "Use Progressive Multi-Jitter random sampling pattern"),
)
-enum_integrator = (
- ('BRANCHED_PATH', "Branched Path Tracing", "Path tracing integrator that branches on the first bounce, giving more control over the number of light and material samples"),
- ('PATH', "Path Tracing", "Pure path tracing integrator"),
-)
-
enum_volume_sampling = (
('DISTANCE', "Distance", "Use distance sampling, best for dense volumes with lights far away"),
('EQUIANGULAR', "Equiangular", "Use equiangular sampling, best for volumes with low density with light inside or near the volume"),
@@ -131,7 +111,6 @@ enum_device_type = (
('CPU', "CPU", "CPU", 0),
('CUDA', "CUDA", "CUDA", 1),
('OPTIX', "OptiX", "OptiX", 3),
- ('OPENCL', "OpenCL", "OpenCL", 2)
)
enum_texture_limit = (
@@ -144,39 +123,46 @@ enum_texture_limit = (
('4096', "4096", "Limit texture size to 4096 pixels", 6),
('8192', "8192", "Limit texture size to 8192 pixels", 7),
)
-
+
+# NOTE: Identifiers are expected to be an upper case version of identifiers from `Pass::get_type_enum()`
enum_view3d_shading_render_pass = (
('', "General", ""),
- ('COMBINED', "Combined", "Show the Combined Render pass", 1),
- ('EMISSION', "Emission", "Show the Emission render pass", 33),
- ('BACKGROUND', "Background", "Show the Background render pass", 34),
- ('AO', "Ambient Occlusion", "Show the Ambient Occlusion render pass", 35),
+ ('COMBINED', "Combined", "Show the Combined Render pass"),
+ ('EMISSION', "Emission", "Show the Emission render pass"),
+ ('BACKGROUND', "Background", "Show the Background render pass"),
+ ('AO', "Ambient Occlusion", "Show the Ambient Occlusion render pass"),
+ ('SHADOW', "Shadow", "Show the Shadow render pass"),
+ ('SHADOW_CATCHER', "Shadow Catcher", "Show the Shadow Catcher render pass"),
('', "Light", ""),
- ('DIFFUSE_DIRECT', "Diffuse Direct", "Show the Diffuse Direct render pass", 38),
- ('DIFFUSE_INDIRECT', "Diffuse Indirect", "Show the Diffuse Indirect render pass", 39),
- ('DIFFUSE_COLOR', "Diffuse Color", "Show the Diffuse Color render pass", 40),
+ ('DIFFUSE_DIRECT', "Diffuse Direct", "Show the Diffuse Direct render pass"),
+ ('DIFFUSE_INDIRECT', "Diffuse Indirect", "Show the Diffuse Indirect render pass"),
+ ('DIFFUSE_COLOR', "Diffuse Color", "Show the Diffuse Color render pass"),
- ('GLOSSY_DIRECT', "Glossy Direct", "Show the Glossy Direct render pass", 41),
- ('GLOSSY_INDIRECT', "Glossy Indirect", "Show the Glossy Indirect render pass", 42),
- ('GLOSSY_COLOR', "Glossy Color", "Show the Glossy Color render pass", 43),
+ ('GLOSSY_DIRECT', "Glossy Direct", "Show the Glossy Direct render pass"),
+ ('GLOSSY_INDIRECT', "Glossy Indirect", "Show the Glossy Indirect render pass"),
+ ('GLOSSY_COLOR', "Glossy Color", "Show the Glossy Color render pass"),
('', "", ""),
- ('TRANSMISSION_DIRECT', "Transmission Direct", "Show the Transmission Direct render pass", 44),
- ('TRANSMISSION_INDIRECT', "Transmission Indirect", "Show the Transmission Indirect render pass", 45),
- ('TRANSMISSION_COLOR', "Transmission Color", "Show the Transmission Color render pass", 46),
+ ('TRANSMISSION_DIRECT', "Transmission Direct", "Show the Transmission Direct render pass"),
+ ('TRANSMISSION_INDIRECT', "Transmission Indirect", "Show the Transmission Indirect render pass"),
+ ('TRANSMISSION_COLOR', "Transmission Color", "Show the Transmission Color render pass"),
- ('VOLUME_DIRECT', "Volume Direct", "Show the Volume Direct render pass", 50),
- ('VOLUME_INDIRECT', "Volume Indirect", "Show the Volume Indirect render pass", 51),
+ ('VOLUME_DIRECT', "Volume Direct", "Show the Volume Direct render pass"),
+ ('VOLUME_INDIRECT', "Volume Indirect", "Show the Volume Indirect render pass"),
('', "Data", ""),
- ('NORMAL', "Normal", "Show the Normal render pass", 3),
- ('UV', "UV", "Show the UV render pass", 4),
- ('MIST', "Mist", "Show the Mist render pass", 32),
+ ('POSITION', "Position", "Show the Position render pass"),
+ ('NORMAL', "Normal", "Show the Normal render pass"),
+ ('UV', "UV", "Show the UV render pass"),
+ ('MIST', "Mist", "Show the Mist render pass"),
+ ('DENOISING_ALBEDO', "Denoising Albedo", "Albedo pass used by denoiser"),
+ ('DENOISING_NORMAL', "Denoising Normal", "Normal pass used by denoiser"),
+ ('SAMPLE_COUNT', "Sample Count", "Per-pixel number of samples"),
)
@@ -208,18 +194,23 @@ def enum_preview_denoiser(self, context):
def enum_denoiser(self, context):
- items = [('NLM', "NLM", "Cycles native non-local means denoiser, running on any compute device", 1)]
+ items = []
items += enum_optix_denoiser(self, context)
items += enum_openimagedenoise_denoiser(self, context)
return items
enum_denoising_input_passes = (
- ('RGB', "Color", "Use only color as input", 1),
- ('RGB_ALBEDO', "Color + Albedo", "Use color and albedo data as input", 2),
- ('RGB_ALBEDO_NORMAL', "Color + Albedo + Normal", "Use color, albedo and normal data as input", 3),
+ ('RGB', "None", "Don't use utility passes for denoising", 1),
+ ('RGB_ALBEDO', "Albedo", "Use albedo pass for denoising", 2),
+ ('RGB_ALBEDO_NORMAL', "Albedo and Normal", "Use albedo and normal passes for denoising", 3),
)
+enum_denoising_prefilter = (
+ ('NONE', "None", "No prefiltering, use when guiding passes are noise-free", 1),
+ ('FAST', "Fast", "Denoise color and guiding passes together. Improves quality when guiding passes are noisy using least amount of extra processing time", 2),
+ ('ACCURATE', "Accurate", "Prefilter noisy guiding passes before denoising color. Improves quality when guiding passes are noisy using extra processing time", 3),
+)
def update_render_passes(self, context):
scene = context.scene
@@ -252,13 +243,6 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
description="Use Open Shading Language (CPU rendering only)",
)
- progressive: EnumProperty(
- name="Integrator",
- description="Method to sample lights and materials",
- items=enum_integrator,
- default='PATH',
- )
-
preview_pause: BoolProperty(
name="Pause Preview",
description="Pause all viewport preview renders",
@@ -268,110 +252,88 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
use_denoising: BoolProperty(
name="Use Denoising",
description="Denoise the rendered image",
- default=False,
+ default=True,
update=update_render_passes,
)
- use_preview_denoising: BoolProperty(
- name="Use Viewport Denoising",
- description="Denoise the image in the 3D viewport",
- default=False,
- )
-
denoiser: EnumProperty(
name="Denoiser",
description="Denoise the image with the selected denoiser. "
- "For denoising the image after rendering, denoising data render passes "
- "also adapt to the selected denoiser",
+ "For denoising the image after rendering",
items=enum_denoiser,
- default=1,
+ default=4, # Use integer to avoid error in builds without OpenImageDenoise.
update=update_render_passes,
)
+ denoising_prefilter: EnumProperty(
+ name="Denoising Prefilter",
+ description="Prefilter noisy guiding (albedo and normal) passes to improve denoising quality when using OpenImageDenoiser",
+ items=enum_denoising_prefilter,
+ default='ACCURATE',
+ )
+ denoising_input_passes: EnumProperty(
+ name="Denoising Input Passes",
+ description="Passes used by the denoiser to distinguish noise from shader and geometry detail",
+ items=enum_denoising_input_passes,
+ default='RGB_ALBEDO_NORMAL',
+ )
+
+ use_preview_denoising: BoolProperty(
+ name="Use Viewport Denoising",
+ description="Denoise the image in the 3D viewport",
+ default=False,
+ )
preview_denoiser: EnumProperty(
name="Viewport Denoiser",
description="Denoise the image after each preview update with the selected denoiser",
items=enum_preview_denoiser,
default=0,
)
-
- use_square_samples: BoolProperty(
- name="Square Samples",
- description="Square sampling values for easier artist control",
- default=False,
+ preview_denoising_prefilter: EnumProperty(
+ name="Viewport Denoising Prefilter",
+ description="Prefilter noisy guiding (albedo and normal) passes to improve denoising quality when using OpenImageDenoiser",
+ items=enum_denoising_prefilter,
+ default='FAST',
+ )
+ preview_denoising_input_passes: EnumProperty(
+ name="Viewport Denoising Input Passes",
+ description="Passes used by the denoiser to distinguish noise from shader and geometry detail",
+ items=enum_denoising_input_passes,
+ default='RGB_ALBEDO',
+ )
+ preview_denoising_start_sample: IntProperty(
+ name="Start Denoising",
+ description="Sample to start denoising the preview at",
+ min=0, max=(1 << 24),
+ default=1,
)
samples: IntProperty(
name="Samples",
description="Number of samples to render for each pixel",
min=1, max=(1 << 24),
- default=128,
+ default=4096,
)
preview_samples: IntProperty(
name="Viewport Samples",
description="Number of samples to render in the viewport, unlimited if 0",
min=0, max=(1 << 24),
- default=32,
- )
- aa_samples: IntProperty(
- name="AA Samples",
- description="Number of antialiasing samples to render for each pixel",
- min=1, max=2097151,
- default=128,
- )
- preview_aa_samples: IntProperty(
- name="AA Samples",
- description="Number of antialiasing samples to render in the viewport, unlimited if 0",
- min=0, max=2097151,
- default=32,
+ default=1024,
)
- diffuse_samples: IntProperty(
- name="Diffuse Samples",
- description="Number of diffuse bounce samples to render for each AA sample",
- min=1, max=1024,
- default=1,
- )
- glossy_samples: IntProperty(
- name="Glossy Samples",
- description="Number of glossy bounce samples to render for each AA sample",
- min=1, max=1024,
- default=1,
- )
- transmission_samples: IntProperty(
- name="Transmission Samples",
- description="Number of transmission bounce samples to render for each AA sample",
- min=1, max=1024,
- default=1,
- )
- ao_samples: IntProperty(
- name="Ambient Occlusion Samples",
- description="Number of ambient occlusion samples to render for each AA sample",
- min=1, max=1024,
- default=1,
- )
- mesh_light_samples: IntProperty(
- name="Mesh Light Samples",
- description="Number of mesh emission light samples to render for each AA sample",
- min=1, max=1024,
- default=1,
- )
- subsurface_samples: IntProperty(
- name="Subsurface Samples",
- description="Number of subsurface scattering samples to render for each AA sample",
- min=1, max=1024,
- default=1,
- )
- volume_samples: IntProperty(
- name="Volume Samples",
- description="Number of volume scattering samples to render for each AA sample",
- min=1, max=1024,
- default=1,
+ time_limit: FloatProperty(
+ name="Time Limit",
+ description="Limit the render time (excluding synchronization time)."
+ "Zero disables the limit",
+ min=0.0,
+ default=0.0,
+ step=100.0,
+ unit='TIME_ABSOLUTE',
)
sampling_pattern: EnumProperty(
name="Sampling Pattern",
description="Random sampling pattern used by the integrator",
items=enum_sampling_pattern,
- default='SOBOL',
+ default='PROGRESSIVE_MUTI_JITTER',
)
use_layer_samples: EnumProperty(
@@ -381,17 +343,6 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
default='USE',
)
- sample_all_lights_direct: BoolProperty(
- name="Sample All Direct Lights",
- description="Sample all lights (for direct samples), rather than randomly picking one",
- default=True,
- )
-
- sample_all_lights_indirect: BoolProperty(
- name="Sample All Indirect Lights",
- description="Sample all lights (for indirect samples), rather than randomly picking one",
- default=True,
- )
light_sampling_threshold: FloatProperty(
name="Light Sampling Threshold",
description="Probabilistically terminate light samples when the light contribution is below this threshold (more noise but faster rendering). "
@@ -403,19 +354,39 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
use_adaptive_sampling: BoolProperty(
name="Use Adaptive Sampling",
description="Automatically reduce the number of samples per pixel based on estimated noise level",
- default=False,
+ default=True,
)
-
adaptive_threshold: FloatProperty(
name="Adaptive Sampling Threshold",
description="Noise level step to stop sampling at, lower values reduce noise at the cost of render time. Zero for automatic setting based on number of AA samples",
min=0.0, max=1.0,
- default=0.0,
+ soft_min=0.001,
+ default=0.01,
precision=4,
)
adaptive_min_samples: IntProperty(
name="Adaptive Min Samples",
- description="Minimum AA samples for adaptive sampling, to discover noisy features before stopping sampling. Zero for automatic setting based on number of AA samples",
+ description="Minimum AA samples for adaptive sampling, to discover noisy features before stopping sampling. Zero for automatic setting based on noise threshold",
+ min=0, max=4096,
+ default=0,
+ )
+
+ use_preview_adaptive_sampling: BoolProperty(
+ name="Use Adaptive Sampling",
+ description="Automatically reduce the number of samples per pixel based on estimated noise level, for viewport renders",
+ default=True,
+ )
+ preview_adaptive_threshold: FloatProperty(
+ name="Adaptive Sampling Threshold",
+ description="Noise level step to stop sampling at, lower values reduce noise at the cost of render time. Zero for automatic setting based on number of AA samples, for viewport renders",
+ min=0.0, max=1.0,
+ soft_min=0.001,
+ default=0.1,
+ precision=4,
+ )
+ preview_adaptive_min_samples: IntProperty(
+ name="Adaptive Min Samples",
+ description="Minimum AA samples for adaptive sampling, to discover noisy features before stopping sampling. Zero for automatic setting based on noise threshold, for viewport renders",
min=0, max=4096,
default=0,
)
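A companion sketch (again only illustrative, same assumptions as above) for the adaptive sampling and time limit properties introduced in this hunk, covering both final and viewport renders:

    import bpy

    cycles = bpy.context.scene.cycles

    # Final render: noise-threshold driven sampling with a hard cap and a time limit.
    cycles.use_adaptive_sampling = True
    cycles.adaptive_threshold = 0.01      # noise level at which sampling stops
    cycles.adaptive_min_samples = 0       # 0 = derived automatically from the threshold
    cycles.samples = 4096                 # upper bound on samples per pixel
    cycles.time_limit = 120.0             # seconds, excluding synchronization; 0 disables

    # Viewport: the equivalent preview_* properties.
    cycles.use_preview_adaptive_sampling = True
    cycles.preview_adaptive_threshold = 0.1
    cycles.preview_samples = 1024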
@@ -632,53 +603,6 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
default=10.0,
)
- debug_tile_size: IntProperty(
- name="Tile Size",
- description="",
- min=1, max=4096,
- default=1024,
- )
-
- preview_start_resolution: IntProperty(
- name="Start Resolution",
- description="Resolution to start rendering preview at, "
- "progressively increasing it to the full viewport size",
- min=8, max=16384,
- default=64,
- subtype='PIXEL'
- )
- preview_denoising_start_sample: IntProperty(
- name="Start Denoising",
- description="Sample to start denoising the preview at",
- min=0, max=(1 << 24),
- default=1,
- )
- preview_denoising_input_passes: EnumProperty(
- name="Viewport Input Passes",
- description="Passes used by the denoiser to distinguish noise from shader and geometry detail",
- items=enum_denoising_input_passes,
- default='RGB_ALBEDO',
- )
-
- debug_reset_timeout: FloatProperty(
- name="Reset timeout",
- description="",
- min=0.01, max=10.0,
- default=0.1,
- )
- debug_cancel_timeout: FloatProperty(
- name="Cancel timeout",
- description="",
- min=0.01, max=10.0,
- default=0.1,
- )
- debug_text_timeout: FloatProperty(
- name="Text timeout",
- description="",
- min=0.01, max=10.0,
- default=1.0,
- )
-
debug_bvh_type: EnumProperty(
name="Viewport BVH Type",
description="Choose between faster updates, or faster render",
@@ -701,38 +625,24 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
default=0,
min=0, max=16,
)
- tile_order: EnumProperty(
- name="Tile Order",
- description="Tile order for rendering",
- items=enum_tile_order,
- default='HILBERT_SPIRAL',
- options=set(), # Not animatable!
- )
- use_progressive_refine: BoolProperty(
- name="Progressive Refine",
- description="Instead of rendering each tile until it is finished, "
- "refine the whole image progressively "
- "(this renders somewhat slower, "
- "but time can be saved by manually stopping the render when the noise is low enough)",
- default=False,
- )
bake_type: EnumProperty(
name="Bake Type",
default='COMBINED',
description="Type of pass to bake",
items=(
- ('COMBINED', "Combined", ""),
- ('AO', "Ambient Occlusion", ""),
- ('SHADOW', "Shadow", ""),
- ('NORMAL', "Normal", ""),
- ('UV', "UV", ""),
- ('ROUGHNESS', "Roughness", ""),
- ('EMIT', "Emit", ""),
- ('ENVIRONMENT', "Environment", ""),
- ('DIFFUSE', "Diffuse", ""),
- ('GLOSSY', "Glossy", ""),
- ('TRANSMISSION', "Transmission", ""),
+ ('COMBINED', "Combined", "", 0),
+ ('AO', "Ambient Occlusion", "", 1),
+ ('SHADOW', "Shadow", "", 2),
+ ('POSITION', "Position", "", 11),
+ ('NORMAL', "Normal", "", 3),
+ ('UV', "UV", "", 4),
+ ('ROUGHNESS', "Roughness", "", 5),
+ ('EMIT', "Emit", "", 6),
+ ('ENVIRONMENT', "Environment", "", 7),
+ ('DIFFUSE', "Diffuse", "", 8),
+ ('GLOSSY', "Glossy", "", 9),
+ ('TRANSMISSION', "Transmission", "", 10),
),
)
@@ -827,6 +737,18 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
min=0, max=1024,
)
+ use_auto_tile: BoolProperty(
+ name="Auto Tiles",
+ description="Automatically split image into tiles",
+ default=True,
+ )
+ tile_size: IntProperty(
+ name="Tile Size",
+ default=2048,
+ description="",
+ min=0, max=16384,
+ )
+
# Various fine-tuning debug flags
def _devices_update_callback(self, context):
@@ -844,45 +766,13 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
items=enum_bvh_layouts,
default='EMBREE',
)
- debug_use_cpu_split_kernel: BoolProperty(name="Split Kernel", default=False)
debug_use_cuda_adaptive_compile: BoolProperty(name="Adaptive Compile", default=False)
- debug_use_cuda_split_kernel: BoolProperty(name="Split Kernel", default=False)
-
- debug_optix_cuda_streams: IntProperty(name="CUDA Streams", default=1, min=1)
- debug_optix_curves_api: BoolProperty(name="Native OptiX Curve Primitive", default=False)
-
- debug_opencl_kernel_type: EnumProperty(
- name="OpenCL Kernel Type",
- default='DEFAULT',
- items=(
- ('DEFAULT', "Default", ""),
- ('MEGA', "Mega", ""),
- ('SPLIT', "Split", ""),
- ),
- update=CyclesRenderSettings._devices_update_callback
- )
- debug_opencl_device_type: EnumProperty(
- name="OpenCL Device Type",
- default='ALL',
- items=(
- ('NONE', "None", ""),
- ('ALL', "All", ""),
- ('DEFAULT', "Default", ""),
- ('CPU', "CPU", ""),
- ('GPU', "GPU", ""),
- ('ACCELERATOR', "Accelerator", ""),
- ),
- update=CyclesRenderSettings._devices_update_callback
- )
-
- debug_use_opencl_debug: BoolProperty(name="Debug OpenCL", default=False)
-
- debug_opencl_mem_limit: IntProperty(
- name="Memory limit",
- default=0,
- description="Artificial limit on OpenCL memory usage in MB (0 to disable limit)"
+ debug_use_optix_debug: BoolProperty(
+ name="OptiX Module Debug",
+ description="Load OptiX module in debug mode: lower logging verbosity level, enable validations, and lower optimization level",
+ default=False
)
@classmethod
@@ -1031,12 +921,6 @@ class CyclesLightSettings(bpy.types.PropertyGroup):
description="Light casts shadows",
default=True,
)
- samples: IntProperty(
- name="Samples",
- description="Number of light samples to render for each AA sample",
- min=1, max=10000,
- default=1,
- )
max_bounces: IntProperty(
name="Max Bounces",
description="Maximum number of bounces the light will contribute to the render",
@@ -1084,12 +968,6 @@ class CyclesWorldSettings(bpy.types.PropertyGroup):
min=4, max=8192,
default=1024,
)
- samples: IntProperty(
- name="Samples",
- description="Number of light samples to render for each AA sample",
- min=1, max=10000,
- default=1,
- )
max_bounces: IntProperty(
name="Max Bounces",
description="Maximum number of bounces the background light will contribute to the render",
@@ -1343,91 +1221,25 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup):
update=update_render_passes,
)
+ use_pass_shadow_catcher: BoolProperty(
+ name="Shadow Catcher",
+ description="Pass containing shadows and light which is to be multiplied into backdrop",
+ default=False,
+ update=update_render_passes,
+ )
+
use_denoising: BoolProperty(
name="Use Denoising",
description="Denoise the rendered image",
default=True,
update=update_render_passes,
)
- denoising_diffuse_direct: BoolProperty(
- name="Diffuse Direct",
- description="Denoise the direct diffuse lighting",
- default=True,
- )
- denoising_diffuse_indirect: BoolProperty(
- name="Diffuse Indirect",
- description="Denoise the indirect diffuse lighting",
- default=True,
- )
- denoising_glossy_direct: BoolProperty(
- name="Glossy Direct",
- description="Denoise the direct glossy lighting",
- default=True,
- )
- denoising_glossy_indirect: BoolProperty(
- name="Glossy Indirect",
- description="Denoise the indirect glossy lighting",
- default=True,
- )
- denoising_transmission_direct: BoolProperty(
- name="Transmission Direct",
- description="Denoise the direct transmission lighting",
- default=True,
- )
- denoising_transmission_indirect: BoolProperty(
- name="Transmission Indirect",
- description="Denoise the indirect transmission lighting",
- default=True,
- )
- denoising_strength: FloatProperty(
- name="Denoising Strength",
- description="Controls neighbor pixel weighting for the denoising filter (lower values preserve more detail, but aren't as smooth)",
- min=0.0, max=1.0,
- default=0.5,
- )
- denoising_feature_strength: FloatProperty(
- name="Denoising Feature Strength",
- description="Controls removal of noisy image feature passes (lower values preserve more detail, but aren't as smooth)",
- min=0.0, max=1.0,
- default=0.5,
- )
- denoising_radius: IntProperty(
- name="Denoising Radius",
- description="Size of the image area that's used to denoise a pixel (higher values are smoother, but might lose detail and are slower)",
- min=1, max=25,
- default=8,
- subtype="PIXEL",
- )
- denoising_relative_pca: BoolProperty(
- name="Relative Filter",
- description="When removing pixels that don't carry information, use a relative threshold instead of an absolute one (can help to reduce artifacts, but might cause detail loss around edges)",
- default=False,
- )
denoising_store_passes: BoolProperty(
name="Store Denoising Passes",
description="Store the denoising feature passes and the noisy image. The passes adapt to the denoiser selected for rendering",
default=False,
update=update_render_passes,
)
- denoising_neighbor_frames: IntProperty(
- name="Neighbor Frames",
- description="Number of neighboring frames to use for denoising animations (more frames produce smoother results at the cost of performance)",
- min=0, max=7,
- default=0,
- )
-
- denoising_optix_input_passes: EnumProperty(
- name="Input Passes",
- description="Passes used by the denoiser to distinguish noise from shader and geometry detail",
- items=enum_denoising_input_passes,
- default='RGB_ALBEDO',
- )
- denoising_openimagedenoise_input_passes: EnumProperty(
- name="Input Passes",
- description="Passes used by the denoiser to distinguish noise from shader and geometry detail",
- items=enum_denoising_input_passes,
- default='RGB_ALBEDO_NORMAL',
- )
@classmethod
def register(cls):
@@ -1454,14 +1266,12 @@ class CyclesPreferences(bpy.types.AddonPreferences):
def get_device_types(self, context):
import _cycles
- has_cuda, has_optix, has_opencl = _cycles.get_device_types()
+ has_cuda, has_optix = _cycles.get_device_types()
list = [('NONE', "None", "Don't use compute device", 0)]
if has_cuda:
list.append(('CUDA', "CUDA", "Use CUDA for GPU acceleration", 1))
if has_optix:
list.append(('OPTIX', "OptiX", "Use OptiX for GPU acceleration", 3))
- if has_opencl:
- list.append(('OPENCL', "OpenCL", "Use OpenCL for GPU acceleration", 2))
return list
compute_device_type: EnumProperty(
@@ -1486,7 +1296,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
def update_device_entries(self, device_list):
for device in device_list:
- if not device[1] in {'CUDA', 'OPTIX', 'OPENCL', 'CPU'}:
+ if not device[1] in {'CUDA', 'OPTIX', 'CPU'}:
continue
# Try to find existing Device entry
entry = self.find_existing_device_entry(device)
@@ -1520,22 +1330,23 @@ class CyclesPreferences(bpy.types.AddonPreferences):
elif entry.type == 'CPU':
cpu_devices.append(entry)
# Extend all GPU devices with CPU.
- if compute_device_type in {'CUDA', 'OPTIX', 'OPENCL'}:
+ if compute_device_type != 'CPU':
devices.extend(cpu_devices)
return devices
- # For backwards compatibility, only returns CUDA and OpenCL but still
- # refreshes all devices.
- def get_devices(self, compute_device_type=''):
+ # Refresh device list. This does not happen automatically on Blender
+ # startup due to unstable OpenCL implementations that can cause crashes.
+ def refresh_devices(self):
import _cycles
# Ensure `self.devices` is not re-allocated when the second call to
# get_devices_for_type is made, freeing items from the first list.
for device_type in ('CUDA', 'OPTIX', 'OPENCL'):
self.update_device_entries(_cycles.available_devices(device_type))
- cuda_devices = self.get_devices_for_type('CUDA')
- opencl_devices = self.get_devices_for_type('OPENCL')
- return cuda_devices, opencl_devices
+ # Deprecated: use refresh_devices instead.
+ def get_devices(self, compute_device_type=''):
+ self.refresh_devices()
+ return None
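For add-on or pipeline scripts that relied on the removed return value, a hedged migration sketch (assuming the Cycles add-on preferences are reachable under the "cycles" key, as elsewhere in this add-on):

    import bpy

    prefs = bpy.context.preferences.addons["cycles"].preferences

    # Previously: cuda_devices, opencl_devices = prefs.get_devices()
    # Now: refresh explicitly, then query the wanted backend(s).
    prefs.refresh_devices()
    cuda_devices = prefs.get_devices_for_type('CUDA')
    optix_devices = prefs.get_devices_for_type('OPTIX')
    print(len(cuda_devices), "CUDA device(s),", len(optix_devices), "OptiX device(s)")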
def get_num_gpu_devices(self):
import _cycles
@@ -1601,6 +1412,10 @@ class CyclesView3DShadingSettings(bpy.types.PropertyGroup):
items=enum_view3d_shading_render_pass,
default='COMBINED',
)
+ show_active_pixels: BoolProperty(
+ name="Show Active Pixels",
+ description="When using adaptive sampling highlight pixels which are being sampled",
+ )
def register():
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index 47f7b4c6d73..d02627b9936 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -34,6 +34,12 @@ class CYCLES_PT_sampling_presets(PresetPanel, Panel):
preset_add_operator = "render.cycles_sampling_preset_add"
COMPAT_ENGINES = {'CYCLES'}
+class CYCLES_PT_viewport_sampling_presets(PresetPanel, Panel):
+ bl_label = "Viewport Sampling Presets"
+ preset_subdir = "cycles/viewport_sampling"
+ preset_operator = "script.execute_preset"
+ preset_add_operator = "render.cycles_viewport_sampling_preset_add"
+ COMPAT_ENGINES = {'CYCLES'}
class CYCLES_PT_integrator_presets(PresetPanel, Panel):
bl_label = "Integrator Presets"
@@ -54,6 +60,15 @@ class CyclesButtonsPanel:
return context.engine in cls.COMPAT_ENGINES
+class CyclesDebugButtonsPanel(CyclesButtonsPanel):
+ @classmethod
+ def poll(cls, context):
+ prefs = bpy.context.preferences
+ return (CyclesButtonsPanel.poll(context)
+ and prefs.experimental.use_cycles_debug
+ and prefs.view.show_developer_ui)
+
+
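Since the new debug panels poll two user-preference flags, a quick illustrative snippet for enabling them from Blender's Python console (assumption: both flags referenced above are writable preferences):

    import bpy

    prefs = bpy.context.preferences
    prefs.view.show_developer_ui = True           # required for the Cycles debug panels
    prefs.experimental.use_cycles_debug = True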
# Adapt properties editor panel to display in node editor. We have to
# copy the class rather than inherit due to the way bpy registration works.
def node_panel(cls):
@@ -78,12 +93,6 @@ def use_cpu(context):
return (get_device_type(context) == 'NONE' or cscene.device == 'CPU')
-def use_opencl(context):
- cscene = context.scene.cycles
-
- return (get_device_type(context) == 'OPENCL' and cscene.device == 'GPU')
-
-
def use_cuda(context):
cscene = context.scene.cycles
@@ -96,12 +105,6 @@ def use_optix(context):
return (get_device_type(context) == 'OPTIX' and cscene.device == 'GPU')
-def use_branched_path(context):
- cscene = context.scene.cycles
-
- return (cscene.progressive == 'BRANCHED_PATH' and not use_optix(context))
-
-
def use_sample_all_lights(context):
cscene = context.scene.cycles
@@ -115,57 +118,33 @@ def show_device_active(context):
return context.preferences.addons[__package__].preferences.has_active_device()
-def draw_samples_info(layout, context):
- cscene = context.scene.cycles
- integrator = cscene.progressive
+def get_effective_preview_denoiser(context):
+ scene = context.scene
+ cscene = scene.cycles
+
+ if cscene.preview_denoiser != "AUTO":
+ return cscene.preview_denoiser
+
+ if context.preferences.addons[__package__].preferences.get_devices_for_type('OPTIX'):
+ return 'OPTIX'
+
+ return 'OIDN'
- # Calculate sample values
- if integrator == 'PATH':
- aa = cscene.samples
- if cscene.use_square_samples:
- aa = aa * aa
- else:
- aa = cscene.aa_samples
- d = cscene.diffuse_samples
- g = cscene.glossy_samples
- t = cscene.transmission_samples
- ao = cscene.ao_samples
- ml = cscene.mesh_light_samples
- sss = cscene.subsurface_samples
- vol = cscene.volume_samples
-
- if cscene.use_square_samples:
- aa = aa * aa
- d = d * d
- g = g * g
- t = t * t
- ao = ao * ao
- ml = ml * ml
- sss = sss * sss
- vol = vol * vol
-
- # Draw interface
- # Do not draw for progressive, when Square Samples are disabled
- if use_branched_path(context) or (cscene.use_square_samples and integrator == 'PATH'):
- col = layout.column(align=True)
- col.scale_y = 0.6
- col.label(text="Total Samples:")
- col.separator()
- if integrator == 'PATH':
- col.label(text="%s AA" % aa)
- else:
- col.label(text="%s AA, %s Diffuse, %s Glossy, %s Transmission" %
- (aa, d * aa, g * aa, t * aa))
- col.separator()
- col.label(text="%s AO, %s Mesh Light, %s Subsurface, %s Volume" %
- (ao * aa, ml * aa, sss * aa, vol * aa))
class CYCLES_RENDER_PT_sampling(CyclesButtonsPanel, Panel):
bl_label = "Sampling"
+ def draw(self, context):
+ pass
+
+
+class CYCLES_RENDER_PT_sampling_viewport(CyclesButtonsPanel, Panel):
+ bl_label = "Viewport"
+ bl_parent_id = "CYCLES_RENDER_PT_sampling"
+
def draw_header_preset(self, context):
- CYCLES_PT_sampling_presets.draw_panel_header(self.layout)
+ CYCLES_PT_viewport_sampling_presets.draw_panel_header(self.layout)
def draw(self, context):
layout = self.layout
@@ -176,29 +155,31 @@ class CYCLES_RENDER_PT_sampling(CyclesButtonsPanel, Panel):
layout.use_property_split = True
layout.use_property_decorate = False
- if not use_optix(context):
- layout.prop(cscene, "progressive")
+ heading = layout.column(align=True, heading="Noise Threshold")
+ row = heading.row(align=True)
+ row.prop(cscene, "use_preview_adaptive_sampling", text="")
+ sub = row.row()
+ sub.active = cscene.use_preview_adaptive_sampling
+ sub.prop(cscene, "preview_adaptive_threshold", text="")
- if not use_branched_path(context):
+ if cscene.use_preview_adaptive_sampling:
col = layout.column(align=True)
- col.prop(cscene, "samples", text="Render")
- col.prop(cscene, "preview_samples", text="Viewport")
+ col.prop(cscene, "preview_samples", text=" Max Samples")
+ col.prop(cscene, "preview_adaptive_min_samples", text="Min Samples")
else:
- col = layout.column(align=True)
- col.prop(cscene, "aa_samples", text="Render")
- col.prop(cscene, "preview_aa_samples", text="Viewport")
+ layout.prop(cscene, "preview_samples", text="Samples")
- if not use_branched_path(context):
- draw_samples_info(layout, context)
+class CYCLES_RENDER_PT_sampling_viewport_denoise(CyclesButtonsPanel, Panel):
+ bl_label = "Denoise"
+ bl_parent_id = 'CYCLES_RENDER_PT_sampling_viewport'
+ bl_options = {'DEFAULT_CLOSED'}
-class CYCLES_RENDER_PT_sampling_sub_samples(CyclesButtonsPanel, Panel):
- bl_label = "Sub Samples"
- bl_parent_id = "CYCLES_RENDER_PT_sampling"
+ def draw_header(self, context):
+ scene = context.scene
+ cscene = scene.cycles
- @classmethod
- def poll(cls, context):
- return use_branched_path(context)
+ self.layout.prop(context.scene.cycles, "use_preview_denoising", text="")
def draw(self, context):
layout = self.layout
@@ -208,53 +189,61 @@ class CYCLES_RENDER_PT_sampling_sub_samples(CyclesButtonsPanel, Panel):
scene = context.scene
cscene = scene.cycles
- col = layout.column(align=True)
- col.prop(cscene, "diffuse_samples", text="Diffuse")
- col.prop(cscene, "glossy_samples", text="Glossy")
- col.prop(cscene, "transmission_samples", text="Transmission")
- col.prop(cscene, "ao_samples", text="AO")
+ col = layout.column()
+ col.active = cscene.use_preview_denoising
+ col.prop(cscene, "preview_denoiser", text="Denoiser")
+ col.prop(cscene, "preview_denoising_input_passes", text="Passes")
- sub = col.row(align=True)
- sub.active = use_sample_all_lights(context)
- sub.prop(cscene, "mesh_light_samples", text="Mesh Light")
- col.prop(cscene, "subsurface_samples", text="Subsurface")
- col.prop(cscene, "volume_samples", text="Volume")
+ effective_preview_denoiser = get_effective_preview_denoiser(context)
+ if effective_preview_denoiser == 'OPENIMAGEDENOISE':
+ col.prop(cscene, "preview_denoising_prefilter", text="Prefilter")
- draw_samples_info(layout, context)
+ col.prop(cscene, "preview_denoising_start_sample", text="Start Sample")
-class CYCLES_RENDER_PT_sampling_adaptive(CyclesButtonsPanel, Panel):
- bl_label = "Adaptive Sampling"
+class CYCLES_RENDER_PT_sampling_render(CyclesButtonsPanel, Panel):
+ bl_label = "Render"
bl_parent_id = "CYCLES_RENDER_PT_sampling"
- bl_options = {'DEFAULT_CLOSED'}
- def draw_header(self, context):
- layout = self.layout
- scene = context.scene
- cscene = scene.cycles
-
- layout.prop(cscene, "use_adaptive_sampling", text="")
+ def draw_header_preset(self, context):
+ CYCLES_PT_sampling_presets.draw_panel_header(self.layout)
def draw(self, context):
layout = self.layout
- layout.use_property_split = True
- layout.use_property_decorate = False
scene = context.scene
cscene = scene.cycles
- layout.active = cscene.use_adaptive_sampling
+ layout.use_property_split = True
+ layout.use_property_decorate = False
+
+ heading = layout.column(align=True, heading="Noise Threshold")
+ row = heading.row(align=True)
+ row.prop(cscene, "use_adaptive_sampling", text="")
+ sub = row.row()
+ sub.active = cscene.use_adaptive_sampling
+ sub.prop(cscene, "adaptive_threshold", text="")
col = layout.column(align=True)
- col.prop(cscene, "adaptive_threshold", text="Noise Threshold")
- col.prop(cscene, "adaptive_min_samples", text="Min Samples")
+ if cscene.use_adaptive_sampling:
+ col.prop(cscene, "samples", text=" Max Samples")
+ col.prop(cscene, "adaptive_min_samples", text="Min Samples")
+ else:
+ col.prop(cscene, "samples", text="Samples")
+ col.prop(cscene, "time_limit")
-class CYCLES_RENDER_PT_sampling_denoising(CyclesButtonsPanel, Panel):
- bl_label = "Denoising"
- bl_parent_id = "CYCLES_RENDER_PT_sampling"
+class CYCLES_RENDER_PT_sampling_render_denoise(CyclesButtonsPanel, Panel):
+ bl_label = "Denoise"
+ bl_parent_id = 'CYCLES_RENDER_PT_sampling_render'
bl_options = {'DEFAULT_CLOSED'}
+ def draw_header(self, context):
+ scene = context.scene
+ cscene = scene.cycles
+
+ self.layout.prop(context.scene.cycles, "use_denoising", text="")
+
def draw(self, context):
layout = self.layout
layout.use_property_split = True
@@ -263,33 +252,12 @@ class CYCLES_RENDER_PT_sampling_denoising(CyclesButtonsPanel, Panel):
scene = context.scene
cscene = scene.cycles
- heading = layout.column(align=True, heading="Render")
- row = heading.row(align=True)
- row.prop(cscene, "use_denoising", text="")
- sub = row.row()
-
- sub.active = cscene.use_denoising
- for view_layer in scene.view_layers:
- if view_layer.cycles.denoising_store_passes:
- sub.active = True
-
- sub.prop(cscene, "denoiser", text="")
-
- layout.separator()
-
- heading = layout.column(align=False, heading="Viewport")
- row = heading.row(align=True)
- row.prop(cscene, "use_preview_denoising", text="")
- sub = row.row()
- sub.active = cscene.use_preview_denoising
- sub.prop(cscene, "preview_denoiser", text="")
-
- sub = heading.row(align=True)
- sub.active = cscene.use_preview_denoising
- sub.prop(cscene, "preview_denoising_start_sample", text="Start Sample")
- sub = heading.row(align=True)
- sub.active = cscene.use_preview_denoising
- sub.prop(cscene, "preview_denoising_input_passes", text="Input Passes")
+ col = layout.column()
+ col.active = cscene.use_denoising
+ col.prop(cscene, "denoiser", text="Denoiser")
+ col.prop(cscene, "denoising_input_passes", text="Passes")
+ if cscene.denoiser == 'OPENIMAGEDENOISE':
+ col.prop(cscene, "denoising_prefilter", text="Prefilter")
class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel):
@@ -313,8 +281,6 @@ class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel):
col.active = not(cscene.use_adaptive_sampling)
col.prop(cscene, "sampling_pattern", text="Pattern")
- layout.prop(cscene, "use_square_samples")
-
layout.separator()
col = layout.column(align=True)
@@ -322,11 +288,6 @@ class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel):
col.prop(cscene, "min_transparent_bounces")
col.prop(cscene, "light_sampling_threshold", text="Light Threshold")
- if cscene.progressive != 'PATH' and use_branched_path(context):
- col = layout.column(align=True)
- col.prop(cscene, "sample_all_lights_direct")
- col.prop(cscene, "sample_all_lights_indirect")
-
for view_layer in scene.view_layers:
if view_layer.samples > 0:
layout.separator()
@@ -334,62 +295,6 @@ class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel):
break
-class CYCLES_RENDER_PT_sampling_total(CyclesButtonsPanel, Panel):
- bl_label = "Total Samples"
- bl_parent_id = "CYCLES_RENDER_PT_sampling"
-
- @classmethod
- def poll(cls, context):
- scene = context.scene
- cscene = scene.cycles
-
- if cscene.use_square_samples:
- return True
-
- return cscene.progressive != 'PATH' and use_branched_path(context)
-
- def draw(self, context):
- layout = self.layout
- cscene = context.scene.cycles
- integrator = cscene.progressive
-
- # Calculate sample values
- if integrator == 'PATH':
- aa = cscene.samples
- if cscene.use_square_samples:
- aa = aa * aa
- else:
- aa = cscene.aa_samples
- d = cscene.diffuse_samples
- g = cscene.glossy_samples
- t = cscene.transmission_samples
- ao = cscene.ao_samples
- ml = cscene.mesh_light_samples
- sss = cscene.subsurface_samples
- vol = cscene.volume_samples
-
- if cscene.use_square_samples:
- aa = aa * aa
- d = d * d
- g = g * g
- t = t * t
- ao = ao * ao
- ml = ml * ml
- sss = sss * sss
- vol = vol * vol
-
- col = layout.column(align=True)
- col.scale_y = 0.6
- if integrator == 'PATH':
- col.label(text="%s AA" % aa)
- else:
- col.label(text="%s AA, %s Diffuse, %s Glossy, %s Transmission" %
- (aa, d * aa, g * aa, t * aa))
- col.separator()
- col.label(text="%s AO, %s Mesh Light, %s Subsurface, %s Volume" %
- (ao * aa, ml * aa, sss * aa, vol * aa))
-
-
class CYCLES_RENDER_PT_subdivision(CyclesButtonsPanel, Panel):
bl_label = "Subdivision"
bl_options = {'DEFAULT_CLOSED'}
@@ -548,6 +453,8 @@ class CYCLES_RENDER_PT_light_paths_fast_gi(CyclesButtonsPanel, Panel):
layout.use_property_split = True
layout.use_property_decorate = False
+ layout.active = cscene.use_fast_gi
+
col = layout.column(align=True)
col.prop(cscene, "ao_bounces", text="Viewport Bounces")
col.prop(cscene, "ao_bounces_render", text="Render Bounces")
@@ -716,19 +623,13 @@ class CYCLES_RENDER_PT_performance_tiles(CyclesButtonsPanel, Panel):
layout.use_property_decorate = False
scene = context.scene
- rd = scene.render
cscene = scene.cycles
col = layout.column()
-
- sub = col.column(align=True)
- sub.prop(rd, "tile_x", text="Tiles X")
- sub.prop(rd, "tile_y", text="Y")
- col.prop(cscene, "tile_order", text="Order")
-
+ col.prop(cscene, "use_auto_tile")
sub = col.column()
- sub.active = not rd.use_save_buffers and not cscene.use_adaptive_sampling
- sub.prop(cscene, "use_progressive_refine")
+ sub.active = cscene.use_auto_tile
+ sub.prop(cscene, "tile_size")
class CYCLES_RENDER_PT_performance_acceleration_structure(CyclesButtonsPanel, Panel):
@@ -778,7 +679,6 @@ class CYCLES_RENDER_PT_performance_final_render(CyclesButtonsPanel, Panel):
col = layout.column()
- col.prop(rd, "use_save_buffers")
col.prop(rd, "use_persistent_data", text="Persistent Data")
@@ -797,7 +697,6 @@ class CYCLES_RENDER_PT_performance_viewport(CyclesButtonsPanel, Panel):
col = layout.column()
col.prop(rd, "preview_pixel_size", text="Pixel Size")
- col.prop(cscene, "preview_start_resolution", text="Start Pixels")
class CYCLES_RENDER_PT_filter(CyclesButtonsPanel, Panel):
@@ -818,7 +717,6 @@ class CYCLES_RENDER_PT_filter(CyclesButtonsPanel, Panel):
col = layout.column(heading="Include")
col.prop(view_layer, "use_sky", text="Environment")
- col.prop(view_layer, "use_ao", text="Ambient Occlusion")
col.prop(view_layer, "use_solid", text="Surfaces")
col.prop(view_layer, "use_strand", text="Hair")
col.prop(view_layer, "use_volumes", text="Volumes")
@@ -827,6 +725,9 @@ class CYCLES_RENDER_PT_filter(CyclesButtonsPanel, Panel):
sub = col.row()
sub.prop(view_layer, "use_motion_blur", text="Motion Blur")
sub.active = rd.use_motion_blur
+ sub = col.row()
+ sub.prop(view_layer.cycles, 'use_denoising', text='Denoising')
+ sub.active = scene.cycles.use_denoising
class CYCLES_RENDER_PT_override(CyclesButtonsPanel, Panel):
@@ -872,6 +773,7 @@ class CYCLES_RENDER_PT_passes_data(CyclesButtonsPanel, Panel):
col.prop(view_layer, "use_pass_combined")
col.prop(view_layer, "use_pass_z")
col.prop(view_layer, "use_pass_mist")
+ col.prop(view_layer, "use_pass_position")
col.prop(view_layer, "use_pass_normal")
sub = col.column()
sub.active = not rd.use_motion_blur
@@ -928,6 +830,7 @@ class CYCLES_RENDER_PT_passes_light(CyclesButtonsPanel, Panel):
col.prop(view_layer, "use_pass_environment")
col.prop(view_layer, "use_pass_shadow")
col.prop(view_layer, "use_pass_ambient_occlusion", text="Ambient Occlusion")
+ col.prop(cycles_view_layer, "use_pass_shadow_catcher")
class CYCLES_RENDER_PT_passes_crypto(CyclesButtonsPanel, ViewLayerCryptomattePanel, Panel):
@@ -942,70 +845,6 @@ class CYCLES_RENDER_PT_passes_aov(CyclesButtonsPanel, ViewLayerAOVPanel):
bl_parent_id = "CYCLES_RENDER_PT_passes"
-class CYCLES_RENDER_PT_denoising(CyclesButtonsPanel, Panel):
- bl_label = "Denoising"
- bl_context = "view_layer"
- bl_options = {'DEFAULT_CLOSED'}
-
- @classmethod
- def poll(cls, context):
- cscene = context.scene.cycles
- return CyclesButtonsPanel.poll(context) and cscene.use_denoising
-
- def draw_header(self, context):
- scene = context.scene
- view_layer = context.view_layer
- cycles_view_layer = view_layer.cycles
-
- layout = self.layout
- layout.prop(cycles_view_layer, "use_denoising", text="")
-
- def draw(self, context):
- layout = self.layout
- layout.use_property_split = True
- layout.use_property_decorate = False
-
- scene = context.scene
- view_layer = context.view_layer
- cycles_view_layer = view_layer.cycles
- denoiser = scene.cycles.denoiser
-
- layout.active = denoiser != 'NONE' and cycles_view_layer.use_denoising
-
- col = layout.column()
-
- if denoiser == 'OPTIX':
- col.prop(cycles_view_layer, "denoising_optix_input_passes")
- return
- elif denoiser == 'OPENIMAGEDENOISE':
- col.prop(cycles_view_layer, "denoising_openimagedenoise_input_passes")
- return
-
- col.prop(cycles_view_layer, "denoising_radius", text="Radius")
-
- col = layout.column()
- col.prop(cycles_view_layer, "denoising_strength", slider=True, text="Strength")
- col.prop(cycles_view_layer, "denoising_feature_strength", slider=True, text="Feature Strength")
- col.prop(cycles_view_layer, "denoising_relative_pca")
-
- layout.separator()
-
- col = layout.column()
- col.active = cycles_view_layer.use_denoising or cycles_view_layer.denoising_store_passes
-
- row = col.row(heading="Diffuse", align=True)
- row.prop(cycles_view_layer, "denoising_diffuse_direct", text="Direct", toggle=True)
- row.prop(cycles_view_layer, "denoising_diffuse_indirect", text="Indirect", toggle=True)
-
- row = col.row(heading="Glossy", align=True)
- row.prop(cycles_view_layer, "denoising_glossy_direct", text="Direct", toggle=True)
- row.prop(cycles_view_layer, "denoising_glossy_indirect", text="Indirect", toggle=True)
-
- row = col.row(heading="Transmission", align=True)
- row.prop(cycles_view_layer, "denoising_transmission_direct", text="Direct", toggle=True)
- row.prop(cycles_view_layer, "denoising_transmission_indirect", text="Indirect", toggle=True)
-
-
class CYCLES_PT_post_processing(CyclesButtonsPanel, Panel):
bl_label = "Post Processing"
bl_options = {'DEFAULT_CLOSED'}
@@ -1417,10 +1256,6 @@ class CYCLES_LIGHT_PT_light(CyclesButtonsPanel, Panel):
if not (light.type == 'AREA' and clamp.is_portal):
sub = col.column()
- if use_branched_path(context):
- subsub = sub.row(align=True)
- subsub.active = use_sample_all_lights(context)
- subsub.prop(clamp, "samples")
sub.prop(clamp, "max_bounces")
sub = col.column(align=True)
@@ -1526,34 +1361,6 @@ class CYCLES_WORLD_PT_volume(CyclesButtonsPanel, Panel):
panel_node_draw(layout, world, 'OUTPUT_WORLD', 'Volume')
-class CYCLES_WORLD_PT_ambient_occlusion(CyclesButtonsPanel, Panel):
- bl_label = "Ambient Occlusion"
- bl_context = "world"
- bl_options = {'DEFAULT_CLOSED'}
-
- @classmethod
- def poll(cls, context):
- return context.world and CyclesButtonsPanel.poll(context)
-
- def draw_header(self, context):
- light = context.world.light_settings
- self.layout.prop(light, "use_ambient_occlusion", text="")
-
- def draw(self, context):
- layout = self.layout
- layout.use_property_split = True
- layout.use_property_decorate = False
-
- light = context.world.light_settings
- scene = context.scene
-
- col = layout.column()
- sub = col.column()
- sub.active = light.use_ambient_occlusion or scene.render.use_simplify
- sub.prop(light, "ao_factor", text="Factor")
- col.prop(light, "distance", text="Distance")
-
-
class CYCLES_WORLD_PT_mist(CyclesButtonsPanel, Panel):
bl_label = "Mist Pass"
bl_context = "world"
@@ -1650,10 +1457,6 @@ class CYCLES_WORLD_PT_settings_surface(CyclesButtonsPanel, Panel):
subsub = sub.row(align=True)
subsub.active = cworld.sampling_method == 'MANUAL'
subsub.prop(cworld, "sample_map_resolution")
- if use_branched_path(context):
- subsub = sub.column(align=True)
- subsub.active = use_sample_all_lights(context)
- subsub.prop(cworld, "samples")
sub.prop(cworld, "max_bounces")
@@ -1677,8 +1480,7 @@ class CYCLES_WORLD_PT_settings_volume(CyclesButtonsPanel, Panel):
col = layout.column()
sub = col.column()
- sub.active = use_cpu(context)
- sub.prop(cworld, "volume_sampling", text="Sampling")
+ col.prop(cworld, "volume_sampling", text="Sampling")
col.prop(cworld, "volume_interpolation", text="Interpolation")
col.prop(cworld, "homogeneous_volume", text="Homogeneous")
sub = col.column()
@@ -1817,8 +1619,7 @@ class CYCLES_MATERIAL_PT_settings_volume(CyclesButtonsPanel, Panel):
col = layout.column()
sub = col.column()
- sub.active = use_cpu(context)
- sub.prop(cmat, "volume_sampling", text="Sampling")
+ col.prop(cmat, "volume_sampling", text="Sampling")
col.prop(cmat, "volume_interpolation", text="Interpolation")
col.prop(cmat, "homogeneous_volume", text="Homogeneous")
sub = col.column()
@@ -1845,9 +1646,6 @@ class CYCLES_RENDER_PT_bake(CyclesButtonsPanel, Panel):
cbk = scene.render.bake
rd = scene.render
- if use_optix(context):
- layout.label(text="Baking is performed using CUDA instead of OptiX", icon='INFO')
-
if rd.use_bake_multires:
layout.operator("object.bake_image", icon='RENDER_STILL')
layout.prop(rd, "use_bake_multires")
@@ -1905,7 +1703,6 @@ class CYCLES_RENDER_PT_bake_influence(CyclesButtonsPanel, Panel):
col.prop(cbk, "use_pass_diffuse")
col.prop(cbk, "use_pass_glossy")
col.prop(cbk, "use_pass_transmission")
- col.prop(cbk, "use_pass_ambient_occlusion")
col.prop(cbk, "use_pass_emit")
elif cscene.bake_type in {'DIFFUSE', 'GLOSSY', 'TRANSMISSION'}:
@@ -1989,19 +1786,12 @@ class CYCLES_RENDER_PT_bake_output(CyclesButtonsPanel, Panel):
layout.prop(cbk, "use_clear", text="Clear Image")
-class CYCLES_RENDER_PT_debug(CyclesButtonsPanel, Panel):
+class CYCLES_RENDER_PT_debug(CyclesDebugButtonsPanel, Panel):
bl_label = "Debug"
bl_context = "render"
bl_options = {'DEFAULT_CLOSED'}
COMPAT_ENGINES = {'CYCLES'}
- @classmethod
- def poll(cls, context):
- prefs = bpy.context.preferences
- return (CyclesButtonsPanel.poll(context)
- and prefs.experimental.use_cycles_debug
- and prefs.view.show_developer_ui)
-
def draw(self, context):
layout = self.layout
@@ -2018,29 +1808,18 @@ class CYCLES_RENDER_PT_debug(CyclesButtonsPanel, Panel):
row.prop(cscene, "debug_use_cpu_avx", toggle=True)
row.prop(cscene, "debug_use_cpu_avx2", toggle=True)
col.prop(cscene, "debug_bvh_layout")
- col.prop(cscene, "debug_use_cpu_split_kernel")
col.separator()
col = layout.column()
col.label(text="CUDA Flags:")
col.prop(cscene, "debug_use_cuda_adaptive_compile")
- col.prop(cscene, "debug_use_cuda_split_kernel")
col.separator()
col = layout.column()
col.label(text="OptiX Flags:")
- col.prop(cscene, "debug_optix_cuda_streams")
- col.prop(cscene, "debug_optix_curves_api")
-
- col.separator()
-
- col = layout.column()
- col.label(text="OpenCL Flags:")
- col.prop(cscene, "debug_opencl_device_type", text="Device")
- col.prop(cscene, "debug_use_opencl_debug", text="Debug")
- col.prop(cscene, "debug_opencl_mem_limit")
+ col.prop(cscene, "debug_use_optix_debug")
col.separator()
@@ -2141,20 +1920,22 @@ class CYCLES_RENDER_PT_simplify_culling(CyclesButtonsPanel, Panel):
sub.prop(cscene, "distance_cull_margin", text="")
-class CYCLES_VIEW3D_PT_shading_render_pass(Panel):
+class CyclesShadingButtonsPanel(CyclesButtonsPanel):
bl_space_type = 'VIEW_3D'
bl_region_type = 'HEADER'
- bl_label = "Render Pass"
bl_parent_id = 'VIEW3D_PT_shading'
- COMPAT_ENGINES = {'CYCLES'}
@classmethod
def poll(cls, context):
return (
- context.engine in cls.COMPAT_ENGINES and
+ CyclesButtonsPanel.poll(context) and
context.space_data.shading.type == 'RENDERED'
)
+
+class CYCLES_VIEW3D_PT_shading_render_pass(CyclesShadingButtonsPanel, Panel):
+ bl_label = "Render Pass"
+
def draw(self, context):
shading = context.space_data.shading
@@ -2162,6 +1943,26 @@ class CYCLES_VIEW3D_PT_shading_render_pass(Panel):
layout.prop(shading.cycles, "render_pass", text="")
+class CYCLES_VIEW3D_PT_shading_debug(CyclesDebugButtonsPanel,
+ CyclesShadingButtonsPanel,
+ Panel):
+ bl_label = "Debug"
+
+ @classmethod
+ def poll(cls, context):
+ return (
+ CyclesDebugButtonsPanel.poll(context) and
+ CyclesShadingButtonsPanel.poll(context)
+ )
+
+ def draw(self, context):
+ shading = context.space_data.shading
+
+ layout = self.layout
+ layout.active = context.scene.cycles.use_preview_adaptive_sampling
+ layout.prop(shading.cycles, "show_active_pixels")
+
+
class CYCLES_VIEW3D_PT_shading_lighting(Panel):
bl_space_type = 'VIEW_3D'
bl_region_type = 'HEADER'
@@ -2275,11 +2076,13 @@ def get_panels():
classes = (
CYCLES_PT_sampling_presets,
+ CYCLES_PT_viewport_sampling_presets,
CYCLES_PT_integrator_presets,
CYCLES_RENDER_PT_sampling,
- CYCLES_RENDER_PT_sampling_sub_samples,
- CYCLES_RENDER_PT_sampling_adaptive,
- CYCLES_RENDER_PT_sampling_denoising,
+ CYCLES_RENDER_PT_sampling_viewport,
+ CYCLES_RENDER_PT_sampling_viewport_denoise,
+ CYCLES_RENDER_PT_sampling_render,
+ CYCLES_RENDER_PT_sampling_render_denoise,
CYCLES_RENDER_PT_sampling_advanced,
CYCLES_RENDER_PT_light_paths,
CYCLES_RENDER_PT_light_paths_max_bounces,
@@ -2296,6 +2099,7 @@ classes = (
CYCLES_VIEW3D_PT_simplify_greasepencil,
CYCLES_VIEW3D_PT_shading_lighting,
CYCLES_VIEW3D_PT_shading_render_pass,
+ CYCLES_VIEW3D_PT_shading_debug,
CYCLES_RENDER_PT_motion_blur,
CYCLES_RENDER_PT_motion_blur_curve,
CYCLES_RENDER_PT_film,
@@ -2314,7 +2118,6 @@ classes = (
CYCLES_RENDER_PT_passes_aov,
CYCLES_RENDER_PT_filter,
CYCLES_RENDER_PT_override,
- CYCLES_RENDER_PT_denoising,
CYCLES_PT_post_processing,
CYCLES_CAMERA_PT_dof,
CYCLES_CAMERA_PT_dof_aperture,
@@ -2333,7 +2136,6 @@ classes = (
CYCLES_WORLD_PT_preview,
CYCLES_WORLD_PT_surface,
CYCLES_WORLD_PT_volume,
- CYCLES_WORLD_PT_ambient_occlusion,
CYCLES_WORLD_PT_mist,
CYCLES_WORLD_PT_ray_visibility,
CYCLES_WORLD_PT_settings,
diff --git a/intern/cycles/blender/addon/version_update.py b/intern/cycles/blender/addon/version_update.py
index 827f84b9873..57da7d7995c 100644
--- a/intern/cycles/blender/addon/version_update.py
+++ b/intern/cycles/blender/addon/version_update.py
@@ -109,7 +109,7 @@ def do_versions(self):
library_versions.setdefault(library.version, []).append(library)
# Do versioning per library, since they might have different versions.
- max_need_versioning = (2, 93, 7)
+ max_need_versioning = (3, 0, 25)
for version, libraries in library_versions.items():
if version > max_need_versioning:
continue
@@ -166,10 +166,6 @@ def do_versions(self):
if not cscene.is_property_set("filter_type"):
cscene.pixel_filter_type = 'GAUSSIAN'
- # Tile Order
- if not cscene.is_property_set("tile_order"):
- cscene.tile_order = 'CENTER'
-
if version <= (2, 76, 10):
cscene = scene.cycles
if cscene.is_property_set("filter_type"):
@@ -186,10 +182,6 @@ def do_versions(self):
if version <= (2, 79, 0):
cscene = scene.cycles
# Default changes
- if not cscene.is_property_set("aa_samples"):
- cscene.aa_samples = 4
- if not cscene.is_property_set("preview_aa_samples"):
- cscene.preview_aa_samples = 4
if not cscene.is_property_set("blur_glossy"):
cscene.blur_glossy = 0.0
if not cscene.is_property_set("sample_clamp_indirect"):
@@ -203,7 +195,6 @@ def do_versions(self):
view_layer.use_pass_cryptomatte_material = cview_layer.get("use_pass_crypto_material", False)
view_layer.use_pass_cryptomatte_asset = cview_layer.get("use_pass_crypto_asset", False)
view_layer.pass_cryptomatte_depth = cview_layer.get("pass_crypto_depth", 6)
- view_layer.use_pass_cryptomatte_accurate = cview_layer.get("pass_crypto_accurate", True)
if version <= (2, 93, 7):
if scene.render.engine == 'CYCLES':
@@ -229,6 +220,35 @@ def do_versions(self):
cscene.ao_bounces = 1
cscene.ao_bounces_render = 1
+ if version <= (3, 0, 25):
+ cscene = scene.cycles
+
+ # Default changes.
+ if not cscene.is_property_set("samples"):
+ cscene.samples = 128
+ if not cscene.is_property_set("preview_samples"):
+ cscene.preview_samples = 32
+ if not cscene.is_property_set("use_adaptive_sampling"):
+ cscene.use_adaptive_sampling = False
+ cscene.use_preview_adaptive_sampling = False
+ if not cscene.is_property_set("use_denoising"):
+ cscene.use_denoising = False
+ if not cscene.is_property_set("use_preview_denoising"):
+ cscene.use_preview_denoising = False
+ if not cscene.is_property_set("sampling_pattern"):
+ cscene.sampling_pattern = 'PROGRESSIVE_MUTI_JITTER'
+
+ # Removal of square samples.
+ cscene = scene.cycles
+ use_square_samples = cscene.get("use_square_samples", False)
+
+ if use_square_samples:
+ cscene.samples *= cscene.samples
+ cscene.preview_samples *= cscene.preview_samples
+ for layer in scene.view_layers:
+ layer.samples *= layer.samples
+ cscene["use_square_samples"] = False
+
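To make the effect of the square-samples removal concrete, a tiny standalone illustration (plain Python, not part of the versioning code): a file saved with Square Samples enabled keeps its effective sample count after conversion.

    old_samples, use_square_samples = 16, True
    new_samples = old_samples * old_samples if use_square_samples else old_samples
    assert new_samples == 256   # 16 squared, matching what the old option produced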
# Lamps
for light in bpy.data.lights:
if light.library not in libraries:
@@ -249,10 +269,6 @@ def do_versions(self):
if version <= (2, 76, 9):
cworld = world.cycles
- # World MIS Samples
- if not cworld.is_property_set("samples"):
- cworld.samples = 4
-
# World MIS Resolution
if not cworld.is_property_set("sample_map_resolution"):
cworld.sample_map_resolution = 256
diff --git a/intern/cycles/blender/blender_camera.cpp b/intern/cycles/blender/blender_camera.cpp
index 6954c5c2f26..4e8df5a99a6 100644
--- a/intern/cycles/blender/blender_camera.cpp
+++ b/intern/cycles/blender/blender_camera.cpp
@@ -894,12 +894,8 @@ void BlenderSync::sync_view(BL::SpaceView3D &b_v3d,
}
}
-BufferParams BlenderSync::get_buffer_params(BL::SpaceView3D &b_v3d,
- BL::RegionView3D &b_rv3d,
- Camera *cam,
- int width,
- int height,
- const bool use_denoiser)
+BufferParams BlenderSync::get_buffer_params(
+ BL::SpaceView3D &b_v3d, BL::RegionView3D &b_rv3d, Camera *cam, int width, int height)
{
BufferParams params;
bool use_border = false;
@@ -931,11 +927,6 @@ BufferParams BlenderSync::get_buffer_params(BL::SpaceView3D &b_v3d,
params.height = height;
}
- PassType display_pass = update_viewport_display_passes(b_v3d, params.passes);
-
- /* Can only denoise the combined image pass */
- params.denoising_data_pass = display_pass == PASS_COMBINED && use_denoiser;
-
return params;
}
diff --git a/intern/cycles/blender/blender_device.cpp b/intern/cycles/blender/blender_device.cpp
index d51b31de638..ce1770f18a3 100644
--- a/intern/cycles/blender/blender_device.cpp
+++ b/intern/cycles/blender/blender_device.cpp
@@ -25,7 +25,6 @@ CCL_NAMESPACE_BEGIN
enum ComputeDevice {
COMPUTE_DEVICE_CPU = 0,
COMPUTE_DEVICE_CUDA = 1,
- COMPUTE_DEVICE_OPENCL = 2,
COMPUTE_DEVICE_OPTIX = 3,
COMPUTE_DEVICE_NUM
@@ -68,13 +67,6 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen
device = Device::get_multi_device(devices, threads, background);
}
}
- else if (get_enum(cscene, "device") == 2) {
- /* Find network device. */
- vector<DeviceInfo> devices = Device::available_devices(DEVICE_MASK_NETWORK);
- if (!devices.empty()) {
- device = devices.front();
- }
- }
else if (get_enum(cscene, "device") == 1) {
/* Test if we are using GPU devices. */
ComputeDevice compute_device = (ComputeDevice)get_enum(
@@ -89,9 +81,6 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen
else if (compute_device == COMPUTE_DEVICE_OPTIX) {
mask |= DEVICE_MASK_OPTIX;
}
- else if (compute_device == COMPUTE_DEVICE_OPENCL) {
- mask |= DEVICE_MASK_OPENCL;
- }
vector<DeviceInfo> devices = Device::available_devices(mask);
/* Match device preferences and available devices. */
diff --git a/intern/cycles/blender/blender_geometry.cpp b/intern/cycles/blender/blender_geometry.cpp
index b1de37dac10..fca8cb9eda3 100644
--- a/intern/cycles/blender/blender_geometry.cpp
+++ b/intern/cycles/blender/blender_geometry.cpp
@@ -80,7 +80,9 @@ Geometry *BlenderSync::sync_geometry(BL::Depsgraph &b_depsgraph,
{
/* Test if we can instance or if the object is modified. */
Geometry::Type geom_type = determine_geom_type(b_ob_info, use_particle_hair);
- GeometryKey key(b_ob_info.object_data, geom_type);
+ BL::ID b_key_id = (BKE_object_is_modified(b_ob_info.real_object)) ? b_ob_info.real_object :
+ b_ob_info.object_data;
+ GeometryKey key(b_key_id.ptr.data, geom_type);
/* Find shader indices. */
array<Node *> used_shaders = find_used_shaders(b_ob_info.iter_object);
@@ -110,7 +112,7 @@ Geometry *BlenderSync::sync_geometry(BL::Depsgraph &b_depsgraph,
}
else {
/* Test if we need to update existing geometry. */
- sync = geometry_map.update(geom, b_ob_info.object_data);
+ sync = geometry_map.update(geom, b_key_id);
}
if (!sync) {
diff --git a/intern/cycles/blender/blender_gpu_display.cpp b/intern/cycles/blender/blender_gpu_display.cpp
new file mode 100644
index 00000000000..c5c3a2bd155
--- /dev/null
+++ b/intern/cycles/blender/blender_gpu_display.cpp
@@ -0,0 +1,787 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "blender/blender_gpu_display.h"
+
+#include "device/device.h"
+#include "util/util_logging.h"
+#include "util/util_opengl.h"
+
+extern "C" {
+struct RenderEngine;
+
+bool RE_engine_has_render_context(struct RenderEngine *engine);
+void RE_engine_render_context_enable(struct RenderEngine *engine);
+void RE_engine_render_context_disable(struct RenderEngine *engine);
+
+bool DRW_opengl_context_release();
+void DRW_opengl_context_activate(bool drw_state);
+
+void *WM_opengl_context_create();
+void WM_opengl_context_activate(void *gl_context);
+void WM_opengl_context_dispose(void *gl_context);
+void WM_opengl_context_release(void *context);
+}
+
+CCL_NAMESPACE_BEGIN
+
+/* --------------------------------------------------------------------
+ * BlenderDisplayShader.
+ */
+
+unique_ptr<BlenderDisplayShader> BlenderDisplayShader::create(BL::RenderEngine &b_engine,
+ BL::Scene &b_scene)
+{
+ if (b_engine.support_display_space_shader(b_scene)) {
+ return make_unique<BlenderDisplaySpaceShader>(b_engine, b_scene);
+ }
+
+ return make_unique<BlenderFallbackDisplayShader>();
+}
+
+int BlenderDisplayShader::get_position_attrib_location()
+{
+ if (position_attribute_location_ == -1) {
+ const uint shader_program = get_shader_program();
+ position_attribute_location_ = glGetAttribLocation(shader_program, position_attribute_name);
+ }
+ return position_attribute_location_;
+}
+
+int BlenderDisplayShader::get_tex_coord_attrib_location()
+{
+ if (tex_coord_attribute_location_ == -1) {
+ const uint shader_program = get_shader_program();
+ tex_coord_attribute_location_ = glGetAttribLocation(shader_program, tex_coord_attribute_name);
+ }
+ return tex_coord_attribute_location_;
+}
+
+/* --------------------------------------------------------------------
+ * BlenderFallbackDisplayShader.
+ */
+
+/* TODO move shaders to standalone .glsl file. */
+static const char *FALLBACK_VERTEX_SHADER =
+ "#version 330\n"
+ "uniform vec2 fullscreen;\n"
+ "in vec2 texCoord;\n"
+ "in vec2 pos;\n"
+ "out vec2 texCoord_interp;\n"
+ "\n"
+ "vec2 normalize_coordinates()\n"
+ "{\n"
+ " return (vec2(2.0) * (pos / fullscreen)) - vec2(1.0);\n"
+ "}\n"
+ "\n"
+ "void main()\n"
+ "{\n"
+ " gl_Position = vec4(normalize_coordinates(), 0.0, 1.0);\n"
+ " texCoord_interp = texCoord;\n"
+ "}\n\0";
+
+static const char *FALLBACK_FRAGMENT_SHADER =
+ "#version 330\n"
+ "uniform sampler2D image_texture;\n"
+ "in vec2 texCoord_interp;\n"
+ "out vec4 fragColor;\n"
+ "\n"
+ "void main()\n"
+ "{\n"
+ " fragColor = texture(image_texture, texCoord_interp);\n"
+ "}\n\0";
+
+static void shader_print_errors(const char *task, const char *log, const char *code)
+{
+ LOG(ERROR) << "Shader: " << task << " error:";
+ LOG(ERROR) << "===== shader string ====";
+
+ stringstream stream(code);
+ string partial;
+
+ int line = 1;
+ while (getline(stream, partial, '\n')) {
+ if (line < 10) {
+ LOG(ERROR) << " " << line << " " << partial;
+ }
+ else {
+ LOG(ERROR) << line << " " << partial;
+ }
+ line++;
+ }
+ LOG(ERROR) << log;
+}
+
+static int compile_fallback_shader(void)
+{
+ const struct Shader {
+ const char *source;
+ const GLenum type;
+ } shaders[2] = {{FALLBACK_VERTEX_SHADER, GL_VERTEX_SHADER},
+ {FALLBACK_FRAGMENT_SHADER, GL_FRAGMENT_SHADER}};
+
+ const GLuint program = glCreateProgram();
+
+ for (int i = 0; i < 2; i++) {
+ const GLuint shader = glCreateShader(shaders[i].type);
+
+ string source_str = shaders[i].source;
+ const char *c_str = source_str.c_str();
+
+ glShaderSource(shader, 1, &c_str, NULL);
+ glCompileShader(shader);
+
+ GLint compile_status;
+ glGetShaderiv(shader, GL_COMPILE_STATUS, &compile_status);
+
+ if (!compile_status) {
+ GLchar log[5000];
+ GLsizei length = 0;
+ glGetShaderInfoLog(shader, sizeof(log), &length, log);
+ shader_print_errors("compile", log, c_str);
+ return 0;
+ }
+
+ glAttachShader(program, shader);
+ }
+
+ /* Link output. */
+ glBindFragDataLocation(program, 0, "fragColor");
+
+ /* Link and error check. */
+ glLinkProgram(program);
+
+ /* TODO(sergey): Find a way to nicely de-duplicate the error checking. */
+ GLint link_status;
+ glGetProgramiv(program, GL_LINK_STATUS, &link_status);
+ if (!link_status) {
+ GLchar log[5000];
+ GLsizei length = 0;
+ /* Use the program info log here, since `program` is a program object, not a shader. */
+ glGetProgramInfoLog(program, sizeof(log), &length, log);
+ shader_print_errors("linking", log, FALLBACK_VERTEX_SHADER);
+ shader_print_errors("linking", log, FALLBACK_FRAGMENT_SHADER);
+ return 0;
+ }
+
+ return program;
+}
+
+void BlenderFallbackDisplayShader::bind(int width, int height)
+{
+ create_shader_if_needed();
+
+ if (!shader_program_) {
+ return;
+ }
+
+ glUseProgram(shader_program_);
+ glUniform1i(image_texture_location_, 0);
+ glUniform2f(fullscreen_location_, width, height);
+}
+
+void BlenderFallbackDisplayShader::unbind()
+{
+}
+
+uint BlenderFallbackDisplayShader::get_shader_program()
+{
+ return shader_program_;
+}
+
+void BlenderFallbackDisplayShader::create_shader_if_needed()
+{
+ if (shader_program_ || shader_compile_attempted_) {
+ return;
+ }
+
+ shader_compile_attempted_ = true;
+
+ shader_program_ = compile_fallback_shader();
+ if (!shader_program_) {
+ return;
+ }
+
+ glUseProgram(shader_program_);
+
+ image_texture_location_ = glGetUniformLocation(shader_program_, "image_texture");
+ if (image_texture_location_ < 0) {
+ LOG(ERROR) << "Shader doesn't contain the 'image_texture' uniform.";
+ destroy_shader();
+ return;
+ }
+
+ fullscreen_location_ = glGetUniformLocation(shader_program_, "fullscreen");
+ if (fullscreen_location_ < 0) {
+ LOG(ERROR) << "Shader doesn't contain the 'fullscreen' uniform.";
+ destroy_shader();
+ return;
+ }
+}
+
+void BlenderFallbackDisplayShader::destroy_shader()
+{
+ glDeleteProgram(shader_program_);
+ shader_program_ = 0;
+}
+
+/* --------------------------------------------------------------------
+ * BlenderDisplaySpaceShader.
+ */
+
+BlenderDisplaySpaceShader::BlenderDisplaySpaceShader(BL::RenderEngine &b_engine,
+ BL::Scene &b_scene)
+ : b_engine_(b_engine), b_scene_(b_scene)
+{
+ DCHECK(b_engine_.support_display_space_shader(b_scene_));
+}
+
+void BlenderDisplaySpaceShader::bind(int /*width*/, int /*height*/)
+{
+ b_engine_.bind_display_space_shader(b_scene_);
+}
+
+void BlenderDisplaySpaceShader::unbind()
+{
+ b_engine_.unbind_display_space_shader();
+}
+
+uint BlenderDisplaySpaceShader::get_shader_program()
+{
+ if (!shader_program_) {
+ glGetIntegerv(GL_CURRENT_PROGRAM, reinterpret_cast<int *>(&shader_program_));
+ }
+
+ if (!shader_program_) {
+ LOG(ERROR) << "Error retrieving shader program for display space shader.";
+ }
+
+ return shader_program_;
+}
+
+/* --------------------------------------------------------------------
+ * BlenderGPUDisplay.
+ */
+
+BlenderGPUDisplay::BlenderGPUDisplay(BL::RenderEngine &b_engine, BL::Scene &b_scene)
+ : b_engine_(b_engine), display_shader_(BlenderDisplayShader::create(b_engine, b_scene))
+{
+ /* Create context while on the main thread. */
+ gl_context_create();
+}
+
+BlenderGPUDisplay::~BlenderGPUDisplay()
+{
+ gl_resources_destroy();
+}
+
+/* --------------------------------------------------------------------
+ * Update procedure.
+ */
+
+bool BlenderGPUDisplay::do_update_begin(const GPUDisplayParams &params,
+ int texture_width,
+ int texture_height)
+{
+ /* Note that it's the responsibility of BlenderGPUDisplay to ensure updating and drawing
+ * the texture does not happen at the same time. This is achieved indirectly.
+ *
+ * When enabling the OpenGL context, it uses an internal mutex lock DST.gl_context_lock.
+ * This same lock is also held when do_draw() is called, which together ensure mutual
+ * exclusion.
+ *
+ * This locking is not performed at the GPU display level, because that would cause lock
+ * inversion. */
+ if (!gl_context_enable()) {
+ return false;
+ }
+
+ if (gl_render_sync_) {
+ glWaitSync((GLsync)gl_render_sync_, 0, GL_TIMEOUT_IGNORED);
+ }
+
+ if (!gl_texture_resources_ensure()) {
+ gl_context_disable();
+ return false;
+ }
+
+ /* Update texture dimensions if needed. */
+ if (texture_.width != texture_width || texture_.height != texture_height) {
+ glActiveTexture(GL_TEXTURE0);
+ glBindTexture(GL_TEXTURE_2D, texture_.gl_id);
+ glTexImage2D(
+ GL_TEXTURE_2D, 0, GL_RGBA16F, texture_width, texture_height, 0, GL_RGBA, GL_HALF_FLOAT, 0);
+ texture_.width = texture_width;
+ texture_.height = texture_height;
+ glBindTexture(GL_TEXTURE_2D, 0);
+
+ /* Texture did change, and no pixel storage was provided. Tag for an explicit zeroing out to
+ * avoid undefined content. */
+ texture_.need_clear = true;
+ }
+
+ /* Update PBO dimensions if needed.
+ *
+ * NOTE: Allocate the PBO for the size which will fit the final render resolution (as in,
+ * at a resolution divider of 1). This way we don't need to recreate graphics interoperability
+ * objects, which are costly and which are tied to the specific underlying buffer size.
+ * The downside of this approach is that when graphics interoperability is not used we are
+ * sending too much data to GPU when resolution divider is not 1. */
+ /* TODO(sergey): Investigate whether keeping the PBO exact size of the texture makes non-interop
+ * mode faster. */
+ const int buffer_width = params.full_size.x;
+ const int buffer_height = params.full_size.y;
+ if (texture_.buffer_width != buffer_width || texture_.buffer_height != buffer_height) {
+ const size_t size_in_bytes = sizeof(half4) * buffer_width * buffer_height;
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, texture_.gl_pbo_id);
+ glBufferData(GL_PIXEL_UNPACK_BUFFER, size_in_bytes, 0, GL_DYNAMIC_DRAW);
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+
+ texture_.buffer_width = buffer_width;
+ texture_.buffer_height = buffer_height;
+ }
+
+ /* New content will be provided to the texture in one way or another, so mark this in a
+ * centralized place. */
+ texture_.need_update = true;
+
+ return true;
+}
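As a concrete illustration of the sizing note above (the numbers are hypothetical and assume half4 is four 16-bit floats, i.e. 8 bytes):

/* With a 1920x1080 final render and a resolution divider of 2:
 *   texture storage:  960 *  540 * sizeof(half4) =  4,147,200 bytes (~4 MiB)
 *   PBO storage:     1920 * 1080 * sizeof(half4) = 16,588,800 bytes (~16 MiB)
 * The PBO stays at the full-resolution size, so graphics interop objects bound to it survive
 * divider changes; the non-interop path pays for this with a larger upload (4x here). */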
+
+void BlenderGPUDisplay::do_update_end()
+{
+ gl_upload_sync_ = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
+ glFlush();
+
+ gl_context_disable();
+}
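The fence sync created here and the one created at the end of do_draw() interlock the upload and draw sides; a conceptual ordering sketch (not part of the patch), with one update thread U and one draw thread D:

/*   U: do_update_begin()  ->  glWaitSync(gl_render_sync_)    wait for the previous draw
 *   U: write pixels into the PBO / texture
 *   U: do_update_end()    ->  gl_upload_sync_ = glFenceSync(...); glFlush()
 *   D: do_draw()          ->  glWaitSync(gl_upload_sync_)    wait for the previous upload
 *   D: draw the textured quad
 *   D: end of do_draw()   ->  gl_render_sync_ = glFenceSync(...); glFlush() */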
+
+/* --------------------------------------------------------------------
+ * Texture update from CPU buffer.
+ */
+
+void BlenderGPUDisplay::do_copy_pixels_to_texture(
+ const half4 *rgba_pixels, int texture_x, int texture_y, int pixels_width, int pixels_height)
+{
+ /* This call copies pixels into a Pixel Buffer Object (PBO), which is much cheaper in terms of
+ * CPU time than copying data directly into the OpenGL texture.
+ *
+ * The possible downside of this approach is that it might require higher peak memory when
+ * doing partial updates of the texture (although, in practice, even partial updates might peak
+ * with a full-frame buffer stored on the CPU if the GPU is currently occupied). */
+
+ half4 *mapped_rgba_pixels = map_texture_buffer();
+ if (!mapped_rgba_pixels) {
+ return;
+ }
+
+ if (texture_x == 0 && texture_y == 0 && pixels_width == texture_.width &&
+ pixels_height == texture_.height) {
+ const size_t size_in_bytes = sizeof(half4) * texture_.width * texture_.height;
+ memcpy(mapped_rgba_pixels, rgba_pixels, size_in_bytes);
+ }
+ else {
+ const half4 *rgba_row = rgba_pixels;
+ half4 *mapped_rgba_row = mapped_rgba_pixels + texture_y * texture_.width + texture_x;
+ for (int y = 0; y < pixels_height;
+ ++y, rgba_row += pixels_width, mapped_rgba_row += texture_.width) {
+ memcpy(mapped_rgba_row, rgba_row, sizeof(half4) * pixels_width);
+ }
+ }
+
+ unmap_texture_buffer();
+}
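The offset arithmetic in the partial-update branch, with hypothetical numbers for illustration:

/* Suppose texture_.width == 64 and a 16x4 tile is copied at texture_x == 16, texture_y == 8.
 * The first destination row starts at mapped_rgba_pixels + 8 * 64 + 16 (528 half4 elements in);
 * each loop iteration then advances the destination by texture_.width (64) elements and the
 * source by pixels_width (16) elements, copying 16 * sizeof(half4) bytes per row for 4 rows. */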
+
+/* --------------------------------------------------------------------
+ * Texture buffer mapping.
+ */
+
+half4 *BlenderGPUDisplay::do_map_texture_buffer()
+{
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, texture_.gl_pbo_id);
+
+ half4 *mapped_rgba_pixels = reinterpret_cast<half4 *>(
+ glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_WRITE_ONLY));
+ if (!mapped_rgba_pixels) {
+ LOG(ERROR) << "Error mapping BlenderGPUDisplay pixel buffer object.";
+ }
+
+ if (texture_.need_clear) {
+ const int64_t texture_width = texture_.width;
+ const int64_t texture_height = texture_.height;
+ memset(reinterpret_cast<void *>(mapped_rgba_pixels),
+ 0,
+ texture_width * texture_height * sizeof(half4));
+ texture_.need_clear = false;
+ }
+
+ return mapped_rgba_pixels;
+}
+
+void BlenderGPUDisplay::do_unmap_texture_buffer()
+{
+ glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
+
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+}
+
+/* --------------------------------------------------------------------
+ * Graphics interoperability.
+ */
+
+DeviceGraphicsInteropDestination BlenderGPUDisplay::do_graphics_interop_get()
+{
+ DeviceGraphicsInteropDestination interop_dst;
+
+ interop_dst.buffer_width = texture_.buffer_width;
+ interop_dst.buffer_height = texture_.buffer_height;
+ interop_dst.opengl_pbo_id = texture_.gl_pbo_id;
+
+ interop_dst.need_clear = texture_.need_clear;
+ texture_.need_clear = false;
+
+ return interop_dst;
+}
+
+void BlenderGPUDisplay::graphics_interop_activate()
+{
+ gl_context_enable();
+}
+
+void BlenderGPUDisplay::graphics_interop_deactivate()
+{
+ gl_context_disable();
+}
+
+/* --------------------------------------------------------------------
+ * Drawing.
+ */
+
+void BlenderGPUDisplay::clear()
+{
+ texture_.need_clear = true;
+}
+
+void BlenderGPUDisplay::set_zoom(float zoom_x, float zoom_y)
+{
+ zoom_ = make_float2(zoom_x, zoom_y);
+}
+
+void BlenderGPUDisplay::do_draw(const GPUDisplayParams &params)
+{
+ /* See do_update_begin() for why no locking is required here. */
+ const bool transparent = true; // TODO(sergey): Derive this from Film.
+
+ if (texture_.need_clear) {
+ /* The texture was requested to be cleared and has not been cleared yet.
+ * Return early, which should be equivalent to drawing an all-zero texture. */
+ return;
+ }
+
+ if (!gl_draw_resources_ensure()) {
+ return;
+ }
+
+ if (use_gl_context_) {
+ gl_context_mutex_.lock();
+ }
+
+ if (gl_upload_sync_) {
+ glWaitSync((GLsync)gl_upload_sync_, 0, GL_TIMEOUT_IGNORED);
+ }
+
+ if (transparent) {
+ glEnable(GL_BLEND);
+ glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
+ }
+
+ display_shader_->bind(params.full_size.x, params.full_size.y);
+
+ glActiveTexture(GL_TEXTURE0);
+ glBindTexture(GL_TEXTURE_2D, texture_.gl_id);
+
+ /* Trick to keep sharp rendering without jagged edges on all GPUs.
+ *
+ * The idea here is to force the driver to use linear interpolation when the image is not
+ * zoomed in, while for a render result with a resolution divider in effect we always use
+ * nearest interpolation.
+ *
+ * Use an explicit MIN filter assignment to make sure the driver does not exhibit undefined
+ * behavior at zoom level 1. The MAG filter is always NEAREST. */
+ const float zoomed_width = params.size.x * zoom_.x;
+ const float zoomed_height = params.size.y * zoom_.y;
+ if (texture_.width != params.size.x || texture_.height != params.size.y) {
+ /* Resolution divider is different from 1, force nearest interpolation. */
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+ }
+ else if (zoomed_width - params.size.x > 0.5f || zoomed_height - params.size.y > 0.5f) {
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+ }
+ else {
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+ }
+
+ glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer_);
+
+ texture_update_if_needed();
+ vertex_buffer_update(params);
+
+ /* TODO(sergey): Does it make sense/possible to cache/reuse the VAO? */
+ GLuint vertex_array_object;
+ glGenVertexArrays(1, &vertex_array_object);
+ glBindVertexArray(vertex_array_object);
+
+ const int texcoord_attribute = display_shader_->get_tex_coord_attrib_location();
+ const int position_attribute = display_shader_->get_position_attrib_location();
+
+ glEnableVertexAttribArray(texcoord_attribute);
+ glEnableVertexAttribArray(position_attribute);
+
+ glVertexAttribPointer(
+ texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0);
+ glVertexAttribPointer(position_attribute,
+ 2,
+ GL_FLOAT,
+ GL_FALSE,
+ 4 * sizeof(float),
+ (const GLvoid *)(sizeof(float) * 2));
+
+ glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
+
+ glBindBuffer(GL_ARRAY_BUFFER, 0);
+ glBindTexture(GL_TEXTURE_2D, 0);
+
+ glDeleteVertexArrays(1, &vertex_array_object);
+
+ display_shader_->unbind();
+
+ if (transparent) {
+ glDisable(GL_BLEND);
+ }
+
+ gl_render_sync_ = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
+ glFlush();
+
+ if (use_gl_context_) {
+ gl_context_mutex_.unlock();
+ }
+}
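A few concrete cases of the MIN-filter selection above (illustrative values only):

/* With params.size == (960, 540) and zoom_ == (1.0, 1.0):
 *   texture 480x270 (resolution divider 2)  -> NEAREST  (dimensions differ from params.size)
 *   texture 960x540                         -> LINEAR   (zoomed size matches within 0.5 pixel)
 * With params.size == (960, 540), texture 960x540 and zoom_ == (2.0, 2.0):
 *   zoomed size is 1920x1080                -> NEAREST  (zoomed in by more than half a pixel) */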
+
+void BlenderGPUDisplay::gl_context_create()
+{
+ /* When rendering in the viewport there is no render context available via the engine.
+ * Check whether our own context is to be created here.
+ *
+ * NOTE: If the `b_engine_`'s context is not available, we are expected to be on a main thread
+ * here. */
+ use_gl_context_ = !RE_engine_has_render_context(
+ reinterpret_cast<RenderEngine *>(b_engine_.ptr.data));
+
+ if (use_gl_context_) {
+ const bool drw_state = DRW_opengl_context_release();
+ gl_context_ = WM_opengl_context_create();
+ if (gl_context_) {
+ /* On Windows an old context is restored after creation, and a subsequent release of the
+ * context generates a Win32 error. This is harmless for users, but it is annoying to have
+ * possibly misleading error prints in the console. */
+#ifndef _WIN32
+ WM_opengl_context_release(gl_context_);
+#endif
+ }
+ else {
+ LOG(ERROR) << "Error creating OpenGL context.";
+ }
+
+ DRW_opengl_context_activate(drw_state);
+ }
+}
+
+bool BlenderGPUDisplay::gl_context_enable()
+{
+ if (use_gl_context_) {
+ if (!gl_context_) {
+ return false;
+ }
+ gl_context_mutex_.lock();
+ WM_opengl_context_activate(gl_context_);
+ return true;
+ }
+
+ RE_engine_render_context_enable(reinterpret_cast<RenderEngine *>(b_engine_.ptr.data));
+ return true;
+}
+
+void BlenderGPUDisplay::gl_context_disable()
+{
+ if (use_gl_context_) {
+ if (gl_context_) {
+ WM_opengl_context_release(gl_context_);
+ gl_context_mutex_.unlock();
+ }
+ return;
+ }
+
+ RE_engine_render_context_disable(reinterpret_cast<RenderEngine *>(b_engine_.ptr.data));
+}
+
+void BlenderGPUDisplay::gl_context_dispose()
+{
+ if (gl_context_) {
+ const bool drw_state = DRW_opengl_context_release();
+
+ WM_opengl_context_activate(gl_context_);
+ WM_opengl_context_dispose(gl_context_);
+
+ DRW_opengl_context_activate(drw_state);
+ }
+}
+
+bool BlenderGPUDisplay::gl_draw_resources_ensure()
+{
+ if (!texture_.gl_id) {
+ /* If there is no texture allocated, there is nothing to draw. Inform the draw call that it
+ * can not continue. Note that this is not an unrecoverable error: once the texture is known
+ * we will come back here and create all the GPU resources needed for drawing. */
+ return false;
+ }
+
+ if (gl_draw_resource_creation_attempted_) {
+ return gl_draw_resources_created_;
+ }
+ gl_draw_resource_creation_attempted_ = true;
+
+ if (!vertex_buffer_) {
+ glGenBuffers(1, &vertex_buffer_);
+ if (!vertex_buffer_) {
+ LOG(ERROR) << "Error creating vertex buffer.";
+ return false;
+ }
+ }
+
+ gl_draw_resources_created_ = true;
+
+ return true;
+}
+
+void BlenderGPUDisplay::gl_resources_destroy()
+{
+ gl_context_enable();
+
+ if (vertex_buffer_ != 0) {
+ glDeleteBuffers(1, &vertex_buffer_);
+ }
+
+ if (texture_.gl_pbo_id) {
+ glDeleteBuffers(1, &texture_.gl_pbo_id);
+ texture_.gl_pbo_id = 0;
+ }
+
+ if (texture_.gl_id) {
+ glDeleteTextures(1, &texture_.gl_id);
+ texture_.gl_id = 0;
+ }
+
+ gl_context_disable();
+
+ gl_context_dispose();
+}
+
+bool BlenderGPUDisplay::gl_texture_resources_ensure()
+{
+ if (texture_.creation_attempted) {
+ return texture_.is_created;
+ }
+ texture_.creation_attempted = true;
+
+ DCHECK(!texture_.gl_id);
+ DCHECK(!texture_.gl_pbo_id);
+
+ /* Create texture. */
+ glGenTextures(1, &texture_.gl_id);
+ if (!texture_.gl_id) {
+ LOG(ERROR) << "Error creating texture.";
+ return false;
+ }
+
+ /* Configure the texture. */
+ glActiveTexture(GL_TEXTURE0);
+ glBindTexture(GL_TEXTURE_2D, texture_.gl_id);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+ glBindTexture(GL_TEXTURE_2D, 0);
+
+ /* Create PBO for the texture. */
+ glGenBuffers(1, &texture_.gl_pbo_id);
+ if (!texture_.gl_pbo_id) {
+ LOG(ERROR) << "Error creating texture pixel buffer object.";
+ return false;
+ }
+
+ /* Creation finished with a success. */
+ texture_.is_created = true;
+
+ return true;
+}
+
+void BlenderGPUDisplay::texture_update_if_needed()
+{
+ if (!texture_.need_update) {
+ return;
+ }
+
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, texture_.gl_pbo_id);
+ glTexSubImage2D(
+ GL_TEXTURE_2D, 0, 0, 0, texture_.width, texture_.height, GL_RGBA, GL_HALF_FLOAT, 0);
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+
+ texture_.need_update = false;
+}
+
+void BlenderGPUDisplay::vertex_buffer_update(const GPUDisplayParams &params)
+{
+ /* Invalidate old contents - avoids stalling if the buffer is still waiting in queue to be
+ * rendered. */
+ glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);
+
+ float *vpointer = reinterpret_cast<float *>(glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY));
+ if (!vpointer) {
+ return;
+ }
+
+ vpointer[0] = 0.0f;
+ vpointer[1] = 0.0f;
+ vpointer[2] = params.offset.x;
+ vpointer[3] = params.offset.y;
+
+ vpointer[4] = 1.0f;
+ vpointer[5] = 0.0f;
+ vpointer[6] = (float)params.size.x + params.offset.x;
+ vpointer[7] = params.offset.y;
+
+ vpointer[8] = 1.0f;
+ vpointer[9] = 1.0f;
+ vpointer[10] = (float)params.size.x + params.offset.x;
+ vpointer[11] = (float)params.size.y + params.offset.y;
+
+ vpointer[12] = 0.0f;
+ vpointer[13] = 1.0f;
+ vpointer[14] = params.offset.x;
+ vpointer[15] = (float)params.size.y + params.offset.y;
+
+ glUnmapBuffer(GL_ARRAY_BUFFER);
+}
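The 16 floats written above form four interleaved vertices; their layout matches the attribute pointers set up in do_draw():

/* Per vertex: [ u, v, x, y ]  -> stride of 4 * sizeof(float) == 16 bytes, with the texture
 * coordinate attribute at byte offset 0 and the position attribute at byte offset 8.
 * The triangle fan covers the rectangle (offset.x, offset.y) .. (offset.x + size.x,
 * offset.y + size.y) with texture coordinates (0,0), (1,0), (1,1), (0,1). */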
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/blender/blender_gpu_display.h b/intern/cycles/blender/blender_gpu_display.h
new file mode 100644
index 00000000000..89420567037
--- /dev/null
+++ b/intern/cycles/blender/blender_gpu_display.h
@@ -0,0 +1,215 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <atomic>
+
+#include "MEM_guardedalloc.h"
+
+#include "RNA_blender_cpp.h"
+
+#include "render/gpu_display.h"
+#include "util/util_unique_ptr.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Base class of shader used for GPU display rendering. */
+class BlenderDisplayShader {
+ public:
+ static constexpr const char *position_attribute_name = "pos";
+ static constexpr const char *tex_coord_attribute_name = "texCoord";
+
+ /* Create shader implementation suitable for the given render engine and scene configuration. */
+ static unique_ptr<BlenderDisplayShader> create(BL::RenderEngine &b_engine, BL::Scene &b_scene);
+
+ BlenderDisplayShader() = default;
+ virtual ~BlenderDisplayShader() = default;
+
+ virtual void bind(int width, int height) = 0;
+ virtual void unbind() = 0;
+
+ /* Get attribute location for position and texture coordinate respectively.
+ * NOTE: The shader needs to be bound to have access to those. */
+ virtual int get_position_attrib_location();
+ virtual int get_tex_coord_attrib_location();
+
+ protected:
+ /* Get program of this display shader.
+ * NOTE: The shader needs to be bound to have access to this. */
+ virtual uint get_shader_program() = 0;
+
+ /* Cached values of various OpenGL resources. */
+ int position_attribute_location_ = -1;
+ int tex_coord_attribute_location_ = -1;
+};
+
+/* Implementation of the display rendering shader used when the render engine does not support
+ * a display space shader. */
+class BlenderFallbackDisplayShader : public BlenderDisplayShader {
+ public:
+ virtual void bind(int width, int height) override;
+ virtual void unbind() override;
+
+ protected:
+ virtual uint get_shader_program() override;
+
+ void create_shader_if_needed();
+ void destroy_shader();
+
+ uint shader_program_ = 0;
+ int image_texture_location_ = -1;
+ int fullscreen_location_ = -1;
+
+ /* Shader compilation has been attempted, which means that if the shader program is 0 then
+ * compilation or linking has failed. Do not attempt to re-compile the shader. */
+ bool shader_compile_attempted_ = false;
+};
+
+class BlenderDisplaySpaceShader : public BlenderDisplayShader {
+ public:
+ BlenderDisplaySpaceShader(BL::RenderEngine &b_engine, BL::Scene &b_scene);
+
+ virtual void bind(int width, int height) override;
+ virtual void unbind() override;
+
+ protected:
+ virtual uint get_shader_program() override;
+
+ BL::RenderEngine b_engine_;
+ BL::Scene &b_scene_;
+
+ /* Cached values of various OpenGL resources. */
+ uint shader_program_ = 0;
+};
+
+/* GPU display implementation which is specific to the Blender viewport integration. */
+class BlenderGPUDisplay : public GPUDisplay {
+ public:
+ BlenderGPUDisplay(BL::RenderEngine &b_engine, BL::Scene &b_scene);
+ ~BlenderGPUDisplay();
+
+ virtual void graphics_interop_activate() override;
+ virtual void graphics_interop_deactivate() override;
+
+ virtual void clear() override;
+
+ void set_zoom(float zoom_x, float zoom_y);
+
+ protected:
+ virtual bool do_update_begin(const GPUDisplayParams &params,
+ int texture_width,
+ int texture_height) override;
+ virtual void do_update_end() override;
+
+ virtual void do_copy_pixels_to_texture(const half4 *rgba_pixels,
+ int texture_x,
+ int texture_y,
+ int pixels_width,
+ int pixels_height) override;
+ virtual void do_draw(const GPUDisplayParams &params) override;
+
+ virtual half4 *do_map_texture_buffer() override;
+ virtual void do_unmap_texture_buffer() override;
+
+ virtual DeviceGraphicsInteropDestination do_graphics_interop_get() override;
+
+ /* Helper function which allocates a new GPU context. */
+ void gl_context_create();
+ bool gl_context_enable();
+ void gl_context_disable();
+ void gl_context_dispose();
+
+ /* Make sure texture is allocated and its initial configuration is performed. */
+ bool gl_texture_resources_ensure();
+
+ /* Ensure all runtime GPU resources needed for drawing are allocated.
+ * Returns true if all resources needed for drawing are available. */
+ bool gl_draw_resources_ensure();
+
+ /* Destroy all GPU resources which are being used by this object. */
+ void gl_resources_destroy();
+
+ /* Update GPU texture dimensions and content if needed (new pixel data was provided).
+ *
+ * NOTE: The texture needs to be bound. */
+ void texture_update_if_needed();
+
+ /* Update vertex buffer with new coordinates of vertex positions and texture coordinates.
+ * This buffer is used to render texture in the viewport.
+ *
+ * NOTE: The buffer needs to be bound. */
+ void vertex_buffer_update(const GPUDisplayParams &params);
+
+ BL::RenderEngine b_engine_;
+
+ /* OpenGL context which is used when the render engine doesn't have its own. */
+ void *gl_context_ = nullptr;
+ /* True when the Blender RenderEngine side context is not available and the GPUDisplay is to
+ * create its own context. */
+ bool use_gl_context_ = false;
+ /* Mutex used to guard the `gl_context_`. */
+ thread_mutex gl_context_mutex_;
+
+ /* Texture which contains pixels of the render result. */
+ struct {
+ /* Indicates whether texture creation was attempted and whether it succeeded.
+ * Used to avoid multiple attempts at texture creation in the case of GPU issues or GPU
+ * context misconfiguration. */
+ bool creation_attempted = false;
+ bool is_created = false;
+
+ /* OpenGL resource IDs of the texture itself and Pixel Buffer Object (PBO) used to write
+ * pixels to it.
+ *
+ * NOTE: Allocated on the engine's context. */
+ uint gl_id = 0;
+ uint gl_pbo_id = 0;
+
+ /* Is true when new data was written to the PBO, meaning the texture might need to be resized
+ * and new data uploaded to the GPU. */
+ bool need_update = false;
+
+ /* Content of the texture is to be filled with zeroes. */
+ std::atomic<bool> need_clear = true;
+
+ /* Dimensions of the texture in pixels. */
+ int width = 0;
+ int height = 0;
+
+ /* Dimensions of the underlying PBO. */
+ int buffer_width = 0;
+ int buffer_height = 0;
+ } texture_;
+
+ unique_ptr<BlenderDisplayShader> display_shader_;
+
+ /* Keeps track of whether creation of the GPU draw resources was attempted, to avoid re-trying
+ * the creation on every redraw after a failure. */
+ bool gl_draw_resource_creation_attempted_ = false;
+ bool gl_draw_resources_created_ = false;
+
+ /* Vertex buffer which holds vertices of a triangle fan which is textured with the texture
+ * holding the render result. */
+ uint vertex_buffer_ = 0;
+
+ void *gl_render_sync_ = nullptr;
+ void *gl_upload_sync_ = nullptr;
+
+ float2 zoom_ = make_float2(1.0f, 1.0f);
+};
+
+CCL_NAMESPACE_END
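For reference, a minimal sketch of how a session hooks this display up, mirroring the change to BlenderSession::create_session() later in this patch (the Session::set_gpu_display() call is taken from there; the zoom values are placeholders):

  unique_ptr<BlenderGPUDisplay> gpu_display = make_unique<BlenderGPUDisplay>(b_engine, b_scene);
  gpu_display->set_zoom(1.0f, 1.0f);           /* Viewport zoom, when known. */
  session->set_gpu_display(move(gpu_display)); /* Session takes ownership of the display. */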
diff --git a/intern/cycles/blender/blender_light.cpp b/intern/cycles/blender/blender_light.cpp
index 542028f4b2f..4df1e720dde 100644
--- a/intern/cycles/blender/blender_light.cpp
+++ b/intern/cycles/blender/blender_light.cpp
@@ -125,17 +125,10 @@ void BlenderSync::sync_light(BL::Object &b_parent,
light->set_shader(static_cast<Shader *>(used_shaders[0]));
/* shadow */
- PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
PointerRNA clight = RNA_pointer_get(&b_light.ptr, "cycles");
light->set_cast_shadow(get_boolean(clight, "cast_shadow"));
light->set_use_mis(get_boolean(clight, "use_multiple_importance_sampling"));
- int samples = get_int(clight, "samples");
- if (get_boolean(cscene, "use_square_samples"))
- light->set_samples(samples * samples);
- else
- light->set_samples(samples);
-
light->set_max_bounces(get_int(clight, "max_bounces"));
if (b_ob_info.real_object != b_ob_info.iter_object) {
@@ -155,10 +148,12 @@ void BlenderSync::sync_light(BL::Object &b_parent,
/* visibility */
uint visibility = object_ray_visibility(b_ob_info.real_object);
+ light->set_use_camera((visibility & PATH_RAY_CAMERA) != 0);
light->set_use_diffuse((visibility & PATH_RAY_DIFFUSE) != 0);
light->set_use_glossy((visibility & PATH_RAY_GLOSSY) != 0);
light->set_use_transmission((visibility & PATH_RAY_TRANSMIT) != 0);
light->set_use_scatter((visibility & PATH_RAY_VOLUME_SCATTER) != 0);
+ light->set_is_shadow_catcher(b_ob_info.real_object.is_shadow_catcher());
/* tag */
light->tag_update(scene);
@@ -169,7 +164,6 @@ void BlenderSync::sync_background_light(BL::SpaceView3D &b_v3d, bool use_portal)
BL::World b_world = b_scene.world();
if (b_world) {
- PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
PointerRNA cworld = RNA_pointer_get(&b_world.ptr, "cycles");
enum SamplingMethod { SAMPLING_NONE = 0, SAMPLING_AUTOMATIC, SAMPLING_MANUAL, SAMPLING_NUM };
@@ -197,12 +191,6 @@ void BlenderSync::sync_background_light(BL::SpaceView3D &b_v3d, bool use_portal)
/* force enable light again when world is resynced */
light->set_is_enabled(true);
- int samples = get_int(cworld, "samples");
- if (get_boolean(cscene, "use_square_samples"))
- light->set_samples(samples * samples);
- else
- light->set_samples(samples);
-
light->tag_update(scene);
light_map.set_recalc(b_world);
}
@@ -211,7 +199,7 @@ void BlenderSync::sync_background_light(BL::SpaceView3D &b_v3d, bool use_portal)
world_map = b_world.ptr.data;
world_recalc = false;
- viewport_parameters = BlenderViewportParameters(b_v3d);
+ viewport_parameters = BlenderViewportParameters(b_v3d, use_developer_ui);
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/blender/blender_object.cpp b/intern/cycles/blender/blender_object.cpp
index 22d6edeb099..95da4a2df84 100644
--- a/intern/cycles/blender/blender_object.cpp
+++ b/intern/cycles/blender/blender_object.cpp
@@ -568,7 +568,7 @@ void BlenderSync::sync_objects(BL::Depsgraph &b_depsgraph,
/* object loop */
bool cancel = false;
bool use_portal = false;
- const bool show_lights = BlenderViewportParameters(b_v3d).use_scene_lights;
+ const bool show_lights = BlenderViewportParameters(b_v3d, use_developer_ui).use_scene_lights;
BL::ViewLayer b_view_layer = b_depsgraph.view_layer_eval();
BL::Depsgraph::object_instances_iterator b_instance_iter;
diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp
index 6e06b6a468f..694d8454422 100644
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -45,10 +45,6 @@
# include <OSL/oslquery.h>
#endif
-#ifdef WITH_OPENCL
-# include "device/device_intern.h"
-#endif
-
CCL_NAMESPACE_BEGIN
namespace {
@@ -72,12 +68,10 @@ PyObject *pyunicode_from_string(const char *str)
/* Synchronize debug flags from a given Blender scene.
* Return truth when device list needs invalidation.
*/
-bool debug_flags_sync_from_scene(BL::Scene b_scene)
+static void debug_flags_sync_from_scene(BL::Scene b_scene)
{
DebugFlagsRef flags = DebugFlags();
PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
- /* Backup some settings for comparison. */
- DebugFlags::OpenCL::DeviceType opencl_device_type = flags.opencl.device_type;
/* Synchronize shared flags. */
flags.viewport_static_bvh = get_enum(cscene, "debug_bvh_type");
/* Synchronize CPU flags. */
@@ -87,50 +81,19 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene)
flags.cpu.sse3 = get_boolean(cscene, "debug_use_cpu_sse3");
flags.cpu.sse2 = get_boolean(cscene, "debug_use_cpu_sse2");
flags.cpu.bvh_layout = (BVHLayout)get_enum(cscene, "debug_bvh_layout");
- flags.cpu.split_kernel = get_boolean(cscene, "debug_use_cpu_split_kernel");
/* Synchronize CUDA flags. */
flags.cuda.adaptive_compile = get_boolean(cscene, "debug_use_cuda_adaptive_compile");
- flags.cuda.split_kernel = get_boolean(cscene, "debug_use_cuda_split_kernel");
/* Synchronize OptiX flags. */
- flags.optix.cuda_streams = get_int(cscene, "debug_optix_cuda_streams");
- flags.optix.curves_api = get_boolean(cscene, "debug_optix_curves_api");
- /* Synchronize OpenCL device type. */
- switch (get_enum(cscene, "debug_opencl_device_type")) {
- case 0:
- flags.opencl.device_type = DebugFlags::OpenCL::DEVICE_NONE;
- break;
- case 1:
- flags.opencl.device_type = DebugFlags::OpenCL::DEVICE_ALL;
- break;
- case 2:
- flags.opencl.device_type = DebugFlags::OpenCL::DEVICE_DEFAULT;
- break;
- case 3:
- flags.opencl.device_type = DebugFlags::OpenCL::DEVICE_CPU;
- break;
- case 4:
- flags.opencl.device_type = DebugFlags::OpenCL::DEVICE_GPU;
- break;
- case 5:
- flags.opencl.device_type = DebugFlags::OpenCL::DEVICE_ACCELERATOR;
- break;
- }
- /* Synchronize other OpenCL flags. */
- flags.opencl.debug = get_boolean(cscene, "debug_use_opencl_debug");
- flags.opencl.mem_limit = ((size_t)get_int(cscene, "debug_opencl_mem_limit")) * 1024 * 1024;
- return flags.opencl.device_type != opencl_device_type;
+ flags.optix.use_debug = get_boolean(cscene, "debug_use_optix_debug");
}
/* Reset debug flags to default values.
* Return truth when device list needs invalidation.
*/
-bool debug_flags_reset()
+static void debug_flags_reset()
{
DebugFlagsRef flags = DebugFlags();
- /* Backup some settings for comparison. */
- DebugFlags::OpenCL::DeviceType opencl_device_type = flags.opencl.device_type;
flags.reset();
- return flags.opencl.device_type != opencl_device_type;
}
} /* namespace */
@@ -175,18 +138,20 @@ static const char *PyC_UnicodeAsByte(PyObject *py_str, PyObject **coerce)
static PyObject *init_func(PyObject * /*self*/, PyObject *args)
{
- PyObject *path, *user_path;
+ PyObject *path, *user_path, *temp_path;
int headless;
- if (!PyArg_ParseTuple(args, "OOi", &path, &user_path, &headless)) {
- return NULL;
+ if (!PyArg_ParseTuple(args, "OOOi", &path, &user_path, &temp_path, &headless)) {
+ return nullptr;
}
- PyObject *path_coerce = NULL, *user_path_coerce = NULL;
+ PyObject *path_coerce = nullptr, *user_path_coerce = nullptr, *temp_path_coerce = nullptr;
path_init(PyC_UnicodeAsByte(path, &path_coerce),
- PyC_UnicodeAsByte(user_path, &user_path_coerce));
+ PyC_UnicodeAsByte(user_path, &user_path_coerce),
+ PyC_UnicodeAsByte(temp_path, &temp_path_coerce));
Py_XDECREF(path_coerce);
Py_XDECREF(user_path_coerce);
+ Py_XDECREF(temp_path_coerce);
BlenderSession::headless = headless;
@@ -299,6 +264,50 @@ static PyObject *render_func(PyObject * /*self*/, PyObject *args)
Py_RETURN_NONE;
}
+static PyObject *render_frame_finish_func(PyObject * /*self*/, PyObject *args)
+{
+ PyObject *pysession;
+
+ if (!PyArg_ParseTuple(args, "O", &pysession)) {
+ return nullptr;
+ }
+
+ BlenderSession *session = (BlenderSession *)PyLong_AsVoidPtr(pysession);
+
+ /* Allow Blender to execute other Python scripts. */
+ python_thread_state_save(&session->python_thread_state);
+
+ session->render_frame_finish();
+
+ python_thread_state_restore(&session->python_thread_state);
+
+ Py_RETURN_NONE;
+}
+
+static PyObject *draw_func(PyObject * /*self*/, PyObject *args)
+{
+ PyObject *py_session, *py_graph, *py_screen, *py_space_image;
+
+ if (!PyArg_ParseTuple(args, "OOOO", &py_session, &py_graph, &py_screen, &py_space_image)) {
+ return nullptr;
+ }
+
+ BlenderSession *session = (BlenderSession *)PyLong_AsVoidPtr(py_session);
+
+ ID *b_screen = (ID *)PyLong_AsVoidPtr(py_screen);
+
+ PointerRNA b_space_image_ptr;
+ RNA_pointer_create(b_screen,
+ &RNA_SpaceImageEditor,
+ pylong_as_voidptr_typesafe(py_space_image),
+ &b_space_image_ptr);
+ BL::SpaceImageEditor b_space_image(b_space_image_ptr);
+
+ session->draw(b_space_image);
+
+ Py_RETURN_NONE;
+}
+
/* pixel_array and result passed as pointers */
static PyObject *bake_func(PyObject * /*self*/, PyObject *args)
{
@@ -336,7 +345,7 @@ static PyObject *bake_func(PyObject * /*self*/, PyObject *args)
Py_RETURN_NONE;
}
-static PyObject *draw_func(PyObject * /*self*/, PyObject *args)
+static PyObject *view_draw_func(PyObject * /*self*/, PyObject *args)
{
PyObject *pysession, *pygraph, *pyv3d, *pyrv3d;
@@ -350,7 +359,7 @@ static PyObject *draw_func(PyObject * /*self*/, PyObject *args)
int viewport[4];
glGetIntegerv(GL_VIEWPORT, viewport);
- session->draw(viewport[2], viewport[3]);
+ session->view_draw(viewport[2], viewport[3]);
}
Py_RETURN_NONE;
@@ -697,40 +706,6 @@ static PyObject *system_info_func(PyObject * /*self*/, PyObject * /*value*/)
return pyunicode_from_string(system_info.c_str());
}
-#ifdef WITH_OPENCL
-static PyObject *opencl_disable_func(PyObject * /*self*/, PyObject * /*value*/)
-{
- VLOG(2) << "Disabling OpenCL platform.";
- DebugFlags().opencl.device_type = DebugFlags::OpenCL::DEVICE_NONE;
- Py_RETURN_NONE;
-}
-
-static PyObject *opencl_compile_func(PyObject * /*self*/, PyObject *args)
-{
- PyObject *sequence = PySequence_Fast(args, "Arguments must be a sequence");
- if (sequence == NULL) {
- Py_RETURN_FALSE;
- }
-
- vector<string> parameters;
- for (Py_ssize_t i = 0; i < PySequence_Fast_GET_SIZE(sequence); i++) {
- PyObject *item = PySequence_Fast_GET_ITEM(sequence, i);
- PyObject *item_as_string = PyObject_Str(item);
- const char *parameter_string = PyUnicode_AsUTF8(item_as_string);
- parameters.push_back(parameter_string);
- Py_DECREF(item_as_string);
- }
- Py_DECREF(sequence);
-
- if (device_opencl_compile_kernel(parameters)) {
- Py_RETURN_TRUE;
- }
- else {
- Py_RETURN_FALSE;
- }
-}
-#endif
-
static bool image_parse_filepaths(PyObject *pyfilepaths, vector<string> &filepaths)
{
if (PyUnicode_Check(pyfilepaths)) {
@@ -762,6 +737,10 @@ static bool image_parse_filepaths(PyObject *pyfilepaths, vector<string> &filepat
static PyObject *denoise_func(PyObject * /*self*/, PyObject *args, PyObject *keywords)
{
+#if 1
+ (void)args;
+ (void)keywords;
+#else
static const char *keyword_list[] = {
"preferences", "scene", "view_layer", "input", "output", "tile_size", "samples", NULL};
PyObject *pypreferences, *pyscene, *pyviewlayer;
@@ -835,7 +814,7 @@ static PyObject *denoise_func(PyObject * /*self*/, PyObject *args, PyObject *key
}
/* Create denoiser. */
- Denoiser denoiser(device);
+ DenoiserPipeline denoiser(device);
denoiser.params = params;
denoiser.input = input;
denoiser.output = output;
@@ -852,6 +831,7 @@ static PyObject *denoise_func(PyObject * /*self*/, PyObject *args, PyObject *key
PyErr_SetString(PyExc_ValueError, denoiser.error.c_str());
return NULL;
}
+#endif
Py_RETURN_NONE;
}
@@ -903,10 +883,7 @@ static PyObject *debug_flags_update_func(PyObject * /*self*/, PyObject *args)
RNA_id_pointer_create((ID *)PyLong_AsVoidPtr(pyscene), &sceneptr);
BL::Scene b_scene(sceneptr);
- if (debug_flags_sync_from_scene(b_scene)) {
- VLOG(2) << "Tagging device list for update.";
- Device::tag_update();
- }
+ debug_flags_sync_from_scene(b_scene);
VLOG(2) << "Debug flags set to:\n" << DebugFlags();
@@ -917,10 +894,7 @@ static PyObject *debug_flags_update_func(PyObject * /*self*/, PyObject *args)
static PyObject *debug_flags_reset_func(PyObject * /*self*/, PyObject * /*args*/)
{
- if (debug_flags_reset()) {
- VLOG(2) << "Tagging device list for update.";
- Device::tag_update();
- }
+ debug_flags_reset();
if (debug_flags_set) {
VLOG(2) << "Debug flags reset to:\n" << DebugFlags();
debug_flags_set = false;
@@ -928,84 +902,6 @@ static PyObject *debug_flags_reset_func(PyObject * /*self*/, PyObject * /*args*/
Py_RETURN_NONE;
}
-static PyObject *set_resumable_chunk_func(PyObject * /*self*/, PyObject *args)
-{
- int num_resumable_chunks, current_resumable_chunk;
- if (!PyArg_ParseTuple(args, "ii", &num_resumable_chunks, &current_resumable_chunk)) {
- Py_RETURN_NONE;
- }
-
- if (num_resumable_chunks <= 0) {
- fprintf(stderr, "Cycles: Bad value for number of resumable chunks.\n");
- abort();
- Py_RETURN_NONE;
- }
- if (current_resumable_chunk < 1 || current_resumable_chunk > num_resumable_chunks) {
- fprintf(stderr, "Cycles: Bad value for current resumable chunk number.\n");
- abort();
- Py_RETURN_NONE;
- }
-
- VLOG(1) << "Initialized resumable render: "
- << "num_resumable_chunks=" << num_resumable_chunks << ", "
- << "current_resumable_chunk=" << current_resumable_chunk;
- BlenderSession::num_resumable_chunks = num_resumable_chunks;
- BlenderSession::current_resumable_chunk = current_resumable_chunk;
-
- printf("Cycles: Will render chunk %d of %d\n", current_resumable_chunk, num_resumable_chunks);
-
- Py_RETURN_NONE;
-}
-
-static PyObject *set_resumable_chunk_range_func(PyObject * /*self*/, PyObject *args)
-{
- int num_chunks, start_chunk, end_chunk;
- if (!PyArg_ParseTuple(args, "iii", &num_chunks, &start_chunk, &end_chunk)) {
- Py_RETURN_NONE;
- }
-
- if (num_chunks <= 0) {
- fprintf(stderr, "Cycles: Bad value for number of resumable chunks.\n");
- abort();
- Py_RETURN_NONE;
- }
- if (start_chunk < 1 || start_chunk > num_chunks) {
- fprintf(stderr, "Cycles: Bad value for start chunk number.\n");
- abort();
- Py_RETURN_NONE;
- }
- if (end_chunk < 1 || end_chunk > num_chunks) {
- fprintf(stderr, "Cycles: Bad value for start chunk number.\n");
- abort();
- Py_RETURN_NONE;
- }
- if (start_chunk > end_chunk) {
- fprintf(stderr, "Cycles: End chunk should be higher than start one.\n");
- abort();
- Py_RETURN_NONE;
- }
-
- VLOG(1) << "Initialized resumable render: "
- << "num_resumable_chunks=" << num_chunks << ", "
- << "start_resumable_chunk=" << start_chunk << "end_resumable_chunk=" << end_chunk;
- BlenderSession::num_resumable_chunks = num_chunks;
- BlenderSession::start_resumable_chunk = start_chunk;
- BlenderSession::end_resumable_chunk = end_chunk;
-
- printf("Cycles: Will render chunks %d to %d of %d\n", start_chunk, end_chunk, num_chunks);
-
- Py_RETURN_NONE;
-}
-
-static PyObject *clear_resumable_chunk_func(PyObject * /*self*/, PyObject * /*value*/)
-{
- VLOG(1) << "Clear resumable render";
- BlenderSession::num_resumable_chunks = 0;
- BlenderSession::current_resumable_chunk = 0;
-
- Py_RETURN_NONE;
-}
-
static PyObject *enable_print_stats_func(PyObject * /*self*/, PyObject * /*args*/)
{
BlenderSession::print_render_stats = true;
@@ -1015,16 +911,14 @@ static PyObject *enable_print_stats_func(PyObject * /*self*/, PyObject * /*args*
static PyObject *get_device_types_func(PyObject * /*self*/, PyObject * /*args*/)
{
vector<DeviceType> device_types = Device::available_types();
- bool has_cuda = false, has_optix = false, has_opencl = false;
+ bool has_cuda = false, has_optix = false;
foreach (DeviceType device_type, device_types) {
has_cuda |= (device_type == DEVICE_CUDA);
has_optix |= (device_type == DEVICE_OPTIX);
- has_opencl |= (device_type == DEVICE_OPENCL);
}
- PyObject *list = PyTuple_New(3);
+ PyObject *list = PyTuple_New(2);
PyTuple_SET_ITEM(list, 0, PyBool_FromLong(has_cuda));
PyTuple_SET_ITEM(list, 1, PyBool_FromLong(has_optix));
- PyTuple_SET_ITEM(list, 2, PyBool_FromLong(has_opencl));
return list;
}
@@ -1044,9 +938,6 @@ static PyObject *set_device_override_func(PyObject * /*self*/, PyObject *arg)
if (override == "CPU") {
BlenderSession::device_override = DEVICE_MASK_CPU;
}
- else if (override == "OPENCL") {
- BlenderSession::device_override = DEVICE_MASK_OPENCL;
- }
else if (override == "CUDA") {
BlenderSession::device_override = DEVICE_MASK_CUDA;
}
@@ -1072,8 +963,10 @@ static PyMethodDef methods[] = {
{"create", create_func, METH_VARARGS, ""},
{"free", free_func, METH_O, ""},
{"render", render_func, METH_VARARGS, ""},
- {"bake", bake_func, METH_VARARGS, ""},
+ {"render_frame_finish", render_frame_finish_func, METH_VARARGS, ""},
{"draw", draw_func, METH_VARARGS, ""},
+ {"bake", bake_func, METH_VARARGS, ""},
+ {"view_draw", view_draw_func, METH_VARARGS, ""},
{"sync", sync_func, METH_VARARGS, ""},
{"reset", reset_func, METH_VARARGS, ""},
#ifdef WITH_OSL
@@ -1082,10 +975,6 @@ static PyMethodDef methods[] = {
#endif
{"available_devices", available_devices_func, METH_VARARGS, ""},
{"system_info", system_info_func, METH_NOARGS, ""},
-#ifdef WITH_OPENCL
- {"opencl_disable", opencl_disable_func, METH_NOARGS, ""},
- {"opencl_compile", opencl_compile_func, METH_VARARGS, ""},
-#endif
/* Standalone denoising */
{"denoise", (PyCFunction)denoise_func, METH_VARARGS | METH_KEYWORDS, ""},
@@ -1098,11 +987,6 @@ static PyMethodDef methods[] = {
/* Statistics. */
{"enable_print_stats", enable_print_stats_func, METH_NOARGS, ""},
- /* Resumable render */
- {"set_resumable_chunk", set_resumable_chunk_func, METH_VARARGS, ""},
- {"set_resumable_chunk_range", set_resumable_chunk_range_func, METH_VARARGS, ""},
- {"clear_resumable_chunk", clear_resumable_chunk_func, METH_NOARGS, ""},
-
/* Compute Device selection */
{"get_device_types", get_device_types_func, METH_VARARGS, ""},
{"set_device_override", set_device_override_func, METH_O, ""},
@@ -1153,14 +1037,6 @@ void *CCL_python_module_init()
PyModule_AddStringConstant(mod, "osl_version_string", "unknown");
#endif
-#ifdef WITH_NETWORK
- PyModule_AddObject(mod, "with_network", Py_True);
- Py_INCREF(Py_True);
-#else /* WITH_NETWORK */
- PyModule_AddObject(mod, "with_network", Py_False);
- Py_INCREF(Py_False);
-#endif /* WITH_NETWORK */
-
#ifdef WITH_EMBREE
PyModule_AddObject(mod, "with_embree", Py_True);
Py_INCREF(Py_True);
diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp
index 29de886e4ff..d65d89a7ddd 100644
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -38,9 +38,11 @@
#include "util/util_hash.h"
#include "util/util_logging.h"
#include "util/util_murmurhash.h"
+#include "util/util_path.h"
#include "util/util_progress.h"
#include "util/util_time.h"
+#include "blender/blender_gpu_display.h"
#include "blender/blender_session.h"
#include "blender/blender_sync.h"
#include "blender/blender_util.h"
@@ -49,10 +51,6 @@ CCL_NAMESPACE_BEGIN
DeviceTypeMask BlenderSession::device_override = DEVICE_MASK_ALL;
bool BlenderSession::headless = false;
-int BlenderSession::num_resumable_chunks = 0;
-int BlenderSession::current_resumable_chunk = 0;
-int BlenderSession::start_resumable_chunk = 0;
-int BlenderSession::end_resumable_chunk = 0;
bool BlenderSession::print_render_stats = false;
BlenderSession::BlenderSession(BL::RenderEngine &b_engine,
@@ -103,7 +101,9 @@ BlenderSession::BlenderSession(BL::RenderEngine &b_engine,
width(width),
height(height),
preview_osl(false),
- python_thread_state(NULL)
+ python_thread_state(NULL),
+ use_developer_ui(b_userpref.experimental().use_cycles_debug() &&
+ b_userpref.view().show_developer_ui())
{
/* 3d view render */
background = false;
@@ -119,10 +119,10 @@ BlenderSession::~BlenderSession()
void BlenderSession::create_session()
{
- SessionParams session_params = BlenderSync::get_session_params(
+ const SessionParams session_params = BlenderSync::get_session_params(
b_engine, b_userpref, b_scene, background);
- SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background);
- bool session_pause = BlenderSync::get_session_pause(b_scene, background);
+ const SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background);
+ const bool session_pause = BlenderSync::get_session_pause(b_scene, background);
/* reset status/progress */
last_status = "";
@@ -131,20 +131,18 @@ void BlenderSession::create_session()
start_resize_time = 0.0;
/* create session */
- session = new Session(session_params);
- session->scene = scene;
+ session = new Session(session_params, scene_params);
session->progress.set_update_callback(function_bind(&BlenderSession::tag_redraw, this));
session->progress.set_cancel_callback(function_bind(&BlenderSession::test_cancel, this));
session->set_pause(session_pause);
/* create scene */
- scene = new Scene(scene_params, session->device);
+ scene = session->scene;
scene->name = b_scene.name();
- session->scene = scene;
-
/* create sync */
- sync = new BlenderSync(b_engine, b_data, b_scene, scene, !background, session->progress);
+ sync = new BlenderSync(
+ b_engine, b_data, b_scene, scene, !background, use_developer_ui, session->progress);
BL::Object b_camera_override(b_engine.camera_override());
if (b_v3d) {
sync->sync_view(b_v3d, b_rv3d, width, height);
@@ -154,13 +152,25 @@ void BlenderSession::create_session()
}
/* set buffer parameters */
- BufferParams buffer_params = BlenderSync::get_buffer_params(
- b_v3d, b_rv3d, scene->camera, width, height, session_params.denoising.use);
- session->reset(buffer_params, session_params.samples);
-
- b_engine.use_highlight_tiles(session_params.progressive_refine == false);
+ const BufferParams buffer_params = BlenderSync::get_buffer_params(
+ b_v3d, b_rv3d, scene->camera, width, height);
+ session->reset(session_params, buffer_params);
+
+ /* Create GPU display. */
+ if (!b_engine.is_preview() && !headless) {
+ unique_ptr<BlenderGPUDisplay> gpu_display = make_unique<BlenderGPUDisplay>(b_engine, b_scene);
+ gpu_display_ = gpu_display.get();
+ session->set_gpu_display(move(gpu_display));
+ }
- update_resumable_tile_manager(session_params.samples);
+ /* Viewport and preview (as in, material preview) rendering is not tiled, so the engine can be
+ * informed that no tracking of the tile state is needed.
+ * Offline rendering makes this decision when a tile is being written. The penalty of asking
+ * the engine to keep track of the tile state is minimal, so a possible single-tiled final
+ * render is nothing to worry about here. */
+ if (!b_engine.is_preview() && !b_v3d) {
+ b_engine.use_highlight_tiles(true);
+ }
}
void BlenderSession::reset_session(BL::BlendData &b_data, BL::Depsgraph &b_depsgraph)
@@ -202,9 +212,9 @@ void BlenderSession::reset_session(BL::BlendData &b_data, BL::Depsgraph &b_depsg
return;
}
- SessionParams session_params = BlenderSync::get_session_params(
+ const SessionParams session_params = BlenderSync::get_session_params(
b_engine, b_userpref, b_scene, background);
- SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background);
+ const SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background);
if (scene->params.modified(scene_params) || session->params.modified(session_params) ||
!this->b_render.use_persistent_data()) {
@@ -220,8 +230,6 @@ void BlenderSession::reset_session(BL::BlendData &b_data, BL::Depsgraph &b_depsg
session->progress.reset();
- session->tile_manager.set_tile_order(session_params.tile_order);
-
/* peak memory usage should show current render peak, not peak for all renders
* made by this render session
*/
@@ -230,7 +238,8 @@ void BlenderSession::reset_session(BL::BlendData &b_data, BL::Depsgraph &b_depsg
if (is_new_session) {
/* Sync object should be re-created for new scene. */
delete sync;
- sync = new BlenderSync(b_engine, b_data, b_scene, scene, !background, session->progress);
+ sync = new BlenderSync(
+ b_engine, b_data, b_scene, scene, !background, use_developer_ui, session->progress);
}
else {
/* Sync recalculations to do just the required updates. */
@@ -242,103 +251,85 @@ void BlenderSession::reset_session(BL::BlendData &b_data, BL::Depsgraph &b_depsg
BL::SpaceView3D b_null_space_view3d(PointerRNA_NULL);
BL::RegionView3D b_null_region_view3d(PointerRNA_NULL);
- BufferParams buffer_params = BlenderSync::get_buffer_params(b_null_space_view3d,
- b_null_region_view3d,
- scene->camera,
- width,
- height,
- session_params.denoising.use);
- session->reset(buffer_params, session_params.samples);
-
- b_engine.use_highlight_tiles(session_params.progressive_refine == false);
+ const BufferParams buffer_params = BlenderSync::get_buffer_params(
+ b_null_space_view3d, b_null_region_view3d, scene->camera, width, height);
+ session->reset(session_params, buffer_params);
/* reset time */
start_resize_time = 0.0;
+
+ {
+ thread_scoped_lock lock(draw_state_.mutex);
+ draw_state_.last_pass_index = -1;
+ }
}
void BlenderSession::free_session()
{
- session->cancel();
+ if (session) {
+ session->cancel(true);
+ }
delete sync;
+ sync = nullptr;
+
delete session;
+ session = nullptr;
}
-static ShaderEvalType get_shader_type(const string &pass_type)
+void BlenderSession::read_render_tile()
{
- const char *shader_type = pass_type.c_str();
+ const int2 tile_offset = session->get_render_tile_offset();
+ const int2 tile_size = session->get_render_tile_size();
- /* data passes */
- if (strcmp(shader_type, "NORMAL") == 0)
- return SHADER_EVAL_NORMAL;
- else if (strcmp(shader_type, "UV") == 0)
- return SHADER_EVAL_UV;
- else if (strcmp(shader_type, "ROUGHNESS") == 0)
- return SHADER_EVAL_ROUGHNESS;
- else if (strcmp(shader_type, "DIFFUSE_COLOR") == 0)
- return SHADER_EVAL_DIFFUSE_COLOR;
- else if (strcmp(shader_type, "GLOSSY_COLOR") == 0)
- return SHADER_EVAL_GLOSSY_COLOR;
- else if (strcmp(shader_type, "TRANSMISSION_COLOR") == 0)
- return SHADER_EVAL_TRANSMISSION_COLOR;
- else if (strcmp(shader_type, "EMIT") == 0)
- return SHADER_EVAL_EMISSION;
+ /* get render result */
+ BL::RenderResult b_rr = b_engine.begin_result(tile_offset.x,
+ tile_offset.y,
+ tile_size.x,
+ tile_size.y,
+ b_rlay_name.c_str(),
+ b_rview_name.c_str());
- /* light passes */
- else if (strcmp(shader_type, "AO") == 0)
- return SHADER_EVAL_AO;
- else if (strcmp(shader_type, "COMBINED") == 0)
- return SHADER_EVAL_COMBINED;
- else if (strcmp(shader_type, "SHADOW") == 0)
- return SHADER_EVAL_SHADOW;
- else if (strcmp(shader_type, "DIFFUSE") == 0)
- return SHADER_EVAL_DIFFUSE;
- else if (strcmp(shader_type, "GLOSSY") == 0)
- return SHADER_EVAL_GLOSSY;
- else if (strcmp(shader_type, "TRANSMISSION") == 0)
- return SHADER_EVAL_TRANSMISSION;
+ /* can happen if the intersected rectangle gives 0 width or height */
+ if (b_rr.ptr.data == NULL) {
+ return;
+ }
- /* extra */
- else if (strcmp(shader_type, "ENVIRONMENT") == 0)
- return SHADER_EVAL_ENVIRONMENT;
+ BL::RenderResult::layers_iterator b_single_rlay;
+ b_rr.layers.begin(b_single_rlay);
- else
- return SHADER_EVAL_BAKE;
-}
+ /* layer will be missing if it was disabled in the UI */
+ if (b_single_rlay == b_rr.layers.end())
+ return;
-static BL::RenderResult begin_render_result(BL::RenderEngine &b_engine,
- int x,
- int y,
- int w,
- int h,
- const char *layername,
- const char *viewname)
-{
- return b_engine.begin_result(x, y, w, h, layername, viewname);
-}
+ BL::RenderLayer b_rlay = *b_single_rlay;
-static void end_render_result(BL::RenderEngine &b_engine,
- BL::RenderResult &b_rr,
- bool cancel,
- bool highlight,
- bool do_merge_results)
-{
- b_engine.end_result(b_rr, (int)cancel, (int)highlight, (int)do_merge_results);
+ vector<float> pixels(tile_size.x * tile_size.y * 4);
+
+ /* Copy each pass.
+ * TODO: Copy only the required ones for better performance? */
+ for (BL::RenderPass &b_pass : b_rlay.passes) {
+ session->set_render_tile_pixels(b_pass.name(), b_pass.channels(), (float *)b_pass.rect());
+ }
}
-void BlenderSession::do_write_update_render_tile(RenderTile &rtile,
- bool do_update_only,
- bool do_read_only,
- bool highlight)
+void BlenderSession::write_render_tile()
{
- int x = rtile.x - session->tile_manager.params.full_x;
- int y = rtile.y - session->tile_manager.params.full_y;
- int w = rtile.w;
- int h = rtile.h;
+ const int2 tile_offset = session->get_render_tile_offset();
+ const int2 tile_size = session->get_render_tile_size();
+
+ const string_view render_layer_name = session->get_render_tile_layer();
+ const string_view render_view_name = session->get_render_tile_view();
+
+ b_engine.tile_highlight_clear_all();
/* get render result */
- BL::RenderResult b_rr = begin_render_result(
- b_engine, x, y, w, h, b_rlay_name.c_str(), b_rview_name.c_str());
+ BL::RenderResult b_rr = b_engine.begin_result(tile_offset.x,
+ tile_offset.y,
+ tile_size.x,
+ tile_size.y,
+ render_layer_name.c_str(),
+ render_view_name.c_str());
/* can happen if the intersected rectangle gives 0 width or height */
if (b_rr.ptr.data == NULL) {
@@ -349,64 +340,34 @@ void BlenderSession::do_write_update_render_tile(RenderTile &rtile,
b_rr.layers.begin(b_single_rlay);
/* layer will be missing if it was disabled in the UI */
- if (b_single_rlay == b_rr.layers.end())
+ if (b_single_rlay == b_rr.layers.end()) {
return;
+ }
BL::RenderLayer b_rlay = *b_single_rlay;
- if (do_read_only) {
- /* copy each pass */
- for (BL::RenderPass &b_pass : b_rlay.passes) {
- /* find matching pass type */
- PassType pass_type = BlenderSync::get_pass_type(b_pass);
- int components = b_pass.channels();
-
- rtile.buffers->set_pass_rect(
- pass_type, components, (float *)b_pass.rect(), rtile.num_samples);
- }
-
- end_render_result(b_engine, b_rr, false, false, false);
- }
- else if (do_update_only) {
- /* Sample would be zero at initial tile update, which is only needed
- * to tag tile form blender side as IN PROGRESS for proper highlight
- * no buffers should be sent to blender yet. For denoise we also
- * keep showing the noisy buffers until denoise is done. */
- bool merge = (rtile.sample != 0) && (rtile.task != RenderTile::DENOISE);
-
- if (merge) {
- update_render_result(b_rlay, rtile);
- }
+ write_render_result(b_rlay);
- end_render_result(b_engine, b_rr, true, highlight, merge);
- }
- else {
- /* Write final render result. */
- write_render_result(b_rlay, rtile);
- end_render_result(b_engine, b_rr, false, false, true);
- }
+ b_engine.end_result(b_rr, true, false, true);
}
-void BlenderSession::read_render_tile(RenderTile &rtile)
+void BlenderSession::update_render_tile()
{
- do_write_update_render_tile(rtile, false, true, false);
-}
+ if (!session->has_multiple_render_tiles()) {
+ /* Don't highlight full-frame tile. */
+ return;
+ }
-void BlenderSession::write_render_tile(RenderTile &rtile)
-{
- do_write_update_render_tile(rtile, false, false, false);
+ const int2 tile_offset = session->get_render_tile_offset();
+ const int2 tile_size = session->get_render_tile_size();
+
+ b_engine.tile_highlight_clear_all();
+ b_engine.tile_highlight_set(tile_offset.x, tile_offset.y, tile_size.x, tile_size.y, true);
}
-void BlenderSession::update_render_tile(RenderTile &rtile, bool highlight)
+void BlenderSession::full_buffer_written(string_view filename)
{
- /* use final write for preview renders, otherwise render result wouldn't be
- * be updated in blender side
- * would need to be investigated a bit further, but for now shall be fine
- */
- if (!b_engine.is_preview())
- do_write_update_render_tile(rtile, true, false, highlight);
- else
- do_write_update_render_tile(rtile, false, false, false);
+ full_buffer_files_.emplace_back(filename);
}
static void add_cryptomatte_layer(BL::RenderResult &b_rr, string name, string manifest)
@@ -430,12 +391,15 @@ void BlenderSession::stamp_view_layer_metadata(Scene *scene, const string &view_
to_string(session->params.samples).c_str());
/* Store ranged samples information. */
+ /* TODO(sergey): Need to bring this information back. */
+#if 0
if (session->tile_manager.range_num_samples != -1) {
b_rr.stamp_data_add_field((prefix + "range_start_sample").c_str(),
to_string(session->tile_manager.range_start_sample).c_str());
b_rr.stamp_data_add_field((prefix + "range_num_samples").c_str(),
to_string(session->tile_manager.range_num_samples).c_str());
}
+#endif
/* Write cryptomatte metadata. */
if (scene->film->get_cryptomatte_passes() & CRYPT_OBJECT) {
@@ -475,38 +439,44 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_)
}
/* set callback to write out render results */
- session->write_render_tile_cb = function_bind(&BlenderSession::write_render_tile, this, _1);
- session->update_render_tile_cb = function_bind(
- &BlenderSession::update_render_tile, this, _1, _2);
+ session->write_render_tile_cb = [&]() { write_render_tile(); };
+
+ /* Use final write for preview renders, otherwise the render result wouldn't be updated on the
+ * Blender side. */
+ /* TODO(sergey): Investigate whether GPUDisplay can be used for the preview as well. */
+ if (b_engine.is_preview()) {
+ session->update_render_tile_cb = [&]() { write_render_tile(); };
+ }
+ else {
+ session->update_render_tile_cb = [&]() { update_render_tile(); };
+ }
+
+ session->full_buffer_written_cb = [&](string_view filename) { full_buffer_written(filename); };
BL::ViewLayer b_view_layer = b_depsgraph.view_layer_eval();
/* get buffer parameters */
- SessionParams session_params = BlenderSync::get_session_params(
- b_engine, b_userpref, b_scene, background, b_view_layer);
+ const SessionParams session_params = BlenderSync::get_session_params(
+ b_engine, b_userpref, b_scene, background);
BufferParams buffer_params = BlenderSync::get_buffer_params(
- b_v3d, b_rv3d, scene->camera, width, height, session_params.denoising.use);
+ b_v3d, b_rv3d, scene->camera, width, height);
/* temporary render result to find needed passes and views */
- BL::RenderResult b_rr = begin_render_result(
- b_engine, 0, 0, 1, 1, b_view_layer.name().c_str(), NULL);
+ BL::RenderResult b_rr = b_engine.begin_result(0, 0, 1, 1, b_view_layer.name().c_str(), NULL);
BL::RenderResult::layers_iterator b_single_rlay;
b_rr.layers.begin(b_single_rlay);
BL::RenderLayer b_rlay = *b_single_rlay;
- b_rlay_name = b_view_layer.name();
- /* Update denoising parameters. */
- session->set_denoising(session_params.denoising);
+ {
+ thread_scoped_lock lock(draw_state_.mutex);
+ b_rlay_name = b_view_layer.name();
- /* Compute render passes and film settings. */
- vector<Pass> passes = sync->sync_render_passes(
- b_scene, b_rlay, b_view_layer, session_params.adaptive_sampling, session_params.denoising);
+ /* Signal that the display pass is to be updated. */
+ draw_state_.last_pass_index = -1;
+ }
- /* Set buffer params, using film settings from sync_render_passes. */
- buffer_params.passes = passes;
- buffer_params.denoising_data_pass = scene->film->get_denoising_data_pass();
- buffer_params.denoising_clean_pass = scene->film->get_denoising_clean_pass();
- buffer_params.denoising_prefiltered_pass = scene->film->get_denoising_prefiltered_pass();
+ /* Compute render passes and film settings. */
+ sync->sync_render_passes(b_rlay, b_view_layer);
BL::RenderResult::views_iterator b_view_iter;
@@ -520,6 +490,9 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_)
++b_view_iter, ++view_index) {
b_rview_name = b_view_iter->name();
+ buffer_params.layer = b_view_layer.name();
+ buffer_params.view = b_rview_name;
+
/* set the current view */
b_engine.active_view_set(b_rview_name.c_str());
@@ -549,20 +522,16 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_)
}
/* Update number of samples per layer. */
- int samples = sync->get_layer_samples();
- bool bound_samples = sync->get_layer_bound_samples();
- int effective_layer_samples;
+ const int samples = sync->get_layer_samples();
+ const bool bound_samples = sync->get_layer_bound_samples();
- if (samples != 0 && (!bound_samples || (samples < session_params.samples)))
- effective_layer_samples = samples;
- else
- effective_layer_samples = session_params.samples;
-
- /* Update tile manager if we're doing resumable render. */
- update_resumable_tile_manager(effective_layer_samples);
+ SessionParams effective_session_params = session_params;
+ if (samples != 0 && (!bound_samples || (samples < session_params.samples))) {
+ effective_session_params.samples = samples;
+ }
/* Update session itself. */
- session->reset(buffer_params, effective_layer_samples);
+ session->reset(effective_session_params, buffer_params);
/* render */
if (!b_engine.is_preview() && background && print_render_stats) {
@@ -586,65 +555,146 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_)
stamp_view_layer_metadata(scene, b_rlay_name);
/* free result without merging */
- end_render_result(b_engine, b_rr, true, true, false);
+ b_engine.end_result(b_rr, true, false, false);
double total_time, render_time;
session->progress.get_time(total_time, render_time);
VLOG(1) << "Total render time: " << total_time;
VLOG(1) << "Render time (without synchronization): " << render_time;
+}
+
+void BlenderSession::render_frame_finish()
+{
+ /* Processing of all layers and views is done. Clear the strings so that we can communicate
+ * progress about reading files and denoising them. */
+ b_rlay_name = "";
+ b_rview_name = "";
+
+ if (!b_render.use_persistent_data()) {
+ /* Free the sync object so that it can properly dereference nodes from the scene graph before
+ * the graph is freed. */
+ delete sync;
+ sync = nullptr;
+
+ session->device_free();
+ }
+
+ for (string_view filename : full_buffer_files_) {
+ session->process_full_buffer_from_disk(filename);
+ path_remove(filename);
+ }
/* clear callback */
session->write_render_tile_cb = function_null;
session->update_render_tile_cb = function_null;
+ session->full_buffer_written_cb = function_null;
}
-static int bake_pass_filter_get(const int pass_filter)
+static PassType bake_type_to_pass(const string &bake_type_str, const int bake_filter)
{
- int flag = BAKE_FILTER_NONE;
-
- if ((pass_filter & BL::BakeSettings::pass_filter_DIRECT) != 0)
- flag |= BAKE_FILTER_DIRECT;
- if ((pass_filter & BL::BakeSettings::pass_filter_INDIRECT) != 0)
- flag |= BAKE_FILTER_INDIRECT;
- if ((pass_filter & BL::BakeSettings::pass_filter_COLOR) != 0)
- flag |= BAKE_FILTER_COLOR;
-
- if ((pass_filter & BL::BakeSettings::pass_filter_DIFFUSE) != 0)
- flag |= BAKE_FILTER_DIFFUSE;
- if ((pass_filter & BL::BakeSettings::pass_filter_GLOSSY) != 0)
- flag |= BAKE_FILTER_GLOSSY;
- if ((pass_filter & BL::BakeSettings::pass_filter_TRANSMISSION) != 0)
- flag |= BAKE_FILTER_TRANSMISSION;
-
- if ((pass_filter & BL::BakeSettings::pass_filter_EMIT) != 0)
- flag |= BAKE_FILTER_EMISSION;
- if ((pass_filter & BL::BakeSettings::pass_filter_AO) != 0)
- flag |= BAKE_FILTER_AO;
-
- return flag;
+ const char *bake_type = bake_type_str.c_str();
+
+ /* data passes */
+ if (strcmp(bake_type, "POSITION") == 0) {
+ return PASS_POSITION;
+ }
+ else if (strcmp(bake_type, "NORMAL") == 0) {
+ return PASS_NORMAL;
+ }
+ else if (strcmp(bake_type, "UV") == 0) {
+ return PASS_UV;
+ }
+ else if (strcmp(bake_type, "ROUGHNESS") == 0) {
+ return PASS_ROUGHNESS;
+ }
+ else if (strcmp(bake_type, "EMIT") == 0) {
+ return PASS_EMISSION;
+ }
+ /* light passes */
+ else if (strcmp(bake_type, "AO") == 0) {
+ return PASS_AO;
+ }
+ else if (strcmp(bake_type, "COMBINED") == 0) {
+ return PASS_COMBINED;
+ }
+ else if (strcmp(bake_type, "SHADOW") == 0) {
+ return PASS_SHADOW;
+ }
+ else if (strcmp(bake_type, "DIFFUSE") == 0) {
+ if ((bake_filter & BL::BakeSettings::pass_filter_DIRECT) &&
+ bake_filter & BL::BakeSettings::pass_filter_INDIRECT) {
+ return PASS_DIFFUSE;
+ }
+ else if (bake_filter & BL::BakeSettings::pass_filter_DIRECT) {
+ return PASS_DIFFUSE_DIRECT;
+ }
+ else if (bake_filter & BL::BakeSettings::pass_filter_INDIRECT) {
+ return PASS_DIFFUSE_INDIRECT;
+ }
+ else {
+ return PASS_DIFFUSE_COLOR;
+ }
+ }
+ else if (strcmp(bake_type, "GLOSSY") == 0) {
+ if ((bake_filter & BL::BakeSettings::pass_filter_DIRECT) &&
+ bake_filter & BL::BakeSettings::pass_filter_INDIRECT) {
+ return PASS_GLOSSY;
+ }
+ else if (bake_filter & BL::BakeSettings::pass_filter_DIRECT) {
+ return PASS_GLOSSY_DIRECT;
+ }
+ else if (bake_filter & BL::BakeSettings::pass_filter_INDIRECT) {
+ return PASS_GLOSSY_INDIRECT;
+ }
+ else {
+ return PASS_GLOSSY_COLOR;
+ }
+ }
+ else if (strcmp(bake_type, "TRANSMISSION") == 0) {
+ if ((bake_filter & BL::BakeSettings::pass_filter_DIRECT) &&
+ bake_filter & BL::BakeSettings::pass_filter_INDIRECT) {
+ return PASS_TRANSMISSION;
+ }
+ else if (bake_filter & BL::BakeSettings::pass_filter_DIRECT) {
+ return PASS_TRANSMISSION_DIRECT;
+ }
+ else if (bake_filter & BL::BakeSettings::pass_filter_INDIRECT) {
+ return PASS_TRANSMISSION_INDIRECT;
+ }
+ else {
+ return PASS_TRANSMISSION_COLOR;
+ }
+ }
+ /* extra */
+ else if (strcmp(bake_type, "ENVIRONMENT") == 0) {
+ return PASS_BACKGROUND;
+ }
+
+ return PASS_COMBINED;
}
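/* Editor's sketch (not part of this commit): how the mapping above resolves for a
 * filtered light pass, assuming the filter flags from BL::BakeSettings. The asserts
 * are illustrative only (they assume <cassert> is available). */
static void bake_type_to_pass_example()
{
  const int direct = BL::BakeSettings::pass_filter_DIRECT;
  const int indirect = BL::BakeSettings::pass_filter_INDIRECT;

  assert(bake_type_to_pass("DIFFUSE", direct | indirect) == PASS_DIFFUSE);
  assert(bake_type_to_pass("DIFFUSE", direct) == PASS_DIFFUSE_DIRECT);
  assert(bake_type_to_pass("DIFFUSE", 0) == PASS_DIFFUSE_COLOR);
  assert(bake_type_to_pass("UNKNOWN", 0) == PASS_COMBINED); /* Fallback. */
}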
void BlenderSession::bake(BL::Depsgraph &b_depsgraph_,
BL::Object &b_object,
- const string &pass_type,
- const int pass_filter,
+ const string &bake_type,
+ const int bake_filter,
const int bake_width,
const int bake_height)
{
b_depsgraph = b_depsgraph_;
- ShaderEvalType shader_type = get_shader_type(pass_type);
- int bake_pass_filter = bake_pass_filter_get(pass_filter);
-
/* Initialize bake manager, before we load the baking kernels. */
- scene->bake_manager->set(scene, b_object.name(), shader_type, bake_pass_filter);
+ scene->bake_manager->set(scene, b_object.name());
- /* Passes are identified by name, so in order to return the combined pass we need to set the
- * name. */
- Pass::add(PASS_COMBINED, scene->passes, "Combined");
+  /* Add the render pass that we want to bake, and name it Combined so that it is
+   * treated as the combined pass on the Blender side. */
+ Pass *pass = scene->create_node<Pass>();
+ pass->set_name(ustring("Combined"));
+ pass->set_type(bake_type_to_pass(bake_type, bake_filter));
+ pass->set_include_albedo((bake_filter & BL::BakeSettings::pass_filter_COLOR));
- session->read_bake_tile_cb = function_bind(&BlenderSession::read_render_tile, this, _1);
- session->write_render_tile_cb = function_bind(&BlenderSession::write_render_tile, this, _1);
+ session->read_render_tile_cb = [&]() { read_render_tile(); };
+ session->write_render_tile_cb = [&]() { write_render_tile(); };
+ session->set_gpu_display(nullptr);
if (!session->progress.get_cancel()) {
/* Sync scene. */
@@ -667,18 +717,15 @@ void BlenderSession::bake(BL::Depsgraph &b_depsgraph_,
if (object_found && !session->progress.get_cancel()) {
/* Get session and buffer parameters. */
- SessionParams session_params = BlenderSync::get_session_params(
+ const SessionParams session_params = BlenderSync::get_session_params(
b_engine, b_userpref, b_scene, background);
- session_params.progressive_refine = false;
BufferParams buffer_params;
buffer_params.width = bake_width;
buffer_params.height = bake_height;
- buffer_params.passes = scene->passes;
/* Update session. */
- session->tile_manager.set_samples(session_params.samples);
- session->reset(buffer_params, session_params.samples);
+ session->reset(session_params, buffer_params);
session->progress.set_update_callback(
function_bind(&BlenderSession::update_bake_progress, this));
@@ -690,71 +737,43 @@ void BlenderSession::bake(BL::Depsgraph &b_depsgraph_,
session->wait();
}
- session->read_bake_tile_cb = function_null;
+ session->read_render_tile_cb = function_null;
session->write_render_tile_cb = function_null;
}
-void BlenderSession::do_write_update_render_result(BL::RenderLayer &b_rlay,
- RenderTile &rtile,
- bool do_update_only)
+void BlenderSession::write_render_result(BL::RenderLayer &b_rlay)
{
- RenderBuffers *buffers = rtile.buffers;
-
- /* copy data from device */
- if (!buffers->copy_from_device())
+ if (!session->copy_render_tile_from_device()) {
return;
-
- float exposure = scene->film->get_exposure();
-
- vector<float> pixels(rtile.w * rtile.h * 4);
-
- /* Adjust absolute sample number to the range. */
- int sample = rtile.sample;
- const int range_start_sample = session->tile_manager.range_start_sample;
- if (range_start_sample != -1) {
- sample -= range_start_sample;
}
- if (!do_update_only) {
- /* copy each pass */
- for (BL::RenderPass &b_pass : b_rlay.passes) {
- int components = b_pass.channels();
-
- /* Copy pixels from regular render passes. */
- bool read = buffers->get_pass_rect(b_pass.name(), exposure, sample, components, &pixels[0]);
+ const int2 tile_size = session->get_render_tile_size();
+ vector<float> pixels(tile_size.x * tile_size.y * 4);
- /* If denoising pass, */
- if (!read) {
- int denoising_offset = BlenderSync::get_denoising_pass(b_pass);
- if (denoising_offset >= 0) {
- read = buffers->get_denoising_pass_rect(
- denoising_offset, exposure, sample, components, &pixels[0]);
- }
- }
-
- if (!read) {
- memset(&pixels[0], 0, pixels.size() * sizeof(float));
- }
-
- b_pass.rect(&pixels[0]);
+ /* Copy each pass. */
+ for (BL::RenderPass &b_pass : b_rlay.passes) {
+ if (!session->get_render_tile_pixels(b_pass.name(), b_pass.channels(), &pixels[0])) {
+ memset(&pixels[0], 0, pixels.size() * sizeof(float));
}
- }
- else {
- /* copy combined pass */
- BL::RenderPass b_combined_pass(b_rlay.passes.find_by_name("Combined", b_rview_name.c_str()));
- if (buffers->get_pass_rect("Combined", exposure, sample, 4, &pixels[0]))
- b_combined_pass.rect(&pixels[0]);
+
+ b_pass.rect(&pixels[0]);
}
}
-void BlenderSession::write_render_result(BL::RenderLayer &b_rlay, RenderTile &rtile)
+void BlenderSession::update_render_result(BL::RenderLayer &b_rlay)
{
- do_write_update_render_result(b_rlay, rtile, false);
-}
+ if (!session->copy_render_tile_from_device()) {
+ return;
+ }
-void BlenderSession::update_render_result(BL::RenderLayer &b_rlay, RenderTile &rtile)
-{
- do_write_update_render_result(b_rlay, rtile, true);
+ const int2 tile_size = session->get_render_tile_size();
+ vector<float> pixels(tile_size.x * tile_size.y * 4);
+
+ /* Copy combined pass. */
+ BL::RenderPass b_combined_pass(b_rlay.passes.find_by_name("Combined", b_rview_name.c_str()));
+ if (session->get_render_tile_pixels("Combined", b_combined_pass.channels(), &pixels[0])) {
+ b_combined_pass.rect(&pixels[0]);
+ }
}
void BlenderSession::synchronize(BL::Depsgraph &b_depsgraph_)
@@ -764,19 +783,19 @@ void BlenderSession::synchronize(BL::Depsgraph &b_depsgraph_)
return;
/* on session/scene parameter changes, we recreate session entirely */
- SessionParams session_params = BlenderSync::get_session_params(
+ const SessionParams session_params = BlenderSync::get_session_params(
b_engine, b_userpref, b_scene, background);
- SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background);
- bool session_pause = BlenderSync::get_session_pause(b_scene, background);
+ const SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background);
+ const bool session_pause = BlenderSync::get_session_pause(b_scene, background);
if (session->params.modified(session_params) || scene->params.modified(scene_params)) {
free_session();
create_session();
}
- /* increase samples, but never decrease */
+ /* increase samples and render time, but never decrease */
session->set_samples(session_params.samples);
- session->set_denoising_start_sample(session_params.denoising.start_sample);
+ session->set_time_limit(session_params.time_limit);
session->set_pause(session_pause);
/* copy recalc flags, outside of mutex so we can decide to do the real
@@ -808,21 +827,12 @@ void BlenderSession::synchronize(BL::Depsgraph &b_depsgraph_)
sync->sync_camera(b_render, b_camera_override, width, height, "");
/* get buffer parameters */
- BufferParams buffer_params = BlenderSync::get_buffer_params(
- b_v3d, b_rv3d, scene->camera, width, height, session_params.denoising.use);
-
- if (!buffer_params.denoising_data_pass) {
- session_params.denoising.use = false;
- }
-
- session->set_denoising(session_params.denoising);
-
- /* Update film if denoising data was enabled or disabled. */
- scene->film->set_denoising_data_pass(buffer_params.denoising_data_pass);
+ const BufferParams buffer_params = BlenderSync::get_buffer_params(
+ b_v3d, b_rv3d, scene->camera, width, height);
/* reset if needed */
if (scene->need_reset()) {
- session->reset(buffer_params, session_params.samples);
+ session->reset(session_params, buffer_params);
/* After session reset, so device is not accessing image data anymore. */
builtin_images_load();
@@ -839,7 +849,44 @@ void BlenderSession::synchronize(BL::Depsgraph &b_depsgraph_)
session->start();
}
-bool BlenderSession::draw(int w, int h)
+void BlenderSession::draw(BL::SpaceImageEditor &space_image)
+{
+ if (!session || !session->scene) {
+ /* Offline render drawing does not force the render engine update, which means it's possible
+ * that the Session is not created yet. */
+ return;
+ }
+
+ thread_scoped_lock lock(draw_state_.mutex);
+
+ const int pass_index = space_image.image_user().multilayer_pass();
+ if (pass_index != draw_state_.last_pass_index) {
+ BL::RenderPass b_display_pass(b_engine.pass_by_index_get(b_rlay_name.c_str(), pass_index));
+ if (!b_display_pass) {
+ return;
+ }
+
+ Scene *scene = session->scene;
+
+ thread_scoped_lock lock(scene->mutex);
+
+ const Pass *pass = Pass::find(scene->passes, b_display_pass.name());
+ if (!pass) {
+ return;
+ }
+
+ scene->film->set_display_pass(pass->get_type());
+
+ draw_state_.last_pass_index = pass_index;
+ }
+
+ BL::Array<float, 2> zoom = space_image.zoom();
+ gpu_display_->set_zoom(zoom[0], zoom[1]);
+
+ session->draw();
+}
+
+void BlenderSession::view_draw(int w, int h)
{
/* pause in redraw in case update is not being called due to final render */
session->set_pause(BlenderSync::get_session_pause(b_scene, background));
@@ -885,14 +932,14 @@ bool BlenderSession::draw(int w, int h)
/* reset if requested */
if (reset) {
- SessionParams session_params = BlenderSync::get_session_params(
+ const SessionParams session_params = BlenderSync::get_session_params(
b_engine, b_userpref, b_scene, background);
- BufferParams buffer_params = BlenderSync::get_buffer_params(
- b_v3d, b_rv3d, scene->camera, width, height, session_params.denoising.use);
- bool session_pause = BlenderSync::get_session_pause(b_scene, background);
+ const BufferParams buffer_params = BlenderSync::get_buffer_params(
+ b_v3d, b_rv3d, scene->camera, width, height);
+ const bool session_pause = BlenderSync::get_session_pause(b_scene, background);
if (session_pause == false) {
- session->reset(buffer_params, session_params.samples);
+ session->reset(session_params, buffer_params);
start_resize_time = 0.0;
}
}
@@ -905,18 +952,7 @@ bool BlenderSession::draw(int w, int h)
update_status_progress();
/* draw */
- BufferParams buffer_params = BlenderSync::get_buffer_params(
- b_v3d, b_rv3d, scene->camera, width, height, session->params.denoising.use);
- DeviceDrawParams draw_params;
-
- if (session->params.display_buffer_linear) {
- draw_params.bind_display_space_shader_cb = function_bind(
- &BL::RenderEngine::bind_display_space_shader, &b_engine, b_scene);
- draw_params.unbind_display_space_shader_cb = function_bind(
- &BL::RenderEngine::unbind_display_space_shader, &b_engine);
- }
-
- return !session->draw(buffer_params, draw_params);
+ session->draw();
}
void BlenderSession::get_status(string &status, string &substatus)
@@ -924,11 +960,6 @@ void BlenderSession::get_status(string &status, string &substatus)
session->progress.get_status(status, substatus);
}
-void BlenderSession::get_kernel_status(string &kernel_status)
-{
- session->progress.get_kernel_status(kernel_status);
-}
-
void BlenderSession::get_progress(float &progress, double &total_time, double &render_time)
{
session->progress.get_time(total_time, render_time);
@@ -947,7 +978,7 @@ void BlenderSession::update_bake_progress()
void BlenderSession::update_status_progress()
{
- string timestatus, status, substatus, kernel_status;
+ string timestatus, status, substatus;
string scene_status = "";
float progress;
double total_time, remaining_time = 0, render_time;
@@ -955,7 +986,6 @@ void BlenderSession::update_status_progress()
float mem_peak = (float)session->stats.mem_peak / 1024.0f / 1024.0f;
get_status(status, substatus);
- get_kernel_status(kernel_status);
get_progress(progress, total_time, render_time);
if (progress > 0)
@@ -980,14 +1010,12 @@ void BlenderSession::update_status_progress()
status = " | " + status;
if (substatus.size() > 0)
status += " | " + substatus;
- if (kernel_status.size() > 0)
- status += " | " + kernel_status;
}
double current_time = time_dt();
- /* When rendering in a window, redraw the status at least once per second to keep the elapsed and
- * remaining time up-to-date. For headless rendering, only report when something significant
- * changes to keep the console output readable. */
+ /* When rendering in a window, redraw the status at least once per second to keep the elapsed
+ * and remaining time up-to-date. For headless rendering, only report when something
+ * significant changes to keep the console output readable. */
if (status != last_status || (!headless && (current_time - last_status_time) > 1.0)) {
b_engine.update_stats("", (timestatus + scene_status + status).c_str());
b_engine.update_memory_stats(mem_used, mem_peak);
@@ -1048,56 +1076,6 @@ void BlenderSession::test_cancel()
session->progress.set_cancel("Cancelled");
}
-void BlenderSession::update_resumable_tile_manager(int num_samples)
-{
- const int num_resumable_chunks = BlenderSession::num_resumable_chunks,
- current_resumable_chunk = BlenderSession::current_resumable_chunk;
- if (num_resumable_chunks == 0) {
- return;
- }
-
- if (num_resumable_chunks > num_samples) {
- fprintf(stderr,
- "Cycles warning: more sample chunks (%d) than samples (%d), "
- "this will cause some samples to be included in multiple chunks.\n",
- num_resumable_chunks,
- num_samples);
- }
-
- const float num_samples_per_chunk = (float)num_samples / num_resumable_chunks;
-
- float range_start_sample, range_num_samples;
- if (current_resumable_chunk != 0) {
- /* Single chunk rendering. */
- range_start_sample = num_samples_per_chunk * (current_resumable_chunk - 1);
- range_num_samples = num_samples_per_chunk;
- }
- else {
- /* Ranged-chunks. */
- const int num_chunks = end_resumable_chunk - start_resumable_chunk + 1;
- range_start_sample = num_samples_per_chunk * (start_resumable_chunk - 1);
- range_num_samples = num_chunks * num_samples_per_chunk;
- }
-
- /* Round after doing the multiplications with num_chunks and num_samples_per_chunk
- * to allow for many small chunks. */
- int rounded_range_start_sample = (int)floorf(range_start_sample + 0.5f);
- int rounded_range_num_samples = max((int)floorf(range_num_samples + 0.5f), 1);
-
- /* Make sure we don't overshoot. */
- if (rounded_range_start_sample + rounded_range_num_samples > num_samples) {
- rounded_range_num_samples = num_samples - rounded_range_num_samples;
- }
-
- VLOG(1) << "Samples range start is " << range_start_sample << ", "
- << "number of samples to render is " << range_num_samples;
-
- scene->integrator->set_start_sample(rounded_range_start_sample);
-
- session->tile_manager.range_start_sample = rounded_range_start_sample;
- session->tile_manager.range_num_samples = rounded_range_num_samples;
-}
-
void BlenderSession::free_blender_memory_if_possible()
{
if (!background) {
diff --git a/intern/cycles/blender/blender_session.h b/intern/cycles/blender/blender_session.h
index d967b81c854..11e2657a325 100644
--- a/intern/cycles/blender/blender_session.h
+++ b/intern/cycles/blender/blender_session.h
@@ -29,12 +29,11 @@
CCL_NAMESPACE_BEGIN
+class BlenderGPUDisplay;
class BlenderSync;
class ImageMetaData;
class Scene;
class Session;
-class RenderBuffers;
-class RenderTile;
class BlenderSession {
public:
@@ -62,6 +61,8 @@ class BlenderSession {
/* offline render */
void render(BL::Depsgraph &b_depsgraph);
+ void render_frame_finish();
+
  void bake(BL::Depsgraph &b_depsgraph,
BL::Object &b_object,
const string &pass_type,
@@ -69,24 +70,29 @@ class BlenderSession {
const int bake_width,
const int bake_height);
- void write_render_result(BL::RenderLayer &b_rlay, RenderTile &rtile);
- void write_render_tile(RenderTile &rtile);
- void read_render_tile(RenderTile &rtile);
+ void write_render_result(BL::RenderLayer &b_rlay);
+ void write_render_tile();
+
+ void update_render_tile();
+
+ void full_buffer_written(string_view filename);
  /* update functions are used to update the display buffer only after a sample was rendered,
   * only needed for better visual feedback */
- void update_render_result(BL::RenderLayer &b_rlay, RenderTile &rtile);
- void update_render_tile(RenderTile &rtile, bool highlight);
+ void update_render_result(BL::RenderLayer &b_rlay);
+
+ /* read functions for baking input */
+ void read_render_tile();
/* interactive updates */
void synchronize(BL::Depsgraph &b_depsgraph);
/* drawing */
- bool draw(int w, int h);
+ void draw(BL::SpaceImageEditor &space_image);
+ void view_draw(int w, int h);
void tag_redraw();
void tag_update();
void get_status(string &status, string &substatus);
- void get_kernel_status(string &kernel_status);
void get_progress(float &progress, double &total_time, double &render_time);
void test_cancel();
void update_status_progress();
@@ -123,6 +129,8 @@ class BlenderSession {
void *python_thread_state;
+ bool use_developer_ui;
+
/* Global state which is common for all render sessions created from Blender.
* Usually denotes command line arguments.
*/
@@ -134,41 +142,28 @@ class BlenderSession {
*/
static bool headless;
- /* ** Resumable render ** */
-
- /* Overall number of chunks in which the sample range is to be divided. */
- static int num_resumable_chunks;
-
- /* Current resumable chunk index to render. */
- static int current_resumable_chunk;
-
- /* Alternative to single-chunk rendering to render a range of chunks. */
- static int start_resumable_chunk;
- static int end_resumable_chunk;
-
static bool print_render_stats;
protected:
void stamp_view_layer_metadata(Scene *scene, const string &view_layer_name);
- void do_write_update_render_result(BL::RenderLayer &b_rlay,
- RenderTile &rtile,
- bool do_update_only);
- void do_write_update_render_tile(RenderTile &rtile,
- bool do_update_only,
- bool do_read_only,
- bool highlight);
-
void builtin_images_load();
- /* Update tile manager to reflect resumable render settings. */
- void update_resumable_tile_manager(int num_samples);
-
  /* Used after each render layer synchronization is done, with the goal
   * of freeing render engine data which is held on the Blender side (for
   * example, the dependency graph).
*/
void free_blender_memory_if_possible();
+
+ struct {
+ thread_mutex mutex;
+ int last_pass_index = -1;
+ } draw_state_;
+
+ /* NOTE: The BlenderSession references the GPU display. */
+ BlenderGPUDisplay *gpu_display_ = nullptr;
+
+ vector<string> full_buffer_files_;
};
CCL_NAMESPACE_END
diff --git a/intern/cycles/blender/blender_shader.cpp b/intern/cycles/blender/blender_shader.cpp
index de7b2761d00..8c4f789ffd0 100644
--- a/intern/cycles/blender/blender_shader.cpp
+++ b/intern/cycles/blender/blender_shader.cpp
@@ -17,6 +17,7 @@
#include "render/background.h"
#include "render/colorspace.h"
#include "render/graph.h"
+#include "render/integrator.h"
#include "render/light.h"
#include "render/nodes.h"
#include "render/osl.h"
@@ -475,17 +476,11 @@ static ShaderNode *add_node(Scene *scene,
SubsurfaceScatteringNode *subsurface = graph->create_node<SubsurfaceScatteringNode>();
switch (b_subsurface_node.falloff()) {
- case BL::ShaderNodeSubsurfaceScattering::falloff_CUBIC:
- subsurface->set_falloff(CLOSURE_BSSRDF_CUBIC_ID);
- break;
- case BL::ShaderNodeSubsurfaceScattering::falloff_GAUSSIAN:
- subsurface->set_falloff(CLOSURE_BSSRDF_GAUSSIAN_ID);
- break;
- case BL::ShaderNodeSubsurfaceScattering::falloff_BURLEY:
- subsurface->set_falloff(CLOSURE_BSSRDF_BURLEY_ID);
+ case BL::ShaderNodeSubsurfaceScattering::falloff_RANDOM_WALK_FIXED_RADIUS:
+ subsurface->set_method(CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID);
break;
case BL::ShaderNodeSubsurfaceScattering::falloff_RANDOM_WALK:
- subsurface->set_falloff(CLOSURE_BSSRDF_RANDOM_WALK_ID);
+ subsurface->set_method(CLOSURE_BSSRDF_RANDOM_WALK_ID);
break;
}
@@ -597,11 +592,11 @@ static ShaderNode *add_node(Scene *scene,
break;
}
switch (b_principled_node.subsurface_method()) {
- case BL::ShaderNodeBsdfPrincipled::subsurface_method_BURLEY:
- principled->set_subsurface_method(CLOSURE_BSSRDF_PRINCIPLED_ID);
+ case BL::ShaderNodeBsdfPrincipled::subsurface_method_RANDOM_WALK_FIXED_RADIUS:
+ principled->set_subsurface_method(CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID);
break;
case BL::ShaderNodeBsdfPrincipled::subsurface_method_RANDOM_WALK:
- principled->set_subsurface_method(CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID);
+ principled->set_subsurface_method(CLOSURE_BSSRDF_RANDOM_WALK_ID);
break;
}
node = principled;
@@ -1360,10 +1355,11 @@ void BlenderSync::sync_materials(BL::Depsgraph &b_depsgraph, bool update_all)
void BlenderSync::sync_world(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d, bool update_all)
{
Background *background = scene->background;
+ Integrator *integrator = scene->integrator;
BL::World b_world = b_scene.world();
- BlenderViewportParameters new_viewport_parameters(b_v3d);
+ BlenderViewportParameters new_viewport_parameters(b_v3d, use_developer_ui);
if (world_recalc || update_all || b_world.ptr.data != world_map ||
viewport_parameters.shader_modified(new_viewport_parameters)) {
@@ -1455,9 +1451,8 @@ void BlenderSync::sync_world(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d,
/* AO */
BL::WorldLighting b_light = b_world.light_settings();
- background->set_use_ao(b_light.use_ambient_occlusion());
- background->set_ao_factor(b_light.ao_factor());
- background->set_ao_distance(b_light.distance());
+ integrator->set_ao_factor(b_light.ao_factor());
+ integrator->set_ao_distance(b_light.distance());
/* visibility */
PointerRNA cvisibility = RNA_pointer_get(&b_world.ptr, "cycles_visibility");
@@ -1472,9 +1467,8 @@ void BlenderSync::sync_world(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d,
background->set_visibility(visibility);
}
else {
- background->set_use_ao(false);
- background->set_ao_factor(0.0f);
- background->set_ao_distance(FLT_MAX);
+ integrator->set_ao_factor(1.0f);
+ integrator->set_ao_distance(10.0f);
}
shader->set_graph(graph);
@@ -1496,7 +1490,6 @@ void BlenderSync::sync_world(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d,
background->set_use_shader(view_layer.use_background_shader ||
viewport_parameters.use_custom_shader());
- background->set_use_ao(background->get_use_ao() && view_layer.use_background_ao);
background->tag_update(scene);
}
diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index 26d64b7bf85..d6fc7ee1723 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -53,6 +53,7 @@ BlenderSync::BlenderSync(BL::RenderEngine &b_engine,
BL::Scene &b_scene,
Scene *scene,
bool preview,
+ bool use_developer_ui,
Progress &progress)
: b_engine(b_engine),
b_data(b_data),
@@ -68,6 +69,7 @@ BlenderSync::BlenderSync(BL::RenderEngine &b_engine,
scene(scene),
preview(preview),
experimental(false),
+ use_developer_ui(use_developer_ui),
dicing_rate(1.0f),
max_subdivisions(12),
progress(progress),
@@ -224,7 +226,7 @@ void BlenderSync::sync_recalc(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d
}
if (b_v3d) {
- BlenderViewportParameters new_viewport_parameters(b_v3d);
+ BlenderViewportParameters new_viewport_parameters(b_v3d, use_developer_ui);
if (viewport_parameters.shader_modified(new_viewport_parameters)) {
world_recalc = true;
@@ -251,9 +253,13 @@ void BlenderSync::sync_data(BL::RenderSettings &b_render,
BL::ViewLayer b_view_layer = b_depsgraph.view_layer_eval();
+  /* TODO(sergey): It feels weak to pass the view layer to the integrator, and even weaker to have
+   * an implicit check on whether this is a background render or not. What would be a nicer way? */
+ const bool background = !b_v3d;
+
sync_view_layer(b_view_layer);
- sync_integrator();
- sync_film(b_v3d);
+ sync_integrator(b_view_layer, background);
+ sync_film(b_view_layer, b_v3d);
sync_shaders(b_depsgraph, b_v3d);
sync_images();
@@ -280,7 +286,7 @@ void BlenderSync::sync_data(BL::RenderSettings &b_render,
/* Integrator */
-void BlenderSync::sync_integrator()
+void BlenderSync::sync_integrator(BL::ViewLayer &b_view_layer, bool background)
{
PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
@@ -328,59 +334,24 @@ void BlenderSync::sync_integrator()
integrator->set_motion_blur(view_layer.use_motion_blur);
}
- integrator->set_method((Integrator::Method)get_enum(
- cscene, "progressive", Integrator::NUM_METHODS, Integrator::PATH));
-
- integrator->set_sample_all_lights_direct(get_boolean(cscene, "sample_all_lights_direct"));
- integrator->set_sample_all_lights_indirect(get_boolean(cscene, "sample_all_lights_indirect"));
integrator->set_light_sampling_threshold(get_float(cscene, "light_sampling_threshold"));
SamplingPattern sampling_pattern = (SamplingPattern)get_enum(
cscene, "sampling_pattern", SAMPLING_NUM_PATTERNS, SAMPLING_PATTERN_SOBOL);
-
- int adaptive_min_samples = INT_MAX;
-
- if (RNA_boolean_get(&cscene, "use_adaptive_sampling")) {
- sampling_pattern = SAMPLING_PATTERN_PMJ;
- adaptive_min_samples = get_int(cscene, "adaptive_min_samples");
- integrator->set_adaptive_threshold(get_float(cscene, "adaptive_threshold"));
- }
- else {
- integrator->set_adaptive_threshold(0.0f);
- }
-
integrator->set_sampling_pattern(sampling_pattern);
- int diffuse_samples = get_int(cscene, "diffuse_samples");
- int glossy_samples = get_int(cscene, "glossy_samples");
- int transmission_samples = get_int(cscene, "transmission_samples");
- int ao_samples = get_int(cscene, "ao_samples");
- int mesh_light_samples = get_int(cscene, "mesh_light_samples");
- int subsurface_samples = get_int(cscene, "subsurface_samples");
- int volume_samples = get_int(cscene, "volume_samples");
-
- if (get_boolean(cscene, "use_square_samples")) {
- integrator->set_diffuse_samples(diffuse_samples * diffuse_samples);
- integrator->set_glossy_samples(glossy_samples * glossy_samples);
- integrator->set_transmission_samples(transmission_samples * transmission_samples);
- integrator->set_ao_samples(ao_samples * ao_samples);
- integrator->set_mesh_light_samples(mesh_light_samples * mesh_light_samples);
- integrator->set_subsurface_samples(subsurface_samples * subsurface_samples);
- integrator->set_volume_samples(volume_samples * volume_samples);
- adaptive_min_samples = min(adaptive_min_samples * adaptive_min_samples, INT_MAX);
+ if (preview) {
+ integrator->set_use_adaptive_sampling(
+ RNA_boolean_get(&cscene, "use_preview_adaptive_sampling"));
+ integrator->set_adaptive_threshold(get_float(cscene, "preview_adaptive_threshold"));
+ integrator->set_adaptive_min_samples(get_int(cscene, "preview_adaptive_min_samples"));
}
else {
- integrator->set_diffuse_samples(diffuse_samples);
- integrator->set_glossy_samples(glossy_samples);
- integrator->set_transmission_samples(transmission_samples);
- integrator->set_ao_samples(ao_samples);
- integrator->set_mesh_light_samples(mesh_light_samples);
- integrator->set_subsurface_samples(subsurface_samples);
- integrator->set_volume_samples(volume_samples);
+ integrator->set_use_adaptive_sampling(RNA_boolean_get(&cscene, "use_adaptive_sampling"));
+ integrator->set_adaptive_threshold(get_float(cscene, "adaptive_threshold"));
+ integrator->set_adaptive_min_samples(get_int(cscene, "adaptive_min_samples"));
}
- integrator->set_adaptive_min_samples(adaptive_min_samples);
-
if (get_boolean(cscene, "use_fast_gi")) {
if (preview) {
integrator->set_ao_bounces(get_int(cscene, "ao_bounces"));
@@ -393,20 +364,38 @@ void BlenderSync::sync_integrator()
integrator->set_ao_bounces(0);
}
- /* UPDATE_NONE as we don't want to tag the integrator as modified, just tag dependent things */
+ const DenoiseParams denoise_params = get_denoise_params(b_scene, b_view_layer, background);
+ integrator->set_use_denoise(denoise_params.use);
+
+ /* Only update denoiser parameters if the denoiser is actually used. This allows to tweak
+ * denoiser parameters before enabling it without render resetting on every change. The downside
+ * is that the interface and the integrator are technically out of sync. */
+ if (denoise_params.use) {
+ integrator->set_denoiser_type(denoise_params.type);
+ integrator->set_denoise_start_sample(denoise_params.start_sample);
+ integrator->set_use_denoise_pass_albedo(denoise_params.use_pass_albedo);
+ integrator->set_use_denoise_pass_normal(denoise_params.use_pass_normal);
+ integrator->set_denoiser_prefilter(denoise_params.prefilter);
+ }
+
+ /* UPDATE_NONE as we don't want to tag the integrator as modified (this was done by the
+ * set calls above), but we need to make sure that the dependent things are tagged. */
integrator->tag_update(scene, Integrator::UPDATE_NONE);
}
/* Film */
-void BlenderSync::sync_film(BL::SpaceView3D &b_v3d)
+void BlenderSync::sync_film(BL::ViewLayer &b_view_layer, BL::SpaceView3D &b_v3d)
{
PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
+ PointerRNA crl = RNA_pointer_get(&b_view_layer.ptr, "cycles");
Film *film = scene->film;
if (b_v3d) {
- film->set_display_pass(update_viewport_display_passes(b_v3d, scene->passes));
+ const BlenderViewportParameters new_viewport_parameters(b_v3d, use_developer_ui);
+ film->set_display_pass(new_viewport_parameters.display_pass);
+ film->set_show_active_pixels(new_viewport_parameters.show_active_pixels);
}
film->set_exposure(get_float(cscene, "film_exposure"));
@@ -434,6 +423,15 @@ void BlenderSync::sync_film(BL::SpaceView3D &b_v3d)
break;
}
}
+
+  /* The Blender viewport does not support proper shadow catcher compositing, so force the
+   * approximate mode to improve visual feedback. */
+ if (b_v3d) {
+ film->set_use_approximate_shadow_catcher(true);
+ }
+ else {
+ film->set_use_approximate_shadow_catcher(!get_boolean(crl, "use_pass_shadow_catcher"));
+ }
}
/* Render Layer */
@@ -444,7 +442,6 @@ void BlenderSync::sync_view_layer(BL::ViewLayer &b_view_layer)
/* Filter. */
view_layer.use_background_shader = b_view_layer.use_sky();
- view_layer.use_background_ao = b_view_layer.use_ao();
/* Always enable surfaces for baking, otherwise there is nothing to bake to. */
view_layer.use_surfaces = b_view_layer.use_solid() || scene->bake_manager->get_baking();
view_layer.use_hair = b_view_layer.use_strand();
@@ -464,10 +461,7 @@ void BlenderSync::sync_view_layer(BL::ViewLayer &b_view_layer)
if (use_layer_samples != 2) {
int samples = b_view_layer.samples();
- if (get_boolean(cscene, "use_square_samples"))
- view_layer.samples = samples * samples;
- else
- view_layer.samples = samples;
+ view_layer.samples = samples;
}
}
@@ -499,7 +493,8 @@ void BlenderSync::sync_images()
}
/* Passes */
-PassType BlenderSync::get_pass_type(BL::RenderPass &b_pass)
+
+static PassType get_blender_pass_type(BL::RenderPass &b_pass)
{
string name = b_pass.name();
#define MAP_PASS(passname, passtype) \
@@ -507,10 +502,15 @@ PassType BlenderSync::get_pass_type(BL::RenderPass &b_pass)
return passtype; \
} \
((void)0)
+
/* NOTE: Keep in sync with defined names from DNA_scene_types.h */
+
MAP_PASS("Combined", PASS_COMBINED);
+ MAP_PASS("Noisy Image", PASS_COMBINED);
+
MAP_PASS("Depth", PASS_DEPTH);
MAP_PASS("Mist", PASS_MIST);
+ MAP_PASS("Position", PASS_POSITION);
MAP_PASS("Normal", PASS_NORMAL);
MAP_PASS("IndexOB", PASS_OBJECT_ID);
MAP_PASS("UV", PASS_UV);
@@ -539,118 +539,92 @@ PassType BlenderSync::get_pass_type(BL::RenderPass &b_pass)
MAP_PASS("BakePrimitive", PASS_BAKE_PRIMITIVE);
MAP_PASS("BakeDifferential", PASS_BAKE_DIFFERENTIAL);
+ MAP_PASS("Denoising Normal", PASS_DENOISING_NORMAL);
+ MAP_PASS("Denoising Albedo", PASS_DENOISING_ALBEDO);
+
+ MAP_PASS("Shadow Catcher", PASS_SHADOW_CATCHER);
+ MAP_PASS("Noisy Shadow Catcher", PASS_SHADOW_CATCHER);
+
MAP_PASS("Debug Render Time", PASS_RENDER_TIME);
+
MAP_PASS("AdaptiveAuxBuffer", PASS_ADAPTIVE_AUX_BUFFER);
MAP_PASS("Debug Sample Count", PASS_SAMPLE_COUNT);
+
if (string_startswith(name, cryptomatte_prefix)) {
return PASS_CRYPTOMATTE;
}
+
#undef MAP_PASS
return PASS_NONE;
}
-int BlenderSync::get_denoising_pass(BL::RenderPass &b_pass)
+static Pass *pass_add(Scene *scene,
+ PassType type,
+ const char *name,
+ PassMode mode = PassMode::DENOISED)
{
- string name = b_pass.name();
+ Pass *pass = scene->create_node<Pass>();
- if (name == "Noisy Image")
- return DENOISING_PASS_PREFILTERED_COLOR;
+ pass->set_type(type);
+ pass->set_name(ustring(name));
+ pass->set_mode(mode);
- if (name.substr(0, 10) != "Denoising ") {
- return -1;
- }
- name = name.substr(10);
-
-#define MAP_PASS(passname, offset) \
- if (name == passname) { \
- return offset; \
- } \
- ((void)0)
- MAP_PASS("Normal", DENOISING_PASS_PREFILTERED_NORMAL);
- MAP_PASS("Albedo", DENOISING_PASS_PREFILTERED_ALBEDO);
- MAP_PASS("Depth", DENOISING_PASS_PREFILTERED_DEPTH);
- MAP_PASS("Shadowing", DENOISING_PASS_PREFILTERED_SHADOWING);
- MAP_PASS("Variance", DENOISING_PASS_PREFILTERED_VARIANCE);
- MAP_PASS("Intensity", DENOISING_PASS_PREFILTERED_INTENSITY);
- MAP_PASS("Clean", DENOISING_PASS_CLEAN);
-#undef MAP_PASS
-
- return -1;
+ return pass;
}
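/* Editor's sketch (not part of this commit): the equivalent expansion of a typical
 * call to the helper above, pass_add(scene, PASS_MIST, "Mist"), using the default
 * mode argument shown in its signature. */
static void pass_add_example(Scene *scene)
{
  Pass *mist = scene->create_node<Pass>();
  mist->set_type(PASS_MIST);
  mist->set_name(ustring("Mist"));
  mist->set_mode(PassMode::DENOISED); /* Default mode argument. */
  (void)mist;
}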
-vector<Pass> BlenderSync::sync_render_passes(BL::Scene &b_scene,
- BL::RenderLayer &b_rlay,
- BL::ViewLayer &b_view_layer,
- bool adaptive_sampling,
- const DenoiseParams &denoising)
+void BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLayer &b_view_layer)
{
- vector<Pass> passes;
+ PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
+
+ /* Delete all existing passes. */
+ set<Pass *> clear_passes(scene->passes.begin(), scene->passes.end());
+ scene->delete_nodes(clear_passes);
- /* loop over passes */
+ /* Always add combined pass. */
+ pass_add(scene, PASS_COMBINED, "Combined");
+
+ /* Blender built-in data and light passes. */
for (BL::RenderPass &b_pass : b_rlay.passes) {
- PassType pass_type = get_pass_type(b_pass);
+ const PassType pass_type = get_blender_pass_type(b_pass);
+
+ if (pass_type == PASS_NONE) {
+ LOG(ERROR) << "Unknown pass " << b_pass.name();
+ continue;
+ }
if (pass_type == PASS_MOTION &&
(b_view_layer.use_motion_blur() && b_scene.render().use_motion_blur())) {
continue;
}
- if (pass_type != PASS_NONE)
- Pass::add(pass_type, passes, b_pass.name().c_str());
- }
-
- PointerRNA crl = RNA_pointer_get(&b_view_layer.ptr, "cycles");
- int denoising_flags = 0;
- if (denoising.use || denoising.store_passes) {
- if (denoising.type == DENOISER_NLM) {
-#define MAP_OPTION(name, flag) \
- if (!get_boolean(crl, name)) { \
- denoising_flags |= flag; \
- } \
- ((void)0)
- MAP_OPTION("denoising_diffuse_direct", DENOISING_CLEAN_DIFFUSE_DIR);
- MAP_OPTION("denoising_diffuse_indirect", DENOISING_CLEAN_DIFFUSE_IND);
- MAP_OPTION("denoising_glossy_direct", DENOISING_CLEAN_GLOSSY_DIR);
- MAP_OPTION("denoising_glossy_indirect", DENOISING_CLEAN_GLOSSY_IND);
- MAP_OPTION("denoising_transmission_direct", DENOISING_CLEAN_TRANSMISSION_DIR);
- MAP_OPTION("denoising_transmission_indirect", DENOISING_CLEAN_TRANSMISSION_IND);
-#undef MAP_OPTION
- }
- b_engine.add_pass("Noisy Image", 4, "RGBA", b_view_layer.name().c_str());
+ pass_add(scene, pass_type, b_pass.name().c_str());
}
- scene->film->set_denoising_flags(denoising_flags);
-
- if (denoising.store_passes) {
- b_engine.add_pass("Denoising Normal", 3, "XYZ", b_view_layer.name().c_str());
- b_engine.add_pass("Denoising Albedo", 3, "RGB", b_view_layer.name().c_str());
- b_engine.add_pass("Denoising Depth", 1, "Z", b_view_layer.name().c_str());
- if (denoising.type == DENOISER_NLM) {
- b_engine.add_pass("Denoising Shadowing", 1, "X", b_view_layer.name().c_str());
- b_engine.add_pass("Denoising Variance", 3, "RGB", b_view_layer.name().c_str());
- b_engine.add_pass("Denoising Intensity", 1, "X", b_view_layer.name().c_str());
- }
- if (scene->film->get_denoising_flags() & DENOISING_CLEAN_ALL_PASSES) {
- b_engine.add_pass("Denoising Clean", 3, "RGB", b_view_layer.name().c_str());
- }
- }
+ PointerRNA crl = RNA_pointer_get(&b_view_layer.ptr, "cycles");
+ /* Debug passes. */
if (get_boolean(crl, "pass_debug_render_time")) {
b_engine.add_pass("Debug Render Time", 1, "X", b_view_layer.name().c_str());
- Pass::add(PASS_RENDER_TIME, passes, "Debug Render Time");
+ pass_add(scene, PASS_RENDER_TIME, "Debug Render Time");
}
if (get_boolean(crl, "pass_debug_sample_count")) {
b_engine.add_pass("Debug Sample Count", 1, "X", b_view_layer.name().c_str());
- Pass::add(PASS_SAMPLE_COUNT, passes, "Debug Sample Count");
+ pass_add(scene, PASS_SAMPLE_COUNT, "Debug Sample Count");
}
+
+ /* Cycles specific passes. */
if (get_boolean(crl, "use_pass_volume_direct")) {
b_engine.add_pass("VolumeDir", 3, "RGB", b_view_layer.name().c_str());
- Pass::add(PASS_VOLUME_DIRECT, passes, "VolumeDir");
+ pass_add(scene, PASS_VOLUME_DIRECT, "VolumeDir");
}
if (get_boolean(crl, "use_pass_volume_indirect")) {
b_engine.add_pass("VolumeInd", 3, "RGB", b_view_layer.name().c_str());
- Pass::add(PASS_VOLUME_INDIRECT, passes, "VolumeInd");
+ pass_add(scene, PASS_VOLUME_INDIRECT, "VolumeInd");
+ }
+ if (get_boolean(crl, "use_pass_shadow_catcher")) {
+ b_engine.add_pass("Shadow Catcher", 3, "RGB", b_view_layer.name().c_str());
+ pass_add(scene, PASS_SHADOW_CATCHER, "Shadow Catcher");
}
/* Cryptomatte stores two ID/weight pairs per RGBA layer.
@@ -662,7 +636,7 @@ vector<Pass> BlenderSync::sync_render_passes(BL::Scene &b_scene,
for (int i = 0; i < crypto_depth; i++) {
string passname = cryptomatte_prefix + string_printf("Object%02d", i);
b_engine.add_pass(passname.c_str(), 4, "RGBA", b_view_layer.name().c_str());
- Pass::add(PASS_CRYPTOMATTE, passes, passname.c_str());
+ pass_add(scene, PASS_CRYPTOMATTE, passname.c_str());
}
cryptomatte_passes = (CryptomatteType)(cryptomatte_passes | CRYPT_OBJECT);
}
@@ -670,7 +644,7 @@ vector<Pass> BlenderSync::sync_render_passes(BL::Scene &b_scene,
for (int i = 0; i < crypto_depth; i++) {
string passname = cryptomatte_prefix + string_printf("Material%02d", i);
b_engine.add_pass(passname.c_str(), 4, "RGBA", b_view_layer.name().c_str());
- Pass::add(PASS_CRYPTOMATTE, passes, passname.c_str());
+ pass_add(scene, PASS_CRYPTOMATTE, passname.c_str());
}
cryptomatte_passes = (CryptomatteType)(cryptomatte_passes | CRYPT_MATERIAL);
}
@@ -678,22 +652,33 @@ vector<Pass> BlenderSync::sync_render_passes(BL::Scene &b_scene,
for (int i = 0; i < crypto_depth; i++) {
string passname = cryptomatte_prefix + string_printf("Asset%02d", i);
b_engine.add_pass(passname.c_str(), 4, "RGBA", b_view_layer.name().c_str());
- Pass::add(PASS_CRYPTOMATTE, passes, passname.c_str());
+ pass_add(scene, PASS_CRYPTOMATTE, passname.c_str());
}
cryptomatte_passes = (CryptomatteType)(cryptomatte_passes | CRYPT_ASSET);
}
- if (b_view_layer.use_pass_cryptomatte_accurate() && cryptomatte_passes != CRYPT_NONE) {
- cryptomatte_passes = (CryptomatteType)(cryptomatte_passes | CRYPT_ACCURATE);
- }
scene->film->set_cryptomatte_passes(cryptomatte_passes);
- if (adaptive_sampling) {
- Pass::add(PASS_ADAPTIVE_AUX_BUFFER, passes);
- if (!get_boolean(crl, "pass_debug_sample_count")) {
- Pass::add(PASS_SAMPLE_COUNT, passes);
+ /* Denoising passes. */
+ const bool use_denoising = get_boolean(cscene, "use_denoising") &&
+ get_boolean(crl, "use_denoising");
+ const bool store_denoising_passes = get_boolean(crl, "denoising_store_passes");
+ if (use_denoising) {
+ b_engine.add_pass("Noisy Image", 4, "RGBA", b_view_layer.name().c_str());
+ pass_add(scene, PASS_COMBINED, "Noisy Image", PassMode::NOISY);
+ if (get_boolean(crl, "use_pass_shadow_catcher")) {
+ b_engine.add_pass("Noisy Shadow Catcher", 3, "RGB", b_view_layer.name().c_str());
+ pass_add(scene, PASS_SHADOW_CATCHER, "Noisy Shadow Catcher", PassMode::NOISY);
}
}
+ if (store_denoising_passes) {
+ b_engine.add_pass("Denoising Normal", 3, "XYZ", b_view_layer.name().c_str());
+ pass_add(scene, PASS_DENOISING_NORMAL, "Denoising Normal", PassMode::NOISY);
+
+ b_engine.add_pass("Denoising Albedo", 3, "RGB", b_view_layer.name().c_str());
+ pass_add(scene, PASS_DENOISING_ALBEDO, "Denoising Albedo", PassMode::NOISY);
+ }
+ /* Custom AOV passes. */
BL::ViewLayer::aovs_iterator b_aov_iter;
for (b_view_layer.aovs.begin(b_aov_iter); b_aov_iter != b_view_layer.aovs.end(); ++b_aov_iter) {
BL::AOV b_aov(*b_aov_iter);
@@ -706,28 +691,15 @@ vector<Pass> BlenderSync::sync_render_passes(BL::Scene &b_scene,
if (is_color) {
b_engine.add_pass(name.c_str(), 4, "RGBA", b_view_layer.name().c_str());
- Pass::add(PASS_AOV_COLOR, passes, name.c_str());
+ pass_add(scene, PASS_AOV_COLOR, name.c_str());
}
else {
b_engine.add_pass(name.c_str(), 1, "X", b_view_layer.name().c_str());
- Pass::add(PASS_AOV_VALUE, passes, name.c_str());
+ pass_add(scene, PASS_AOV_VALUE, name.c_str());
}
}
- scene->film->set_denoising_data_pass(denoising.use || denoising.store_passes);
- scene->film->set_denoising_clean_pass(scene->film->get_denoising_flags() &
- DENOISING_CLEAN_ALL_PASSES);
- scene->film->set_denoising_prefiltered_pass(denoising.store_passes &&
- denoising.type == DENOISER_NLM);
scene->film->set_pass_alpha_threshold(b_view_layer.pass_alpha_threshold());
-
- if (!Pass::equals(passes, scene->passes)) {
- scene->film->tag_passes_update(scene, passes);
- scene->film->tag_modified();
- scene->integrator->tag_update(scene, Integrator::UPDATE_ALL);
- }
-
- return passes;
}
void BlenderSync::free_data_after_sync(BL::Depsgraph &b_depsgraph)
@@ -773,9 +745,9 @@ SceneParams BlenderSync::get_scene_params(BL::Scene &b_scene, bool background)
params.shadingsystem = SHADINGSYSTEM_OSL;
if (background || DebugFlags().viewport_static_bvh)
- params.bvh_type = SceneParams::BVH_STATIC;
+ params.bvh_type = BVH_TYPE_STATIC;
else
- params.bvh_type = SceneParams::BVH_DYNAMIC;
+ params.bvh_type = BVH_TYPE_DYNAMIC;
params.use_bvh_spatial_split = RNA_boolean_get(&cscene, "debug_use_spatial_splits");
params.use_bvh_unaligned_nodes = RNA_boolean_get(&cscene, "debug_use_hair_bvh");
@@ -818,8 +790,7 @@ bool BlenderSync::get_session_pause(BL::Scene &b_scene, bool background)
SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
BL::Preferences &b_preferences,
BL::Scene &b_scene,
- bool background,
- BL::ViewLayer b_view_layer)
+ bool background)
{
SessionParams params;
PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
@@ -827,7 +798,8 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
/* feature set */
params.experimental = (get_enum(cscene, "feature_set") != 0);
- /* Background */
+ /* Headless and background rendering. */
+ params.headless = BlenderSession::headless;
params.background = background;
/* Device */
@@ -836,111 +808,26 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
/* samples */
int samples = get_int(cscene, "samples");
- int aa_samples = get_int(cscene, "aa_samples");
int preview_samples = get_int(cscene, "preview_samples");
- int preview_aa_samples = get_int(cscene, "preview_aa_samples");
- if (get_boolean(cscene, "use_square_samples")) {
- aa_samples = aa_samples * aa_samples;
- preview_aa_samples = preview_aa_samples * preview_aa_samples;
-
- samples = samples * samples;
- preview_samples = preview_samples * preview_samples;
- }
-
- if (get_enum(cscene, "progressive") == 0 && params.device.has_branched_path) {
- if (background) {
- params.samples = aa_samples;
- }
- else {
- params.samples = preview_aa_samples;
- if (params.samples == 0)
- params.samples = INT_MAX;
- }
+ if (background) {
+ params.samples = samples;
}
else {
- if (background) {
- params.samples = samples;
- }
- else {
- params.samples = preview_samples;
- if (params.samples == 0)
- params.samples = INT_MAX;
- }
+ params.samples = preview_samples;
+ if (params.samples == 0)
+ params.samples = INT_MAX;
}
/* Clamp samples. */
params.samples = min(params.samples, Integrator::MAX_SAMPLES);
- /* Adaptive sampling. */
- params.adaptive_sampling = RNA_boolean_get(&cscene, "use_adaptive_sampling");
-
- /* tiles */
- const bool is_cpu = (params.device.type == DEVICE_CPU);
- if (!is_cpu && !background) {
- /* currently GPU could be much slower than CPU when using tiles,
- * still need to be investigated, but meanwhile make it possible
- * to work in viewport smoothly
- */
- int debug_tile_size = get_int(cscene, "debug_tile_size");
-
- params.tile_size = make_int2(debug_tile_size, debug_tile_size);
- }
- else {
- int tile_x = b_engine.tile_x();
- int tile_y = b_engine.tile_y();
-
- params.tile_size = make_int2(tile_x, tile_y);
- }
-
- if ((BlenderSession::headless == false) && background) {
- params.tile_order = (TileOrder)get_enum(cscene, "tile_order");
- }
- else {
- params.tile_order = TILE_BOTTOM_TO_TOP;
- }
-
- /* Denoising */
- params.denoising = get_denoise_params(b_scene, b_view_layer, background);
-
- if (params.denoising.use) {
- /* Add additional denoising devices if we are rendering and denoising
- * with different devices. */
- params.device.add_denoising_devices(params.denoising.type);
-
- /* Check if denoiser is supported by device. */
- if (!(params.device.denoisers & params.denoising.type)) {
- params.denoising.use = false;
- }
- }
-
/* Viewport Performance */
- params.start_resolution = get_int(cscene, "preview_start_resolution");
params.pixel_size = b_engine.get_preview_pixel_size(b_scene);
- /* other parameters */
- params.cancel_timeout = (double)get_float(cscene, "debug_cancel_timeout");
- params.reset_timeout = (double)get_float(cscene, "debug_reset_timeout");
- params.text_timeout = (double)get_float(cscene, "debug_text_timeout");
-
- /* progressive refine */
- BL::RenderSettings b_r = b_scene.render();
- params.progressive_refine = b_engine.is_preview() ||
- get_boolean(cscene, "use_progressive_refine");
- if (b_r.use_save_buffers() || params.adaptive_sampling)
- params.progressive_refine = false;
-
if (background) {
- if (params.progressive_refine)
- params.progressive = true;
- else
- params.progressive = false;
-
- params.start_resolution = INT_MAX;
params.pixel_size = 1;
}
- else
- params.progressive = true;
/* shading system - scene level needs full refresh */
const bool shadingsystem = RNA_boolean_get(&cscene, "shading_system");
@@ -950,19 +837,30 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
else if (shadingsystem == 1)
params.shadingsystem = SHADINGSYSTEM_OSL;
- /* Color management. */
- params.display_buffer_linear = b_engine.support_display_space_shader(b_scene);
-
- if (b_engine.is_preview()) {
- /* For preview rendering we're using same timeout as
- * blender's job update.
- */
- params.progressive_update_timeout = 0.1;
+ /* Time limit. */
+ if (background) {
+ params.time_limit = get_float(cscene, "time_limit");
+ }
+ else {
+    /* For the viewport it makes more sense to think in terms of the noise floor, which is
+     * usually higher than the acceptable level for the final frame. */
+    /* TODO: It might be useful to support a time limit in the viewport as well, but this needs
+     * some extra thought and input. */
+ params.time_limit = 0.0;
}
+ /* Profiling. */
params.use_profiling = params.device.has_profiling && !b_engine.is_preview() && background &&
BlenderSession::print_render_stats;
+ if (background) {
+ params.use_auto_tile = RNA_boolean_get(&cscene, "use_auto_tile");
+ params.tile_size = get_int(cscene, "tile_size");
+ }
+ else {
+ params.use_auto_tile = false;
+ }
+
return params;
}
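/* Editor's sketch (not part of this commit): the sample-count selection above,
 * condensed into one expression. A preview value of 0 means "unlimited" and is
 * clamped to Integrator::MAX_SAMPLES, matching the code in this hunk. */
static int session_samples_example(const bool background, const int samples, const int preview_samples)
{
  const int chosen = background ? samples : (preview_samples == 0 ? INT_MAX : preview_samples);
  return min(chosen, Integrator::MAX_SAMPLES);
}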
@@ -970,33 +868,34 @@ DenoiseParams BlenderSync::get_denoise_params(BL::Scene &b_scene,
BL::ViewLayer &b_view_layer,
bool background)
{
+ enum DenoiserInput {
+ DENOISER_INPUT_RGB = 1,
+ DENOISER_INPUT_RGB_ALBEDO = 2,
+ DENOISER_INPUT_RGB_ALBEDO_NORMAL = 3,
+
+ DENOISER_INPUT_NUM,
+ };
+
DenoiseParams denoising;
PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
+ int input_passes = -1;
+
if (background) {
/* Final Render Denoising */
denoising.use = get_boolean(cscene, "use_denoising");
denoising.type = (DenoiserType)get_enum(cscene, "denoiser", DENOISER_NUM, DENOISER_NONE);
+ denoising.prefilter = (DenoiserPrefilter)get_enum(
+ cscene, "denoising_prefilter", DENOISER_PREFILTER_NUM, DENOISER_PREFILTER_NONE);
+
+ input_passes = (DenoiserInput)get_enum(
+ cscene, "denoising_input_passes", DENOISER_INPUT_NUM, DENOISER_INPUT_RGB_ALBEDO_NORMAL);
if (b_view_layer) {
PointerRNA clayer = RNA_pointer_get(&b_view_layer.ptr, "cycles");
if (!get_boolean(clayer, "use_denoising")) {
denoising.use = false;
}
-
- denoising.radius = get_int(clayer, "denoising_radius");
- denoising.strength = get_float(clayer, "denoising_strength");
- denoising.feature_strength = get_float(clayer, "denoising_feature_strength");
- denoising.relative_pca = get_boolean(clayer, "denoising_relative_pca");
-
- denoising.input_passes = (DenoiserInput)get_enum(
- clayer,
- (denoising.type == DENOISER_OPTIX) ? "denoising_optix_input_passes" :
- "denoising_openimagedenoise_input_passes",
- DENOISER_INPUT_NUM,
- DENOISER_INPUT_RGB_ALBEDO_NORMAL);
-
- denoising.store_passes = get_boolean(clayer, "denoising_store_passes");
}
}
else {
@@ -1004,10 +903,12 @@ DenoiseParams BlenderSync::get_denoise_params(BL::Scene &b_scene,
denoising.use = get_boolean(cscene, "use_preview_denoising");
denoising.type = (DenoiserType)get_enum(
cscene, "preview_denoiser", DENOISER_NUM, DENOISER_NONE);
+ denoising.prefilter = (DenoiserPrefilter)get_enum(
+ cscene, "preview_denoising_prefilter", DENOISER_PREFILTER_NUM, DENOISER_PREFILTER_FAST);
denoising.start_sample = get_int(cscene, "preview_denoising_start_sample");
- denoising.input_passes = (DenoiserInput)get_enum(
- cscene, "preview_denoising_input_passes", DENOISER_INPUT_NUM, (int)denoising.input_passes);
+ input_passes = (DenoiserInput)get_enum(
+ cscene, "preview_denoising_input_passes", DENOISER_INPUT_NUM, DENOISER_INPUT_RGB_ALBEDO);
/* Auto select fastest denoiser. */
if (denoising.type == DENOISER_NONE) {
@@ -1023,6 +924,27 @@ DenoiseParams BlenderSync::get_denoise_params(BL::Scene &b_scene,
}
}
+ switch (input_passes) {
+ case DENOISER_INPUT_RGB:
+ denoising.use_pass_albedo = false;
+ denoising.use_pass_normal = false;
+ break;
+
+ case DENOISER_INPUT_RGB_ALBEDO:
+ denoising.use_pass_albedo = true;
+ denoising.use_pass_normal = false;
+ break;
+
+ case DENOISER_INPUT_RGB_ALBEDO_NORMAL:
+ denoising.use_pass_albedo = true;
+ denoising.use_pass_normal = true;
+ break;
+
+ default:
+ LOG(ERROR) << "Unhandled input passes enum " << input_passes;
+ break;
+ }
+
return denoising;
}
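/* Editor's sketch (not part of this commit): the input-passes switch above is
 * equivalent to deriving the two flags from the ordering of the enum values
 * (1 = RGB, 2 = RGB+albedo, 3 = RGB+albedo+normal). */
static void denoiser_input_passes_example(DenoiseParams &denoising, const int input_passes)
{
  denoising.use_pass_albedo = (input_passes >= 2); /* DENOISER_INPUT_RGB_ALBEDO */
  denoising.use_pass_normal = (input_passes >= 3); /* DENOISER_INPUT_RGB_ALBEDO_NORMAL */
}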
diff --git a/intern/cycles/blender/blender_sync.h b/intern/cycles/blender/blender_sync.h
index d25c0ce1bc3..786479ac0f8 100644
--- a/intern/cycles/blender/blender_sync.h
+++ b/intern/cycles/blender/blender_sync.h
@@ -60,6 +60,7 @@ class BlenderSync {
BL::Scene &b_scene,
Scene *scene,
bool preview,
+ bool use_developer_ui,
Progress &progress);
~BlenderSync();
@@ -75,12 +76,8 @@ class BlenderSync {
int height,
void **python_thread_state);
void sync_view_layer(BL::ViewLayer &b_view_layer);
- vector<Pass> sync_render_passes(BL::Scene &b_scene,
- BL::RenderLayer &b_render_layer,
- BL::ViewLayer &b_view_layer,
- bool adaptive_sampling,
- const DenoiseParams &denoising);
- void sync_integrator();
+ void sync_render_passes(BL::RenderLayer &b_render_layer, BL::ViewLayer &b_view_layer);
+ void sync_integrator(BL::ViewLayer &b_view_layer, bool background);
void sync_camera(BL::RenderSettings &b_render,
BL::Object &b_override,
int width,
@@ -98,22 +95,13 @@ class BlenderSync {
/* get parameters */
static SceneParams get_scene_params(BL::Scene &b_scene, bool background);
- static SessionParams get_session_params(
- BL::RenderEngine &b_engine,
- BL::Preferences &b_userpref,
- BL::Scene &b_scene,
- bool background,
- BL::ViewLayer b_view_layer = BL::ViewLayer(PointerRNA_NULL));
+ static SessionParams get_session_params(BL::RenderEngine &b_engine,
+ BL::Preferences &b_userpref,
+ BL::Scene &b_scene,
+ bool background);
static bool get_session_pause(BL::Scene &b_scene, bool background);
- static BufferParams get_buffer_params(BL::SpaceView3D &b_v3d,
- BL::RegionView3D &b_rv3d,
- Camera *cam,
- int width,
- int height,
- const bool use_denoiser);
-
- static PassType get_pass_type(BL::RenderPass &b_pass);
- static int get_denoising_pass(BL::RenderPass &b_pass);
+ static BufferParams get_buffer_params(
+ BL::SpaceView3D &b_v3d, BL::RegionView3D &b_rv3d, Camera *cam, int width, int height);
private:
static DenoiseParams get_denoise_params(BL::Scene &b_scene,
@@ -131,7 +119,7 @@ class BlenderSync {
int width,
int height,
void **python_thread_state);
- void sync_film(BL::SpaceView3D &b_v3d);
+ void sync_film(BL::ViewLayer &b_view_layer, BL::SpaceView3D &b_v3d);
void sync_view();
/* Shader */
@@ -245,6 +233,7 @@ class BlenderSync {
Scene *scene;
bool preview;
bool experimental;
+ bool use_developer_ui;
float dicing_rate;
int max_subdivisions;
@@ -253,7 +242,6 @@ class BlenderSync {
RenderLayerInfo()
: material_override(PointerRNA_NULL),
use_background_shader(true),
- use_background_ao(true),
use_surfaces(true),
use_hair(true),
use_volumes(true),
@@ -266,7 +254,6 @@ class BlenderSync {
string name;
BL::Material material_override;
bool use_background_shader;
- bool use_background_ao;
bool use_surfaces;
bool use_hair;
bool use_volumes;
diff --git a/intern/cycles/blender/blender_viewport.cpp b/intern/cycles/blender/blender_viewport.cpp
index 18bdfc74de0..62e32240bba 100644
--- a/intern/cycles/blender/blender_viewport.cpp
+++ b/intern/cycles/blender/blender_viewport.cpp
@@ -17,6 +17,8 @@
#include "blender_viewport.h"
#include "blender_util.h"
+#include "render/pass.h"
+#include "util/util_logging.h"
CCL_NAMESPACE_BEGIN
@@ -26,11 +28,12 @@ BlenderViewportParameters::BlenderViewportParameters()
studiolight_rotate_z(0.0f),
studiolight_intensity(1.0f),
studiolight_background_alpha(1.0f),
- display_pass(PASS_COMBINED)
+ display_pass(PASS_COMBINED),
+ show_active_pixels(false)
{
}
-BlenderViewportParameters::BlenderViewportParameters(BL::SpaceView3D &b_v3d)
+BlenderViewportParameters::BlenderViewportParameters(BL::SpaceView3D &b_v3d, bool use_developer_ui)
: BlenderViewportParameters()
{
if (!b_v3d) {
@@ -55,7 +58,25 @@ BlenderViewportParameters::BlenderViewportParameters(BL::SpaceView3D &b_v3d)
}
/* Film. */
- display_pass = (PassType)get_enum(cshading, "render_pass", -1, -1);
+
+  /* Look up the display pass based on the enum identifier.
+   * This is needed because the integer values of the Python enum are not aligned with the pass
+   * definitions in the kernel. */
+
+ display_pass = PASS_COMBINED;
+
+ const string display_pass_identifier = get_enum_identifier(cshading, "render_pass");
+ if (!display_pass_identifier.empty()) {
+ const ustring pass_type_identifier(string_to_lower(display_pass_identifier));
+ const NodeEnum *pass_type_enum = Pass::get_type_enum();
+ if (pass_type_enum->exists(pass_type_identifier)) {
+ display_pass = static_cast<PassType>((*pass_type_enum)[pass_type_identifier]);
+ }
+ }
+
+ if (use_developer_ui) {
+ show_active_pixels = get_boolean(cshading, "show_active_pixels");
+ }
}
bool BlenderViewportParameters::shader_modified(const BlenderViewportParameters &other) const
@@ -69,7 +90,7 @@ bool BlenderViewportParameters::shader_modified(const BlenderViewportParameters
bool BlenderViewportParameters::film_modified(const BlenderViewportParameters &other) const
{
- return display_pass != other.display_pass;
+ return display_pass != other.display_pass || show_active_pixels != other.show_active_pixels;
}
bool BlenderViewportParameters::modified(const BlenderViewportParameters &other) const
@@ -82,18 +103,4 @@ bool BlenderViewportParameters::use_custom_shader() const
return !(use_scene_world && use_scene_lights);
}
-PassType update_viewport_display_passes(BL::SpaceView3D &b_v3d, vector<Pass> &passes)
-{
- if (b_v3d) {
- const BlenderViewportParameters viewport_parameters(b_v3d);
- const PassType display_pass = viewport_parameters.display_pass;
-
- passes.clear();
- Pass::add(display_pass, passes);
-
- return display_pass;
- }
- return PASS_NONE;
-}
-
CCL_NAMESPACE_END
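
A minimal standalone sketch of the name-based pass lookup introduced above (not part of the patch; the helper name pass_type_from_identifier is hypothetical, the called functions are the ones used in the hunk):

    /* Illustrative sketch only: map a Python enum identifier such as "COMBINED"
     * to a kernel PassType, falling back to PASS_COMBINED for unknown names. */
    static PassType pass_type_from_identifier(const string &identifier)
    {
      if (identifier.empty()) {
        return PASS_COMBINED;
      }
      const ustring lowercase_identifier(string_to_lower(identifier));
      const NodeEnum *pass_type_enum = Pass::get_type_enum();
      if (pass_type_enum->exists(lowercase_identifier)) {
        return static_cast<PassType>((*pass_type_enum)[lowercase_identifier]);
      }
      return PASS_COMBINED;
    }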
diff --git a/intern/cycles/blender/blender_viewport.h b/intern/cycles/blender/blender_viewport.h
index d6518597053..b5adafc30c9 100644
--- a/intern/cycles/blender/blender_viewport.h
+++ b/intern/cycles/blender/blender_viewport.h
@@ -39,9 +39,10 @@ class BlenderViewportParameters {
/* Film. */
PassType display_pass;
+ bool show_active_pixels;
BlenderViewportParameters();
- explicit BlenderViewportParameters(BL::SpaceView3D &b_v3d);
+ BlenderViewportParameters(BL::SpaceView3D &b_v3d, bool use_developer_ui);
/* Check whether any of shading related settings are different from the given parameters. */
bool shader_modified(const BlenderViewportParameters &other) const;
@@ -57,8 +58,6 @@ class BlenderViewportParameters {
bool use_custom_shader() const;
};
-PassType update_viewport_display_passes(BL::SpaceView3D &b_v3d, vector<Pass> &passes);
-
CCL_NAMESPACE_END
#endif
diff --git a/intern/cycles/bvh/bvh_build.cpp b/intern/cycles/bvh/bvh_build.cpp
index 048c2b95e40..d3497f3a8d8 100644
--- a/intern/cycles/bvh/bvh_build.cpp
+++ b/intern/cycles/bvh/bvh_build.cpp
@@ -832,18 +832,18 @@ BVHNode *BVHBuild::create_leaf_node(const BVHRange &range, const vector<BVHRefer
typedef StackAllocator<256, float2> LeafTimeStackAllocator;
typedef StackAllocator<256, BVHReference> LeafReferenceStackAllocator;
- vector<int, LeafStackAllocator> p_type[PRIMITIVE_NUM_TOTAL];
- vector<int, LeafStackAllocator> p_index[PRIMITIVE_NUM_TOTAL];
- vector<int, LeafStackAllocator> p_object[PRIMITIVE_NUM_TOTAL];
- vector<float2, LeafTimeStackAllocator> p_time[PRIMITIVE_NUM_TOTAL];
- vector<BVHReference, LeafReferenceStackAllocator> p_ref[PRIMITIVE_NUM_TOTAL];
+ vector<int, LeafStackAllocator> p_type[PRIMITIVE_NUM];
+ vector<int, LeafStackAllocator> p_index[PRIMITIVE_NUM];
+ vector<int, LeafStackAllocator> p_object[PRIMITIVE_NUM];
+ vector<float2, LeafTimeStackAllocator> p_time[PRIMITIVE_NUM];
+ vector<BVHReference, LeafReferenceStackAllocator> p_ref[PRIMITIVE_NUM];
/* TODO(sergey): In theory we should be able to store references. */
vector<BVHReference, LeafReferenceStackAllocator> object_references;
- uint visibility[PRIMITIVE_NUM_TOTAL] = {0};
+ uint visibility[PRIMITIVE_NUM] = {0};
/* NOTE: Keep initialization in sync with actual number of primitives. */
- BoundBox bounds[PRIMITIVE_NUM_TOTAL] = {
+ BoundBox bounds[PRIMITIVE_NUM] = {
BoundBox::empty, BoundBox::empty, BoundBox::empty, BoundBox::empty};
int ob_num = 0;
int num_new_prims = 0;
@@ -877,7 +877,7 @@ BVHNode *BVHBuild::create_leaf_node(const BVHRange &range, const vector<BVHRefer
* TODO(sergey): With some pointer trickery we can write directly to the
* destination buffers for the non-spatial split BVH.
*/
- BVHNode *leaves[PRIMITIVE_NUM_TOTAL + 1] = {NULL};
+ BVHNode *leaves[PRIMITIVE_NUM + 1] = {NULL};
int num_leaves = 0;
size_t start_index = 0;
vector<int, LeafStackAllocator> local_prim_type, local_prim_index, local_prim_object;
@@ -888,7 +888,7 @@ BVHNode *BVHBuild::create_leaf_node(const BVHRange &range, const vector<BVHRefer
if (need_prim_time) {
local_prim_time.resize(num_new_prims);
}
- for (int i = 0; i < PRIMITIVE_NUM_TOTAL; ++i) {
+ for (int i = 0; i < PRIMITIVE_NUM; ++i) {
int num = (int)p_type[i].size();
if (num != 0) {
assert(p_type[i].size() == p_index[i].size());
diff --git a/intern/cycles/bvh/bvh_embree.cpp b/intern/cycles/bvh/bvh_embree.cpp
index 62f543941a9..96852510b63 100644
--- a/intern/cycles/bvh/bvh_embree.cpp
+++ b/intern/cycles/bvh/bvh_embree.cpp
@@ -37,10 +37,10 @@
/* Kernel includes are necessary so that the filter function for Embree can access the packed BVH.
*/
# include "kernel/bvh/bvh_embree.h"
-# include "kernel/kernel_compat_cpu.h"
-# include "kernel/kernel_globals.h"
+# include "kernel/bvh/bvh_util.h"
+# include "kernel/device/cpu/compat.h"
+# include "kernel/device/cpu/globals.h"
# include "kernel/kernel_random.h"
-# include "kernel/split/kernel_split_data_types.h"
# include "render/hair.h"
# include "render/mesh.h"
@@ -73,46 +73,69 @@ static void rtc_filter_occluded_func(const RTCFilterFunctionNArguments *args)
const RTCRay *ray = (RTCRay *)args->ray;
RTCHit *hit = (RTCHit *)args->hit;
CCLIntersectContext *ctx = ((IntersectContext *)args->context)->userRayExt;
- KernelGlobals *kg = ctx->kg;
+ const KernelGlobals *kg = ctx->kg;
switch (ctx->type) {
case CCLIntersectContext::RAY_SHADOW_ALL: {
- /* Append the intersection to the end of the array. */
- if (ctx->num_hits < ctx->max_hits) {
- Intersection current_isect;
- kernel_embree_convert_hit(kg, ray, hit, &current_isect);
- for (size_t i = 0; i < ctx->max_hits; ++i) {
+ Intersection current_isect;
+ kernel_embree_convert_hit(kg, ray, hit, &current_isect);
+
+ /* If no transparent shadows, all light is blocked. */
+ const int flags = intersection_get_shader_flags(kg, &current_isect);
+ if (!(flags & (SD_HAS_TRANSPARENT_SHADOW)) || ctx->max_hits == 0) {
+ ctx->opaque_hit = true;
+ return;
+ }
+
+ /* Test if we need to record this transparent intersection. */
+ if (ctx->num_hits < ctx->max_hits || ray->tfar < ctx->max_t) {
+ /* Skip already recorded intersections. */
+ int num_recorded_hits = min(ctx->num_hits, ctx->max_hits);
+
+ for (int i = 0; i < num_recorded_hits; ++i) {
if (current_isect.object == ctx->isect_s[i].object &&
current_isect.prim == ctx->isect_s[i].prim && current_isect.t == ctx->isect_s[i].t) {
/* This intersection was already recorded, skip it. */
*args->valid = 0;
- break;
+ return;
}
}
- Intersection *isect = &ctx->isect_s[ctx->num_hits];
- ++ctx->num_hits;
- *isect = current_isect;
- int prim = kernel_tex_fetch(__prim_index, isect->prim);
- int shader = 0;
- if (kernel_tex_fetch(__prim_type, isect->prim) & PRIMITIVE_ALL_TRIANGLE) {
- shader = kernel_tex_fetch(__tri_shader, prim);
- }
- else {
- float4 str = kernel_tex_fetch(__curves, prim);
- shader = __float_as_int(str.z);
- }
- int flag = kernel_tex_fetch(__shaders, shader & SHADER_MASK).flags;
- /* If no transparent shadows, all light is blocked. */
- if (flag & (SD_HAS_TRANSPARENT_SHADOW)) {
- /* This tells Embree to continue tracing. */
- *args->valid = 0;
+
+ /* If the maximum number of hits was reached, replace the recorded intersection with the
+ * highest distance, since we want to keep the N closest intersections. */
+ int isect_index = num_recorded_hits;
+ if (num_recorded_hits + 1 >= ctx->max_hits) {
+ float max_t = ctx->isect_s[0].t;
+ int max_recorded_hit = 0;
+
+ for (int i = 1; i < num_recorded_hits; ++i) {
+ if (ctx->isect_s[i].t > max_t) {
+ max_recorded_hit = i;
+ max_t = ctx->isect_s[i].t;
+ }
+ }
+
+ if (num_recorded_hits >= ctx->max_hits) {
+ isect_index = max_recorded_hit;
+ }
+
+ /* Limit the ray distance and stop counting hits beyond this.
+ * TODO: is there some way we can tell Embree to stop intersecting beyond
+ * this distance once the maximum number of hits is reached? Or maybe it will
+ * become irrelevant if we make max_hits a very high number on the CPU. */
+ ctx->max_t = max(current_isect.t, max_t);
}
+
+ ctx->isect_s[isect_index] = current_isect;
}
- else {
- /* Increase the number of hits beyond ray.max_hits
- * so that the caller can detect this as opaque. */
- ++ctx->num_hits;
- }
+
+ /* Always increase the number of hits, even beyond ray.max_hits, so that
+ * the caller can detect this and either consider the hit opaque or trace
+ * another ray. */
+ ++ctx->num_hits;
+
+ /* This tells Embree to continue tracing. */
+ *args->valid = 0;
break;
}
case CCLIntersectContext::RAY_LOCAL:
@@ -329,7 +352,7 @@ void BVHEmbree::build(Progress &progress, Stats *stats, RTCDevice rtc_device_)
scene = NULL;
}
- const bool dynamic = params.bvh_type == SceneParams::BVH_DYNAMIC;
+ const bool dynamic = params.bvh_type == BVH_TYPE_DYNAMIC;
scene = rtcNewScene(rtc_device);
const RTCSceneFlags scene_flags = (dynamic ? RTC_SCENE_FLAG_DYNAMIC : RTC_SCENE_FLAG_NONE) |
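
The shadow filter callback above keeps only the N closest transparent hits by replacing the farthest recorded entry once the array is full. A compact, self-contained sketch of that bookkeeping (not part of the patch; the names record_closest/recorded are hypothetical):

    #include <algorithm>

    /* Illustrative sketch only: keep the N closest hit distances in a fixed-size
     * array, replacing the farthest recorded entry once the array is full. This
     * mirrors what the callback does with ctx->isect_s. */
    static void record_closest(float recorded[], int &num_recorded, const int capacity, const float t)
    {
      if (num_recorded < capacity) {
        recorded[num_recorded++] = t;
        return;
      }
      /* Array is full: replace the farthest recorded distance if the new hit is closer. */
      float *farthest = std::max_element(recorded, recorded + capacity);
      if (t < *farthest) {
        *farthest = t;
      }
    }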
diff --git a/intern/cycles/bvh/bvh_params.h b/intern/cycles/bvh/bvh_params.h
index 2dc10f30363..31b3971c110 100644
--- a/intern/cycles/bvh/bvh_params.h
+++ b/intern/cycles/bvh/bvh_params.h
@@ -31,6 +31,27 @@ CCL_NAMESPACE_BEGIN
*/
typedef KernelBVHLayout BVHLayout;
+/* Type of BVH, in terms of whether it supports dynamic updates of meshes
+ * or whether modifying geometry requires a full BVH rebuild.
+ */
+enum BVHType {
+ /* BVH supports dynamic updates of geometry.
+ *
+ * Faster for updating the BVH tree when modifying geometry in the viewport,
+ * but slower for rendering.
+ */
+ BVH_TYPE_DYNAMIC = 0,
+ /* BVH tree is calculated for a specific scene; updates to the geometry
+ * require a full tree rebuild.
+ *
+ * Slower to update the BVH tree when modifying objects in the viewport, and also
+ * slower to build the final BVH tree, but gives the best possible render speed.
+ */
+ BVH_TYPE_STATIC = 1,
+
+ BVH_NUM_TYPES,
+};
+
/* Names bitflag type to denote which BVH layouts are supported by
* particular area.
*
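
A minimal sketch of how the new enum is meant to be consumed, assuming a params struct with a bvh_type field as used in the BVHEmbree::build hunk above (not part of the patch; the helper name is hypothetical):

    /* Illustrative sketch only: pick the BVH type depending on whether geometry
     * is being edited interactively or rendered for final output. */
    BVHType choose_bvh_type(const bool interactive_viewport)
    {
      /* Dynamic BVHs update faster when geometry changes; static BVHs trace faster. */
      return interactive_viewport ? BVH_TYPE_DYNAMIC : BVH_TYPE_STATIC;
    }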
diff --git a/intern/cycles/cmake/external_libs.cmake b/intern/cycles/cmake/external_libs.cmake
index 04ff598621a..da259171844 100644
--- a/intern/cycles/cmake/external_libs.cmake
+++ b/intern/cycles/cmake/external_libs.cmake
@@ -287,9 +287,6 @@ if(CYCLES_STANDALONE_REPOSITORY)
endif()
set(__boost_packages filesystem regex system thread date_time)
- if(WITH_CYCLES_NETWORK)
- list(APPEND __boost_packages serialization)
- endif()
if(WITH_CYCLES_OSL)
list(APPEND __boost_packages wave)
endif()
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index 928249931a3..d18f4360aef 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -36,49 +36,70 @@ endif()
set(SRC
device.cpp
- device_cpu.cpp
- device_cuda.cpp
- device_denoising.cpp
- device_dummy.cpp
+ device_denoise.cpp
+ device_graphics_interop.cpp
+ device_kernel.cpp
device_memory.cpp
- device_multi.cpp
- device_opencl.cpp
- device_optix.cpp
- device_split_kernel.cpp
- device_task.cpp
+ device_queue.cpp
+)
+
+set(SRC_CPU
+ cpu/device.cpp
+ cpu/device.h
+ cpu/device_impl.cpp
+ cpu/device_impl.h
+ cpu/kernel.cpp
+ cpu/kernel.h
+ cpu/kernel_function.h
+ cpu/kernel_thread_globals.cpp
+ cpu/kernel_thread_globals.h
)
set(SRC_CUDA
- cuda/device_cuda.h
- cuda/device_cuda_impl.cpp
+ cuda/device.cpp
+ cuda/device.h
+ cuda/device_impl.cpp
+ cuda/device_impl.h
+ cuda/graphics_interop.cpp
+ cuda/graphics_interop.h
+ cuda/kernel.cpp
+ cuda/kernel.h
+ cuda/queue.cpp
+ cuda/queue.h
+ cuda/util.cpp
+ cuda/util.h
)
-set(SRC_OPENCL
- opencl/device_opencl.h
- opencl/device_opencl_impl.cpp
- opencl/memory_manager.h
- opencl/memory_manager.cpp
- opencl/opencl_util.cpp
+set(SRC_DUMMY
+ dummy/device.cpp
+ dummy/device.h
)
-if(WITH_CYCLES_NETWORK)
- list(APPEND SRC
- device_network.cpp
- )
-endif()
+set(SRC_MULTI
+ multi/device.cpp
+ multi/device.h
+)
+
+set(SRC_OPTIX
+ optix/device.cpp
+ optix/device.h
+ optix/device_impl.cpp
+ optix/device_impl.h
+ optix/queue.cpp
+ optix/queue.h
+ optix/util.h
+)
set(SRC_HEADERS
device.h
- device_denoising.h
+ device_denoise.h
+ device_graphics_interop.h
device_memory.h
- device_intern.h
- device_network.h
- device_split_kernel.h
- device_task.h
+ device_kernel.h
+ device_queue.h
)
set(LIB
- cycles_render
cycles_kernel
cycles_util
${CYCLES_GL_LIBRARIES}
@@ -95,15 +116,7 @@ else()
endif()
add_definitions(${GL_DEFINITIONS})
-if(WITH_CYCLES_NETWORK)
- add_definitions(-DWITH_NETWORK)
-endif()
-if(WITH_CYCLES_DEVICE_OPENCL)
- list(APPEND LIB
- extern_clew
- )
- add_definitions(-DWITH_OPENCL)
-endif()
+
if(WITH_CYCLES_DEVICE_CUDA)
add_definitions(-DWITH_CUDA)
endif()
@@ -115,18 +128,27 @@ if(WITH_CYCLES_DEVICE_MULTI)
endif()
if(WITH_OPENIMAGEDENOISE)
- add_definitions(-DWITH_OPENIMAGEDENOISE)
- add_definitions(-DOIDN_STATIC_LIB)
- list(APPEND INC_SYS
- ${OPENIMAGEDENOISE_INCLUDE_DIRS}
- )
list(APPEND LIB
${OPENIMAGEDENOISE_LIBRARIES}
- ${TBB_LIBRARIES}
)
endif()
include_directories(${INC})
include_directories(SYSTEM ${INC_SYS})
-cycles_add_library(cycles_device "${LIB}" ${SRC} ${SRC_CUDA} ${SRC_OPENCL} ${SRC_HEADERS})
+cycles_add_library(cycles_device "${LIB}"
+ ${SRC}
+ ${SRC_CPU}
+ ${SRC_CUDA}
+ ${SRC_DUMMY}
+ ${SRC_MULTI}
+ ${SRC_OPTIX}
+ ${SRC_HEADERS}
+)
+
+source_group("cpu" FILES ${SRC_CPU})
+source_group("cuda" FILES ${SRC_CUDA})
+source_group("dummy" FILES ${SRC_DUMMY})
+source_group("multi" FILES ${SRC_MULTI})
+source_group("optix" FILES ${SRC_OPTIX})
+source_group("common" FILES ${SRC} ${SRC_HEADERS})
diff --git a/intern/cycles/device/cpu/device.cpp b/intern/cycles/device/cpu/device.cpp
new file mode 100644
index 00000000000..68ca8e8bb22
--- /dev/null
+++ b/intern/cycles/device/cpu/device.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/cpu/device.h"
+#include "device/cpu/device_impl.h"
+
+/* Used for `info.denoisers`. */
+/* TODO(sergey): The denoisers should probably be moved completely out of the device into their
+ * own class. But until then, keep the API consistent with how it used to work before. */
+#include "util/util_openimagedenoise.h"
+
+CCL_NAMESPACE_BEGIN
+
+Device *device_cpu_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+{
+ return new CPUDevice(info, stats, profiler);
+}
+
+void device_cpu_info(vector<DeviceInfo> &devices)
+{
+ DeviceInfo info;
+
+ info.type = DEVICE_CPU;
+ info.description = system_cpu_brand_string();
+ info.id = "CPU";
+ info.num = 0;
+ info.has_osl = true;
+ info.has_half_images = true;
+ info.has_nanovdb = true;
+ info.has_profiling = true;
+ if (openimagedenoise_supported()) {
+ info.denoisers |= DENOISER_OPENIMAGEDENOISE;
+ }
+
+ devices.insert(devices.begin(), info);
+}
+
+string device_cpu_capabilities()
+{
+ string capabilities = "";
+ capabilities += system_cpu_support_sse2() ? "SSE2 " : "";
+ capabilities += system_cpu_support_sse3() ? "SSE3 " : "";
+ capabilities += system_cpu_support_sse41() ? "SSE41 " : "";
+ capabilities += system_cpu_support_avx() ? "AVX " : "";
+ capabilities += system_cpu_support_avx2() ? "AVX2" : "";
+ if (!capabilities.empty() && capabilities[capabilities.size() - 1] == ' ')
+ capabilities.resize(capabilities.size() - 1);
+ return capabilities;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl b/intern/cycles/device/cpu/device.h
index dcea2630aef..9cb2e80068d 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl
+++ b/intern/cycles/device/cpu/device.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2011-2015 Blender Foundation
+ * Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,13 +14,22 @@
* limitations under the License.
*/
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_buffer_update.h"
+#pragma once
-#define KERNEL_NAME buffer_update
-#define LOCALS_TYPE unsigned int
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-#undef LOCALS_TYPE
+#include "util/util_string.h"
+#include "util/util_vector.h"
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class DeviceInfo;
+class Profiler;
+class Stats;
+
+Device *device_cpu_create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+void device_cpu_info(vector<DeviceInfo> &devices);
+
+string device_cpu_capabilities();
+
+CCL_NAMESPACE_END
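
The per-backend entry points declared above are meant to be called from the generic device layer. A minimal usage sketch (not part of the patch; the helper name create_first_cpu_device is hypothetical):

    /* Illustrative sketch only: enumerate CPU devices and instantiate the first one. */
    Device *create_first_cpu_device(Stats &stats, Profiler &profiler)
    {
      vector<DeviceInfo> devices;
      device_cpu_info(devices);

      if (devices.empty()) {
        return nullptr;
      }

      return device_cpu_create(devices[0], stats, profiler);
    }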
diff --git a/intern/cycles/device/cpu/device_impl.cpp b/intern/cycles/device/cpu/device_impl.cpp
new file mode 100644
index 00000000000..3b0db6bdd0e
--- /dev/null
+++ b/intern/cycles/device/cpu/device_impl.cpp
@@ -0,0 +1,481 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/cpu/device_impl.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+/* So ImathMath is included before our kernel_cpu_compat. */
+#ifdef WITH_OSL
+/* So no context pollution happens from indirectly included windows.h */
+# include "util/util_windows.h"
+# include <OSL/oslexec.h>
+#endif
+
+#ifdef WITH_EMBREE
+# include <embree3/rtcore.h>
+#endif
+
+#include "device/cpu/kernel.h"
+#include "device/cpu/kernel_thread_globals.h"
+
+#include "device/device.h"
+
+// clang-format off
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/globals.h"
+#include "kernel/device/cpu/kernel.h"
+#include "kernel/kernel_types.h"
+
+#include "kernel/osl/osl_shader.h"
+#include "kernel/osl/osl_globals.h"
+// clang-format on
+
+#include "bvh/bvh_embree.h"
+
+#include "render/buffers.h"
+
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_function.h"
+#include "util/util_logging.h"
+#include "util/util_map.h"
+#include "util/util_opengl.h"
+#include "util/util_openimagedenoise.h"
+#include "util/util_optimization.h"
+#include "util/util_progress.h"
+#include "util/util_system.h"
+#include "util/util_task.h"
+#include "util/util_thread.h"
+
+CCL_NAMESPACE_BEGIN
+
+CPUDevice::CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_)
+ : Device(info_, stats_, profiler_), texture_info(this, "__texture_info", MEM_GLOBAL)
+{
+ /* Pick any kernel; all of them are supposed to have the same level of microarchitecture
+ * optimization. */
+ VLOG(1) << "Will be using " << kernels.integrator_init_from_camera.get_uarch_name()
+ << " kernels.";
+
+ if (info.cpu_threads == 0) {
+ info.cpu_threads = TaskScheduler::num_threads();
+ }
+
+#ifdef WITH_OSL
+ kernel_globals.osl = &osl_globals;
+#endif
+#ifdef WITH_EMBREE
+ embree_device = rtcNewDevice("verbose=0");
+#endif
+ need_texture_info = false;
+}
+
+CPUDevice::~CPUDevice()
+{
+#ifdef WITH_EMBREE
+ rtcReleaseDevice(embree_device);
+#endif
+
+ texture_info.free();
+}
+
+bool CPUDevice::show_samples() const
+{
+ return (info.cpu_threads == 1);
+}
+
+BVHLayoutMask CPUDevice::get_bvh_layout_mask() const
+{
+ BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2;
+#ifdef WITH_EMBREE
+ bvh_layout_mask |= BVH_LAYOUT_EMBREE;
+#endif /* WITH_EMBREE */
+ return bvh_layout_mask;
+}
+
+bool CPUDevice::load_texture_info()
+{
+ if (!need_texture_info) {
+ return false;
+ }
+
+ texture_info.copy_to_device();
+ need_texture_info = false;
+
+ return true;
+}
+
+void CPUDevice::mem_alloc(device_memory &mem)
+{
+ if (mem.type == MEM_TEXTURE) {
+ assert(!"mem_alloc not supported for textures.");
+ }
+ else if (mem.type == MEM_GLOBAL) {
+ assert(!"mem_alloc not supported for global memory.");
+ }
+ else {
+ if (mem.name) {
+ VLOG(1) << "Buffer allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+ }
+
+ if (mem.type == MEM_DEVICE_ONLY) {
+ assert(!mem.host_pointer);
+ size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES;
+ void *data = util_aligned_malloc(mem.memory_size(), alignment);
+ mem.device_pointer = (device_ptr)data;
+ }
+ else {
+ mem.device_pointer = (device_ptr)mem.host_pointer;
+ }
+
+ mem.device_size = mem.memory_size();
+ stats.mem_alloc(mem.device_size);
+ }
+}
+
+void CPUDevice::mem_copy_to(device_memory &mem)
+{
+ if (mem.type == MEM_GLOBAL) {
+ global_free(mem);
+ global_alloc(mem);
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ tex_free((device_texture &)mem);
+ tex_alloc((device_texture &)mem);
+ }
+ else {
+ if (!mem.device_pointer) {
+ mem_alloc(mem);
+ }
+
+ /* copy is no-op */
+ }
+}
+
+void CPUDevice::mem_copy_from(
+ device_memory & /*mem*/, int /*y*/, int /*w*/, int /*h*/, int /*elem*/)
+{
+ /* no-op */
+}
+
+void CPUDevice::mem_zero(device_memory &mem)
+{
+ if (!mem.device_pointer) {
+ mem_alloc(mem);
+ }
+
+ if (mem.device_pointer) {
+ memset((void *)mem.device_pointer, 0, mem.memory_size());
+ }
+}
+
+void CPUDevice::mem_free(device_memory &mem)
+{
+ if (mem.type == MEM_GLOBAL) {
+ global_free(mem);
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ tex_free((device_texture &)mem);
+ }
+ else if (mem.device_pointer) {
+ if (mem.type == MEM_DEVICE_ONLY) {
+ util_aligned_free((void *)mem.device_pointer);
+ }
+ mem.device_pointer = 0;
+ stats.mem_free(mem.device_size);
+ mem.device_size = 0;
+ }
+}
+
+device_ptr CPUDevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/)
+{
+ return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
+}
+
+void CPUDevice::const_copy_to(const char *name, void *host, size_t size)
+{
+#ifdef WITH_EMBREE
+ if (strcmp(name, "__data") == 0) {
+ assert(size <= sizeof(KernelData));
+
+ // Update the scene handle (since it is different for each device in a multi-device setup)
+ KernelData *const data = (KernelData *)host;
+ data->bvh.scene = embree_scene;
+ }
+#endif
+ kernel_const_copy(&kernel_globals, name, host, size);
+}
+
+void CPUDevice::global_alloc(device_memory &mem)
+{
+ VLOG(1) << "Global memory allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+
+ kernel_global_memory_copy(&kernel_globals, mem.name, mem.host_pointer, mem.data_size);
+
+ mem.device_pointer = (device_ptr)mem.host_pointer;
+ mem.device_size = mem.memory_size();
+ stats.mem_alloc(mem.device_size);
+}
+
+void CPUDevice::global_free(device_memory &mem)
+{
+ if (mem.device_pointer) {
+ mem.device_pointer = 0;
+ stats.mem_free(mem.device_size);
+ mem.device_size = 0;
+ }
+}
+
+void CPUDevice::tex_alloc(device_texture &mem)
+{
+ VLOG(1) << "Texture allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+
+ mem.device_pointer = (device_ptr)mem.host_pointer;
+ mem.device_size = mem.memory_size();
+ stats.mem_alloc(mem.device_size);
+
+ const uint slot = mem.slot;
+ if (slot >= texture_info.size()) {
+ /* Allocate some slots in advance, to reduce the number of re-allocations. */
+ texture_info.resize(slot + 128);
+ }
+
+ texture_info[slot] = mem.info;
+ texture_info[slot].data = (uint64_t)mem.host_pointer;
+ need_texture_info = true;
+}
+
+void CPUDevice::tex_free(device_texture &mem)
+{
+ if (mem.device_pointer) {
+ mem.device_pointer = 0;
+ stats.mem_free(mem.device_size);
+ mem.device_size = 0;
+ need_texture_info = true;
+ }
+}
+
+void CPUDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
+{
+#ifdef WITH_EMBREE
+ if (bvh->params.bvh_layout == BVH_LAYOUT_EMBREE ||
+ bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE) {
+ BVHEmbree *const bvh_embree = static_cast<BVHEmbree *>(bvh);
+ if (refit) {
+ bvh_embree->refit(progress);
+ }
+ else {
+ bvh_embree->build(progress, &stats, embree_device);
+ }
+
+ if (bvh->params.top_level) {
+ embree_scene = bvh_embree->scene;
+ }
+ }
+ else
+#endif
+ Device::build_bvh(bvh, progress, refit);
+}
+
+#if 0
+void CPUDevice::render(DeviceTask &task, RenderTile &tile, KernelGlobals *kg)
+{
+ const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE;
+
+ scoped_timer timer(&tile.buffers->render_time);
+
+ Coverage coverage(kg, tile);
+ if (use_coverage) {
+ coverage.init_path_trace();
+ }
+
+ float *render_buffer = (float *)tile.buffer;
+ int start_sample = tile.start_sample;
+ int end_sample = tile.start_sample + tile.num_samples;
+
+ /* Needed for Embree. */
+ SIMD_SET_FLUSH_TO_ZERO;
+
+ for (int sample = start_sample; sample < end_sample; sample++) {
+ if (task.get_cancel() || TaskPool::canceled()) {
+ if (task.need_finish_queue == false)
+ break;
+ }
+
+ if (tile.stealing_state == RenderTile::CAN_BE_STOLEN && task.get_tile_stolen()) {
+ tile.stealing_state = RenderTile::WAS_STOLEN;
+ break;
+ }
+
+ if (tile.task == RenderTile::PATH_TRACE) {
+ for (int y = tile.y; y < tile.y + tile.h; y++) {
+ for (int x = tile.x; x < tile.x + tile.w; x++) {
+ if (use_coverage) {
+ coverage.init_pixel(x, y);
+ }
+ kernels.path_trace(kg, render_buffer, sample, x, y, tile.offset, tile.stride);
+ }
+ }
+ }
+ else {
+ for (int y = tile.y; y < tile.y + tile.h; y++) {
+ for (int x = tile.x; x < tile.x + tile.w; x++) {
+ kernels.bake(kg, render_buffer, sample, x, y, tile.offset, tile.stride);
+ }
+ }
+ }
+ tile.sample = sample + 1;
+
+ if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(sample)) {
+ const bool stop = adaptive_sampling_filter(kg, tile, sample);
+ if (stop) {
+ const int num_progress_samples = end_sample - sample;
+ tile.sample = end_sample;
+ task.update_progress(&tile, tile.w * tile.h * num_progress_samples);
+ break;
+ }
+ }
+
+ task.update_progress(&tile, tile.w * tile.h);
+ }
+ if (use_coverage) {
+ coverage.finalize();
+ }
+
+ if (task.adaptive_sampling.use && (tile.stealing_state != RenderTile::WAS_STOLEN)) {
+ adaptive_sampling_post(tile, kg);
+ }
+}
+
+void CPUDevice::thread_render(DeviceTask &task)
+{
+ if (TaskPool::canceled()) {
+ if (task.need_finish_queue == false)
+ return;
+ }
+
+ /* allocate buffer for kernel globals */
+ CPUKernelThreadGlobals kg(kernel_globals, get_cpu_osl_memory());
+
+ profiler.add_state(&kg.profiler);
+
+ /* NLM denoiser. */
+ DenoisingTask *denoising = NULL;
+
+ /* OpenImageDenoise: we can only denoise with one thread at a time, so to
+ * avoid waiting with mutex locks in the denoiser, we let only a single
+ * thread acquire denoising tiles. */
+ uint tile_types = task.tile_types;
+ bool hold_denoise_lock = false;
+ if ((tile_types & RenderTile::DENOISE) && task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
+ if (!oidn_task_lock.try_lock()) {
+ tile_types &= ~RenderTile::DENOISE;
+ hold_denoise_lock = true;
+ }
+ }
+
+ RenderTile tile;
+ while (task.acquire_tile(this, tile, tile_types)) {
+ if (tile.task == RenderTile::PATH_TRACE) {
+ render(task, tile, &kg);
+ }
+ else if (tile.task == RenderTile::BAKE) {
+ render(task, tile, &kg);
+ }
+ else if (tile.task == RenderTile::DENOISE) {
+ denoise_openimagedenoise(task, tile);
+ task.update_progress(&tile, tile.w * tile.h);
+ }
+
+ task.release_tile(tile);
+
+ if (TaskPool::canceled()) {
+ if (task.need_finish_queue == false)
+ break;
+ }
+ }
+
+ if (hold_denoise_lock) {
+ oidn_task_lock.unlock();
+ }
+
+ profiler.remove_state(&kg.profiler);
+
+ delete denoising;
+}
+
+void CPUDevice::thread_denoise(DeviceTask &task)
+{
+ RenderTile tile;
+ tile.x = task.x;
+ tile.y = task.y;
+ tile.w = task.w;
+ tile.h = task.h;
+ tile.buffer = task.buffer;
+ tile.sample = task.sample + task.num_samples;
+ tile.num_samples = task.num_samples;
+ tile.start_sample = task.sample;
+ tile.offset = task.offset;
+ tile.stride = task.stride;
+ tile.buffers = task.buffers;
+
+ denoise_openimagedenoise(task, tile);
+
+ task.update_progress(&tile, tile.w * tile.h);
+}
+#endif
+
+const CPUKernels *CPUDevice::get_cpu_kernels() const
+{
+ return &kernels;
+}
+
+void CPUDevice::get_cpu_kernel_thread_globals(
+ vector<CPUKernelThreadGlobals> &kernel_thread_globals)
+{
+ /* Ensure latest texture info is loaded into kernel globals before returning. */
+ load_texture_info();
+
+ kernel_thread_globals.clear();
+ void *osl_memory = get_cpu_osl_memory();
+ for (int i = 0; i < info.cpu_threads; i++) {
+ kernel_thread_globals.emplace_back(kernel_globals, osl_memory, profiler);
+ }
+}
+
+void *CPUDevice::get_cpu_osl_memory()
+{
+#ifdef WITH_OSL
+ return &osl_globals;
+#else
+ return NULL;
+#endif
+}
+
+bool CPUDevice::load_kernels(const uint /*kernel_features*/)
+{
+ return true;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/cpu/device_impl.h b/intern/cycles/device/cpu/device_impl.h
new file mode 100644
index 00000000000..7d222808652
--- /dev/null
+++ b/intern/cycles/device/cpu/device_impl.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+/* So ImathMath is included before our kernel_cpu_compat. */
+#ifdef WITH_OSL
+/* So no context pollution happens from indirectly included windows.h */
+# include "util/util_windows.h"
+# include <OSL/oslexec.h>
+#endif
+
+#ifdef WITH_EMBREE
+# include <embree3/rtcore.h>
+#endif
+
+#include "device/cpu/kernel.h"
+#include "device/device.h"
+#include "device/device_memory.h"
+
+// clang-format off
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/kernel.h"
+#include "kernel/device/cpu/globals.h"
+
+#include "kernel/osl/osl_shader.h"
+#include "kernel/osl/osl_globals.h"
+// clang-format on
+
+CCL_NAMESPACE_BEGIN
+
+class CPUDevice : public Device {
+ public:
+ KernelGlobals kernel_globals;
+
+ device_vector<TextureInfo> texture_info;
+ bool need_texture_info;
+
+#ifdef WITH_OSL
+ OSLGlobals osl_globals;
+#endif
+#ifdef WITH_EMBREE
+ RTCScene embree_scene = NULL;
+ RTCDevice embree_device;
+#endif
+
+ CPUKernels kernels;
+
+ CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_);
+ ~CPUDevice();
+
+ virtual bool show_samples() const override;
+
+ virtual BVHLayoutMask get_bvh_layout_mask() const override;
+
+ /* Returns true if the texture info was copied to the device (meaning, some more
+ * re-initialization might be needed). */
+ bool load_texture_info();
+
+ virtual void mem_alloc(device_memory &mem) override;
+ virtual void mem_copy_to(device_memory &mem) override;
+ virtual void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override;
+ virtual void mem_zero(device_memory &mem) override;
+ virtual void mem_free(device_memory &mem) override;
+ virtual device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override;
+
+ virtual void const_copy_to(const char *name, void *host, size_t size) override;
+
+ void global_alloc(device_memory &mem);
+ void global_free(device_memory &mem);
+
+ void tex_alloc(device_texture &mem);
+ void tex_free(device_texture &mem);
+
+ void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
+
+ virtual const CPUKernels *get_cpu_kernels() const override;
+ virtual void get_cpu_kernel_thread_globals(
+ vector<CPUKernelThreadGlobals> &kernel_thread_globals) override;
+ virtual void *get_cpu_osl_memory() override;
+
+ protected:
+ virtual bool load_kernels(uint /*kernel_features*/) override;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/cpu/kernel.cpp b/intern/cycles/device/cpu/kernel.cpp
new file mode 100644
index 00000000000..91282390e27
--- /dev/null
+++ b/intern/cycles/device/cpu/kernel.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/cpu/kernel.h"
+
+#include "kernel/device/cpu/kernel.h"
+
+CCL_NAMESPACE_BEGIN
+
+#define KERNEL_FUNCTIONS(name) \
+ KERNEL_NAME_EVAL(cpu, name), KERNEL_NAME_EVAL(cpu_sse2, name), \
+ KERNEL_NAME_EVAL(cpu_sse3, name), KERNEL_NAME_EVAL(cpu_sse41, name), \
+ KERNEL_NAME_EVAL(cpu_avx, name), KERNEL_NAME_EVAL(cpu_avx2, name)
+
+#define REGISTER_KERNEL(name) name(KERNEL_FUNCTIONS(name))
+
+CPUKernels::CPUKernels()
+ : /* Integrator. */
+ REGISTER_KERNEL(integrator_init_from_camera),
+ REGISTER_KERNEL(integrator_init_from_bake),
+ REGISTER_KERNEL(integrator_intersect_closest),
+ REGISTER_KERNEL(integrator_intersect_shadow),
+ REGISTER_KERNEL(integrator_intersect_subsurface),
+ REGISTER_KERNEL(integrator_intersect_volume_stack),
+ REGISTER_KERNEL(integrator_shade_background),
+ REGISTER_KERNEL(integrator_shade_light),
+ REGISTER_KERNEL(integrator_shade_shadow),
+ REGISTER_KERNEL(integrator_shade_surface),
+ REGISTER_KERNEL(integrator_shade_volume),
+ REGISTER_KERNEL(integrator_megakernel),
+ /* Shader evaluation. */
+ REGISTER_KERNEL(shader_eval_displace),
+ REGISTER_KERNEL(shader_eval_background),
+ /* Adaptive sampling. */
+ REGISTER_KERNEL(adaptive_sampling_convergence_check),
+ REGISTER_KERNEL(adaptive_sampling_filter_x),
+ REGISTER_KERNEL(adaptive_sampling_filter_y),
+ /* Cryptomatte. */
+ REGISTER_KERNEL(cryptomatte_postprocess),
+ /* Bake. */
+ REGISTER_KERNEL(bake)
+{
+}
+
+#undef REGISTER_KERNEL
+#undef KERNEL_FUNCTIONS
+
+CCL_NAMESPACE_END
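
For readability, REGISTER_KERNEL(name) pairs each CPUKernels member with the full list of its microarchitecture variants. An approximate expansion for one member follows (not part of the patch; the exact symbol names are produced by KERNEL_NAME_EVAL in kernel/device/cpu/kernel.h, so the names below are an assumption):

    /* Approximate expansion of REGISTER_KERNEL(integrator_megakernel) in the
     * initializer list above; symbol names are assumed, not verified. */
    integrator_megakernel(kernel_cpu_integrator_megakernel,
                          kernel_cpu_sse2_integrator_megakernel,
                          kernel_cpu_sse3_integrator_megakernel,
                          kernel_cpu_sse41_integrator_megakernel,
                          kernel_cpu_avx_integrator_megakernel,
                          kernel_cpu_avx2_integrator_megakernel)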
diff --git a/intern/cycles/device/cpu/kernel.h b/intern/cycles/device/cpu/kernel.h
new file mode 100644
index 00000000000..54b18308544
--- /dev/null
+++ b/intern/cycles/device/cpu/kernel.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "device/cpu/kernel_function.h"
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct KernelGlobals;
+struct IntegratorStateCPU;
+struct TileInfo;
+
+class CPUKernels {
+ public:
+ /* Integrator. */
+
+ using IntegratorFunction =
+ CPUKernelFunction<void (*)(const KernelGlobals *kg, IntegratorStateCPU *state)>;
+ using IntegratorShadeFunction = CPUKernelFunction<void (*)(
+ const KernelGlobals *kg, IntegratorStateCPU *state, ccl_global float *render_buffer)>;
+ using IntegratorInitFunction = CPUKernelFunction<bool (*)(const KernelGlobals *kg,
+ IntegratorStateCPU *state,
+ KernelWorkTile *tile,
+ ccl_global float *render_buffer)>;
+
+ IntegratorInitFunction integrator_init_from_camera;
+ IntegratorInitFunction integrator_init_from_bake;
+ IntegratorFunction integrator_intersect_closest;
+ IntegratorFunction integrator_intersect_shadow;
+ IntegratorFunction integrator_intersect_subsurface;
+ IntegratorFunction integrator_intersect_volume_stack;
+ IntegratorShadeFunction integrator_shade_background;
+ IntegratorShadeFunction integrator_shade_light;
+ IntegratorShadeFunction integrator_shade_shadow;
+ IntegratorShadeFunction integrator_shade_surface;
+ IntegratorShadeFunction integrator_shade_volume;
+ IntegratorShadeFunction integrator_megakernel;
+
+ /* Shader evaluation. */
+
+ using ShaderEvalFunction = CPUKernelFunction<void (*)(
+ const KernelGlobals *kg, const KernelShaderEvalInput *, float4 *, const int)>;
+
+ ShaderEvalFunction shader_eval_displace;
+ ShaderEvalFunction shader_eval_background;
+
+ /* Adaptive stopping. */
+
+ using AdaptiveSamplingConvergenceCheckFunction =
+ CPUKernelFunction<bool (*)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int y,
+ float threshold,
+ bool reset,
+ int offset,
+ int stride)>;
+
+ using AdaptiveSamplingFilterXFunction =
+ CPUKernelFunction<void (*)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int y,
+ int start_x,
+ int width,
+ int offset,
+ int stride)>;
+
+ using AdaptiveSamplingFilterYFunction =
+ CPUKernelFunction<void (*)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int start_y,
+ int height,
+ int offset,
+ int stride)>;
+
+ AdaptiveSamplingConvergenceCheckFunction adaptive_sampling_convergence_check;
+
+ AdaptiveSamplingFilterXFunction adaptive_sampling_filter_x;
+ AdaptiveSamplingFilterYFunction adaptive_sampling_filter_y;
+
+ /* Cryptomatte. */
+
+ using CryptomattePostprocessFunction = CPUKernelFunction<void (*)(
+ const KernelGlobals *kg, ccl_global float *render_buffer, int pixel_index)>;
+
+ CryptomattePostprocessFunction cryptomatte_postprocess;
+
+ /* Bake. */
+
+ CPUKernelFunction<void (*)(const KernelGlobals *, float *, int, int, int, int, int)> bake;
+
+ CPUKernels();
+};
+
+CCL_NAMESPACE_END
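
A minimal sketch of driving one integrator state through the wrapped kernels declared above (not part of the patch; the function and variable names are hypothetical, the member signatures are the ones in CPUKernels):

    /* Illustrative sketch only: initialize a path from the camera and run the
     * megakernel for it. */
    void trace_one_path(const CPUKernels &kernels,
                        const KernelGlobals *kg,
                        IntegratorStateCPU *state,
                        KernelWorkTile *tile,
                        float *render_buffer)
    {
      if (kernels.integrator_init_from_camera(kg, state, tile, render_buffer)) {
        kernels.integrator_megakernel(kg, state, render_buffer);
      }
    }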
diff --git a/intern/cycles/device/cpu/kernel_function.h b/intern/cycles/device/cpu/kernel_function.h
new file mode 100644
index 00000000000..aa18720cc24
--- /dev/null
+++ b/intern/cycles/device/cpu/kernel_function.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "util/util_debug.h"
+#include "util/util_system.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* A wrapper around the per-microarchitecture variants of a kernel function.
+ *
+ * Provides a function-call-like API which gets routed to the most suitable implementation.
+ *
+ * For example, on a computer which only supports SSE4.1, the kernel_sse41 variant will be used. */
+template<typename FunctionType> class CPUKernelFunction {
+ public:
+ CPUKernelFunction(FunctionType kernel_default,
+ FunctionType kernel_sse2,
+ FunctionType kernel_sse3,
+ FunctionType kernel_sse41,
+ FunctionType kernel_avx,
+ FunctionType kernel_avx2)
+ {
+ kernel_info_ = get_best_kernel_info(
+ kernel_default, kernel_sse2, kernel_sse3, kernel_sse41, kernel_avx, kernel_avx2);
+ }
+
+ template<typename... Args> inline auto operator()(Args... args) const
+ {
+ assert(kernel_info_.kernel);
+
+ return kernel_info_.kernel(args...);
+ }
+
+ const char *get_uarch_name() const
+ {
+ return kernel_info_.uarch_name;
+ }
+
+ protected:
+ /* Helper class which allows passing a human-readable microarchitecture name together with the
+ * function pointer. */
+ class KernelInfo {
+ public:
+ KernelInfo() : KernelInfo("", nullptr)
+ {
+ }
+
+ /* TODO(sergey): Use string view, to have higher-level functionality (i.e. comparison) without
+ * memory allocation. */
+ KernelInfo(const char *uarch_name, FunctionType kernel)
+ : uarch_name(uarch_name), kernel(kernel)
+ {
+ }
+
+ const char *uarch_name;
+ FunctionType kernel;
+ };
+
+ KernelInfo get_best_kernel_info(FunctionType kernel_default,
+ FunctionType kernel_sse2,
+ FunctionType kernel_sse3,
+ FunctionType kernel_sse41,
+ FunctionType kernel_avx,
+ FunctionType kernel_avx2)
+ {
+ /* Silence warnings about unused variables when compiling without some architectures. */
+ (void)kernel_sse2;
+ (void)kernel_sse3;
+ (void)kernel_sse41;
+ (void)kernel_avx;
+ (void)kernel_avx2;
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+ if (DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) {
+ return KernelInfo("AVX2", kernel_avx2);
+ }
+#endif
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+ if (DebugFlags().cpu.has_avx() && system_cpu_support_avx()) {
+ return KernelInfo("AVX", kernel_avx);
+ }
+#endif
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+ if (DebugFlags().cpu.has_sse41() && system_cpu_support_sse41()) {
+ return KernelInfo("SSE4.1", kernel_sse41);
+ }
+#endif
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+ if (DebugFlags().cpu.has_sse3() && system_cpu_support_sse3()) {
+ return KernelInfo("SSE3", kernel_sse3);
+ }
+#endif
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+ if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) {
+ return KernelInfo("SSE2", kernel_sse2);
+ }
+#endif
+
+ return KernelInfo("default", kernel_default);
+ }
+
+ KernelInfo kernel_info_;
+};
+
+CCL_NAMESPACE_END
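
A small self-contained sketch of wrapping a function with CPUKernelFunction (not part of the patch; all names here are hypothetical, and every variant slot points at the same implementation so the dispatcher simply picks whichever microarchitecture the host supports):

    /* Illustrative sketch only: a trivial "kernel" wrapped in CPUKernelFunction. */
    static void fill_buffer_reference(float *data, int size)
    {
      for (int i = 0; i < size; i++) {
        data[i] = 0.0f;
      }
    }

    using FillFunction = CPUKernelFunction<void (*)(float *, int)>;

    static FillFunction fill_buffer(fill_buffer_reference,  /* default */
                                    fill_buffer_reference,  /* sse2 */
                                    fill_buffer_reference,  /* sse3 */
                                    fill_buffer_reference,  /* sse41 */
                                    fill_buffer_reference,  /* avx */
                                    fill_buffer_reference); /* avx2 */

    /* Calls route to the best available variant:
     *   fill_buffer(buffer, 1024);
     *   VLOG(1) << "Using " << fill_buffer.get_uarch_name() << " kernels."; */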
diff --git a/intern/cycles/device/cpu/kernel_thread_globals.cpp b/intern/cycles/device/cpu/kernel_thread_globals.cpp
new file mode 100644
index 00000000000..988b00cd1f0
--- /dev/null
+++ b/intern/cycles/device/cpu/kernel_thread_globals.cpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/cpu/kernel_thread_globals.h"
+
+// clang-format off
+#include "kernel/osl/osl_shader.h"
+#include "kernel/osl/osl_globals.h"
+// clang-format on
+
+#include "util/util_profiling.h"
+
+CCL_NAMESPACE_BEGIN
+
+CPUKernelThreadGlobals::CPUKernelThreadGlobals(const KernelGlobals &kernel_globals,
+ void *osl_globals_memory,
+ Profiler &cpu_profiler)
+ : KernelGlobals(kernel_globals), cpu_profiler_(cpu_profiler)
+{
+ reset_runtime_memory();
+
+#ifdef WITH_OSL
+ OSLShader::thread_init(this, reinterpret_cast<OSLGlobals *>(osl_globals_memory));
+#else
+ (void)osl_globals_memory;
+#endif
+}
+
+CPUKernelThreadGlobals::CPUKernelThreadGlobals(CPUKernelThreadGlobals &&other) noexcept
+ : KernelGlobals(std::move(other)), cpu_profiler_(other.cpu_profiler_)
+{
+ other.reset_runtime_memory();
+}
+
+CPUKernelThreadGlobals::~CPUKernelThreadGlobals()
+{
+#ifdef WITH_OSL
+ OSLShader::thread_free(this);
+#endif
+}
+
+CPUKernelThreadGlobals &CPUKernelThreadGlobals::operator=(CPUKernelThreadGlobals &&other)
+{
+ if (this == &other) {
+ return *this;
+ }
+
+ *static_cast<KernelGlobals *>(this) = *static_cast<KernelGlobals *>(&other);
+
+ other.reset_runtime_memory();
+
+ return *this;
+}
+
+void CPUKernelThreadGlobals::reset_runtime_memory()
+{
+#ifdef WITH_OSL
+ osl = nullptr;
+#endif
+}
+
+void CPUKernelThreadGlobals::start_profiling()
+{
+ cpu_profiler_.add_state(&profiler);
+}
+
+void CPUKernelThreadGlobals::stop_profiling()
+{
+ cpu_profiler_.remove_state(&profiler);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/cpu/kernel_thread_globals.h b/intern/cycles/device/cpu/kernel_thread_globals.h
new file mode 100644
index 00000000000..d005c3bb56c
--- /dev/null
+++ b/intern/cycles/device/cpu/kernel_thread_globals.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/globals.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Profiler;
+
+/* A special class which extends memory ownership of `KernelGlobals`, decoupling any resource
+ * which is not thread-safe to access. Every worker thread which needs to operate on
+ * `KernelGlobals` needs to initialize its own copy of this object.
+ *
+ * NOTE: Only a minimal subset of objects is copied: `KernelData` is never copied. This means
+ * that there is no unnecessary data duplication happening when using this object. */
+class CPUKernelThreadGlobals : public KernelGlobals {
+ public:
+ /* TODO(sergey): Would be nice to have a properly typed OSLGlobals even when building
+ * without OSL support. That would avoid the need for these unnamed pointers and casts. */
+ CPUKernelThreadGlobals(const KernelGlobals &kernel_globals,
+ void *osl_globals_memory,
+ Profiler &cpu_profiler);
+
+ ~CPUKernelThreadGlobals();
+
+ CPUKernelThreadGlobals(const CPUKernelThreadGlobals &other) = delete;
+ CPUKernelThreadGlobals(CPUKernelThreadGlobals &&other) noexcept;
+
+ CPUKernelThreadGlobals &operator=(const CPUKernelThreadGlobals &other) = delete;
+ CPUKernelThreadGlobals &operator=(CPUKernelThreadGlobals &&other);
+
+ void start_profiling();
+ void stop_profiling();
+
+ protected:
+ void reset_runtime_memory();
+
+ Profiler &cpu_profiler_;
+};
+
+CCL_NAMESPACE_END
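
A minimal sketch of the intended per-thread usage, assuming a CPUDevice pointer and using CPUDevice::get_cpu_kernel_thread_globals() from this patch (not part of the patch; in real code each entry would be handed to its own worker thread rather than iterated serially):

    /* Illustrative sketch only: one CPUKernelThreadGlobals per worker thread,
     * with profiling bracketed by the hooks declared above. */
    static void run_on_all_threads(CPUDevice *cpu_device)
    {
      vector<CPUKernelThreadGlobals> thread_globals;
      cpu_device->get_cpu_kernel_thread_globals(thread_globals);

      for (CPUKernelThreadGlobals &kg : thread_globals) {
        kg.start_profiling();
        /* ... run kernels with &kg as this thread's KernelGlobals ... */
        kg.stop_profiling();
      }
    }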
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/cuda/device.cpp
index 2e225ecfaf8..84becd6d081 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/cuda/device.cpp
@@ -14,21 +14,25 @@
* limitations under the License.
*/
-#ifdef WITH_CUDA
+#include "device/cuda/device.h"
+
+#include "util/util_logging.h"
-# include "device/cuda/device_cuda.h"
+#ifdef WITH_CUDA
+# include "device/cuda/device_impl.h"
# include "device/device.h"
-# include "device/device_intern.h"
-# include "util/util_logging.h"
# include "util/util_string.h"
# include "util/util_windows.h"
+#endif /* WITH_CUDA */
CCL_NAMESPACE_BEGIN
bool device_cuda_init()
{
-# ifdef WITH_CUDA_DYNLOAD
+#if !defined(WITH_CUDA)
+ return false;
+#elif defined(WITH_CUDA_DYNLOAD)
static bool initialized = false;
static bool result = false;
@@ -59,16 +63,27 @@ bool device_cuda_init()
}
return result;
-# else /* WITH_CUDA_DYNLOAD */
+#else /* WITH_CUDA_DYNLOAD */
return true;
-# endif /* WITH_CUDA_DYNLOAD */
+#endif /* WITH_CUDA_DYNLOAD */
}
-Device *device_cuda_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
+Device *device_cuda_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
{
- return new CUDADevice(info, stats, profiler, background);
+#ifdef WITH_CUDA
+ return new CUDADevice(info, stats, profiler);
+#else
+ (void)info;
+ (void)stats;
+ (void)profiler;
+
+ LOG(FATAL) << "Request to create CUDA device without compiled-in support. Should never happen.";
+
+ return nullptr;
+#endif
}
+#ifdef WITH_CUDA
static CUresult device_cuda_safe_init()
{
# ifdef _WIN32
@@ -86,9 +101,11 @@ static CUresult device_cuda_safe_init()
return cuInit(0);
# endif
}
+#endif /* WITH_CUDA */
void device_cuda_info(vector<DeviceInfo> &devices)
{
+#ifdef WITH_CUDA
CUresult result = device_cuda_safe_init();
if (result != CUDA_SUCCESS) {
if (result != CUDA_ERROR_NO_DEVICE)
@@ -129,9 +146,9 @@ void device_cuda_info(vector<DeviceInfo> &devices)
info.has_half_images = (major >= 3);
info.has_nanovdb = true;
- info.has_volume_decoupled = false;
- info.has_adaptive_stop_per_sample = false;
- info.denoisers = DENOISER_NLM;
+ info.denoisers = 0;
+
+ info.has_gpu_queue = true;
/* Check if the device has P2P access to any other device in the system. */
for (int peer_num = 0; peer_num < count && !info.has_peer_memory; peer_num++) {
@@ -182,10 +199,14 @@ void device_cuda_info(vector<DeviceInfo> &devices)
if (!display_devices.empty())
devices.insert(devices.end(), display_devices.begin(), display_devices.end());
+#else /* WITH_CUDA */
+ (void)devices;
+#endif /* WITH_CUDA */
}
string device_cuda_capabilities()
{
+#ifdef WITH_CUDA
CUresult result = device_cuda_safe_init();
if (result != CUDA_SUCCESS) {
if (result != CUDA_ERROR_NO_DEVICE) {
@@ -310,8 +331,10 @@ string device_cuda_capabilities()
}
return capabilities;
+
+#else /* WITH_CUDA */
+ return "";
+#endif /* WITH_CUDA */
}
CCL_NAMESPACE_END
-
-#endif
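
The expected call pattern for the reworked CUDA entry points, where device_cuda_init() must succeed before enumeration or creation is attempted (a sketch, not part of the patch; the helper name is hypothetical):

    /* Illustrative sketch only: guarded CUDA device enumeration and creation. */
    Device *create_first_cuda_device(Stats &stats, Profiler &profiler)
    {
      if (!device_cuda_init()) {
        return nullptr;
      }

      vector<DeviceInfo> devices;
      device_cuda_info(devices);
      if (devices.empty()) {
        return nullptr;
      }

      return device_cuda_create(devices[0], stats, profiler);
    }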
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl b/intern/cycles/device/cuda/device.h
index e68d4104a91..b0484904d1a 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl
+++ b/intern/cycles/device/cuda/device.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2011-2017 Blender Foundation
+ * Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,13 +14,24 @@
* limitations under the License.
*/
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_enqueue_inactive.h"
+#pragma once
-#define KERNEL_NAME enqueue_inactive
-#define LOCALS_TYPE unsigned int
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-#undef LOCALS_TYPE
+#include "util/util_string.h"
+#include "util/util_vector.h"
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class DeviceInfo;
+class Profiler;
+class Stats;
+
+bool device_cuda_init();
+
+Device *device_cuda_create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+void device_cuda_info(vector<DeviceInfo> &devices);
+
+string device_cuda_capabilities();
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/cuda/device_cuda.h b/intern/cycles/device/cuda/device_cuda.h
deleted file mode 100644
index c3271c3cfcf..00000000000
--- a/intern/cycles/device/cuda/device_cuda.h
+++ /dev/null
@@ -1,270 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_CUDA
-
-# include "device/device.h"
-# include "device/device_denoising.h"
-# include "device/device_split_kernel.h"
-
-# include "util/util_map.h"
-# include "util/util_task.h"
-
-# ifdef WITH_CUDA_DYNLOAD
-# include "cuew.h"
-# else
-# include "util/util_opengl.h"
-# include <cuda.h>
-# include <cudaGL.h>
-# endif
-
-CCL_NAMESPACE_BEGIN
-
-class CUDASplitKernel;
-
-class CUDADevice : public Device {
-
- friend class CUDASplitKernelFunction;
- friend class CUDASplitKernel;
- friend class CUDAContextScope;
-
- public:
- DedicatedTaskPool task_pool;
- CUdevice cuDevice;
- CUcontext cuContext;
- CUmodule cuModule, cuFilterModule;
- size_t device_texture_headroom;
- size_t device_working_headroom;
- bool move_texture_to_host;
- size_t map_host_used;
- size_t map_host_limit;
- int can_map_host;
- int pitch_alignment;
- int cuDevId;
- int cuDevArchitecture;
- bool first_error;
- CUDASplitKernel *split_kernel;
-
- struct CUDAMem {
- CUDAMem() : texobject(0), array(0), use_mapped_host(false)
- {
- }
-
- CUtexObject texobject;
- CUarray array;
-
- /* If true, a mapped host memory in shared_pointer is being used. */
- bool use_mapped_host;
- };
- typedef map<device_memory *, CUDAMem> CUDAMemMap;
- CUDAMemMap cuda_mem_map;
- thread_mutex cuda_mem_map_mutex;
-
- struct PixelMem {
- GLuint cuPBO;
- CUgraphicsResource cuPBOresource;
- GLuint cuTexId;
- int w, h;
- };
- map<device_ptr, PixelMem> pixel_mem_map;
-
- /* Bindless Textures */
- device_vector<TextureInfo> texture_info;
- bool need_texture_info;
-
- /* Kernels */
- struct {
- bool loaded;
-
- CUfunction adaptive_stopping;
- CUfunction adaptive_filter_x;
- CUfunction adaptive_filter_y;
- CUfunction adaptive_scale_samples;
- int adaptive_num_threads_per_block;
- } functions;
-
- static bool have_precompiled_kernels();
-
- virtual bool show_samples() const override;
-
- virtual BVHLayoutMask get_bvh_layout_mask() const override;
-
- void set_error(const string &error) override;
-
- CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_);
-
- virtual ~CUDADevice();
-
- bool support_device(const DeviceRequestedFeatures & /*requested_features*/);
-
- bool check_peer_access(Device *peer_device) override;
-
- bool use_adaptive_compilation();
-
- bool use_split_kernel();
-
- virtual string compile_kernel_get_common_cflags(
- const DeviceRequestedFeatures &requested_features, bool filter = false, bool split = false);
-
- string compile_kernel(const DeviceRequestedFeatures &requested_features,
- const char *name,
- const char *base = "cuda",
- bool force_ptx = false);
-
- virtual bool load_kernels(const DeviceRequestedFeatures &requested_features) override;
-
- void load_functions();
-
- void reserve_local_memory(const DeviceRequestedFeatures &requested_features);
-
- void init_host_memory();
-
- void load_texture_info();
-
- void move_textures_to_host(size_t size, bool for_texture);
-
- CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
-
- void generic_copy_to(device_memory &mem);
-
- void generic_free(device_memory &mem);
-
- void mem_alloc(device_memory &mem) override;
-
- void mem_copy_to(device_memory &mem) override;
-
- void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override;
-
- void mem_zero(device_memory &mem) override;
-
- void mem_free(device_memory &mem) override;
-
- device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override;
-
- virtual void const_copy_to(const char *name, void *host, size_t size) override;
-
- void global_alloc(device_memory &mem);
-
- void global_free(device_memory &mem);
-
- void tex_alloc(device_texture &mem);
-
- void tex_free(device_texture &mem);
-
- bool denoising_non_local_means(device_ptr image_ptr,
- device_ptr guide_ptr,
- device_ptr variance_ptr,
- device_ptr out_ptr,
- DenoisingTask *task);
-
- bool denoising_construct_transform(DenoisingTask *task);
-
- bool denoising_accumulate(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr scale_ptr,
- int frame,
- DenoisingTask *task);
-
- bool denoising_solve(device_ptr output_ptr, DenoisingTask *task);
-
- bool denoising_combine_halves(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- int r,
- int4 rect,
- DenoisingTask *task);
-
- bool denoising_divide_shadow(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr sample_variance_ptr,
- device_ptr sv_variance_ptr,
- device_ptr buffer_variance_ptr,
- DenoisingTask *task);
-
- bool denoising_get_feature(int mean_offset,
- int variance_offset,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- float scale,
- DenoisingTask *task);
-
- bool denoising_write_feature(int out_offset,
- device_ptr from_ptr,
- device_ptr buffer_ptr,
- DenoisingTask *task);
-
- bool denoising_detect_outliers(device_ptr image_ptr,
- device_ptr variance_ptr,
- device_ptr depth_ptr,
- device_ptr output_ptr,
- DenoisingTask *task);
-
- void denoise(RenderTile &rtile, DenoisingTask &denoising);
-
- void adaptive_sampling_filter(uint filter_sample,
- WorkTile *wtile,
- CUdeviceptr d_wtile,
- CUstream stream = 0);
- void adaptive_sampling_post(RenderTile &rtile,
- WorkTile *wtile,
- CUdeviceptr d_wtile,
- CUstream stream = 0);
-
- void render(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles);
-
- void film_convert(DeviceTask &task,
- device_ptr buffer,
- device_ptr rgba_byte,
- device_ptr rgba_half);
-
- void shader(DeviceTask &task);
-
- CUdeviceptr map_pixels(device_ptr mem);
-
- void unmap_pixels(device_ptr mem);
-
- void pixels_alloc(device_memory &mem);
-
- void pixels_copy_from(device_memory &mem, int y, int w, int h);
-
- void pixels_free(device_memory &mem);
-
- void draw_pixels(device_memory &mem,
- int y,
- int w,
- int h,
- int width,
- int height,
- int dx,
- int dy,
- int dw,
- int dh,
- bool transparent,
- const DeviceDrawParams &draw_params) override;
-
- void thread_run(DeviceTask &task);
-
- virtual void task_add(DeviceTask &task) override;
-
- virtual void task_wait() override;
-
- virtual void task_cancel() override;
-};
-
-CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp b/intern/cycles/device/cuda/device_cuda_impl.cpp
deleted file mode 100644
index 2d2fcb38705..00000000000
--- a/intern/cycles/device/cuda/device_cuda_impl.cpp
+++ /dev/null
@@ -1,2714 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_CUDA
-
-# include <climits>
-# include <limits.h>
-# include <stdio.h>
-# include <stdlib.h>
-# include <string.h>
-
-# include "device/cuda/device_cuda.h"
-# include "device/device_intern.h"
-# include "device/device_split_kernel.h"
-
-# include "render/buffers.h"
-
-# include "kernel/filter/filter_defines.h"
-
-# include "util/util_debug.h"
-# include "util/util_foreach.h"
-# include "util/util_logging.h"
-# include "util/util_map.h"
-# include "util/util_md5.h"
-# include "util/util_opengl.h"
-# include "util/util_path.h"
-# include "util/util_string.h"
-# include "util/util_system.h"
-# include "util/util_time.h"
-# include "util/util_types.h"
-# include "util/util_windows.h"
-
-# include "kernel/split/kernel_split_data_types.h"
-
-CCL_NAMESPACE_BEGIN
-
-# ifndef WITH_CUDA_DYNLOAD
-
-/* Transparently implement some functions, so the majority of the file does not need
- * to worry about the difference between dynamically loaded and directly linked CUDA at all.
- */
-
-namespace {
-
-const char *cuewErrorString(CUresult result)
-{
- /* We can only give the error code here without major code duplication; that
- * should be enough, since dynamic loading is only disabled by folks
- * who know what they're doing anyway.
- *
- * NOTE: Avoid calling this from several threads.
- */
- static string error;
- error = string_printf("%d", result);
- return error.c_str();
-}
-
-const char *cuewCompilerPath()
-{
- return CYCLES_CUDA_NVCC_EXECUTABLE;
-}
-
-int cuewCompilerVersion()
-{
- return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10);
-}
-
-} /* namespace */
-# endif /* WITH_CUDA_DYNLOAD */
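
With the shims above, the rest of the file can call the cuew helpers uniformly whether CUDA is loaded at runtime or linked directly. A minimal sketch of that call pattern (a fragment only, assuming the includes already used by this file):

  CUresult result = cuInit(0);
  if (result != CUDA_SUCCESS) {
    /* Resolves to the cuew helper, or to the shim above when linked directly. */
    fprintf(stderr, "Failed to initialize CUDA: %s\n", cuewErrorString(result));
  }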
-
-class CUDADevice;
-
-class CUDASplitKernel : public DeviceSplitKernel {
- CUDADevice *device;
-
- public:
- explicit CUDASplitKernel(CUDADevice *device);
-
- virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads);
-
- virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim,
- RenderTile &rtile,
- int num_global_elements,
- device_memory &kernel_globals,
- device_memory &kernel_data_,
- device_memory &split_data,
- device_memory &ray_state,
- device_memory &queue_index,
- device_memory &use_queues_flag,
- device_memory &work_pool_wgs);
-
- virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name,
- const DeviceRequestedFeatures &);
- virtual int2 split_kernel_local_size();
- virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask &task);
-};
-
-/* Utility to push/pop CUDA context. */
-class CUDAContextScope {
- public:
- CUDAContextScope(CUDADevice *device);
- ~CUDAContextScope();
-
- private:
- CUDADevice *device;
-};
-
-bool CUDADevice::have_precompiled_kernels()
-{
- string cubins_path = path_get("lib");
- return path_exists(cubins_path);
-}
-
-bool CUDADevice::show_samples() const
-{
- /* The CUDADevice only processes one tile at a time, so showing samples is fine. */
- return true;
-}
-
-BVHLayoutMask CUDADevice::get_bvh_layout_mask() const
-{
- return BVH_LAYOUT_BVH2;
-}
-
-void CUDADevice::set_error(const string &error)
-{
- Device::set_error(error);
-
- if (first_error) {
- fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
- fprintf(stderr,
- "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n");
- first_error = false;
- }
-}
-
-# define cuda_assert(stmt) \
- { \
- CUresult result = stmt; \
- if (result != CUDA_SUCCESS) { \
- const char *name = cuewErrorString(result); \
- set_error(string_printf("%s in %s (device_cuda_impl.cpp:%d)", name, #stmt, __LINE__)); \
- } \
- } \
- (void)0
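
The cuda_assert() macro above evaluates a driver-API call once and, on failure, records a readable message (the statement text plus source line) through set_error() instead of aborting. A hedged usage sketch, valid inside a CUDADevice member function:

  cuda_assert(cuCtxSynchronize());
  if (have_error()) {
    return; /* The error string was already recorded by set_error(). */
  }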
-
-CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_)
- : Device(info, stats, profiler, background_), texture_info(this, "__texture_info", MEM_GLOBAL)
-{
- first_error = true;
- background = background_;
-
- cuDevId = info.num;
- cuDevice = 0;
- cuContext = 0;
-
- cuModule = 0;
- cuFilterModule = 0;
-
- split_kernel = NULL;
-
- need_texture_info = false;
-
- device_texture_headroom = 0;
- device_working_headroom = 0;
- move_texture_to_host = false;
- map_host_limit = 0;
- map_host_used = 0;
- can_map_host = 0;
- pitch_alignment = 0;
-
- functions.loaded = false;
-
- /* Initialize CUDA. */
- CUresult result = cuInit(0);
- if (result != CUDA_SUCCESS) {
- set_error(string_printf("Failed to initialize CUDA runtime (%s)", cuewErrorString(result)));
- return;
- }
-
- /* Setup device and context. */
- result = cuDeviceGet(&cuDevice, cuDevId);
- if (result != CUDA_SUCCESS) {
- set_error(string_printf("Failed to get CUDA device handle from ordinal (%s)",
- cuewErrorString(result)));
- return;
- }
-
- /* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
- * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
- * so we can predict which memory to map to host. */
- cuda_assert(
- cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
-
- cuda_assert(cuDeviceGetAttribute(
- &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
-
- unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX;
- if (can_map_host) {
- ctx_flags |= CU_CTX_MAP_HOST;
- init_host_memory();
- }
-
- /* Create context. */
- if (background) {
- result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
- }
- else {
- result = cuGLCtxCreate(&cuContext, ctx_flags, cuDevice);
-
- if (result != CUDA_SUCCESS) {
- result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
- background = true;
- }
- }
-
- if (result != CUDA_SUCCESS) {
- set_error(string_printf("Failed to create CUDA context (%s)", cuewErrorString(result)));
- return;
- }
-
- int major, minor;
- cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
- cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
- cuDevArchitecture = major * 100 + minor * 10;
-
- /* Pop context set by cuCtxCreate. */
- cuCtxPopCurrent(NULL);
-}
-
-CUDADevice::~CUDADevice()
-{
- task_pool.cancel();
-
- delete split_kernel;
-
- texture_info.free();
-
- cuda_assert(cuCtxDestroy(cuContext));
-}
-
-bool CUDADevice::support_device(const DeviceRequestedFeatures & /*requested_features*/)
-{
- int major, minor;
- cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
- cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
-
- /* We only support sm_30 and above */
- if (major < 3) {
- set_error(string_printf(
- "CUDA backend requires compute capability 3.0 or up, but found %d.%d.", major, minor));
- return false;
- }
-
- return true;
-}
-
-bool CUDADevice::check_peer_access(Device *peer_device)
-{
- if (peer_device == this) {
- return false;
- }
- if (peer_device->info.type != DEVICE_CUDA && peer_device->info.type != DEVICE_OPTIX) {
- return false;
- }
-
- CUDADevice *const peer_device_cuda = static_cast<CUDADevice *>(peer_device);
-
- int can_access = 0;
- cuda_assert(cuDeviceCanAccessPeer(&can_access, cuDevice, peer_device_cuda->cuDevice));
- if (can_access == 0) {
- return false;
- }
-
- // Ensure array access over the link is possible as well (for 3D textures)
- cuda_assert(cuDeviceGetP2PAttribute(&can_access,
- CU_DEVICE_P2P_ATTRIBUTE_ARRAY_ACCESS_ACCESS_SUPPORTED,
- cuDevice,
- peer_device_cuda->cuDevice));
- if (can_access == 0) {
- return false;
- }
-
- // Enable peer access in both directions
- {
- const CUDAContextScope scope(this);
- CUresult result = cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0);
- if (result != CUDA_SUCCESS) {
- set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
- cuewErrorString(result)));
- return false;
- }
- }
- {
- const CUDAContextScope scope(peer_device_cuda);
- CUresult result = cuCtxEnablePeerAccess(cuContext, 0);
- if (result != CUDA_SUCCESS) {
- set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
- cuewErrorString(result)));
- return false;
- }
- }
-
- return true;
-}
-
-bool CUDADevice::use_adaptive_compilation()
-{
- return DebugFlags().cuda.adaptive_compile;
-}
-
-bool CUDADevice::use_split_kernel()
-{
- return DebugFlags().cuda.split_kernel;
-}
-
-/* Common NVCC flags which stay the same regardless of shading model or
- * kernel sources MD5, and only depend on the compiler or compilation settings.
- */
-string CUDADevice::compile_kernel_get_common_cflags(
- const DeviceRequestedFeatures &requested_features, bool filter, bool split)
-{
- const int machine = system_cpu_bits();
- const string source_path = path_get("source");
- const string include_path = source_path;
- string cflags = string_printf(
- "-m%d "
- "--ptxas-options=\"-v\" "
- "--use_fast_math "
- "-DNVCC "
- "-I\"%s\"",
- machine,
- include_path.c_str());
- if (!filter && use_adaptive_compilation()) {
- cflags += " " + requested_features.get_build_options();
- }
- const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
- if (extra_cflags) {
- cflags += string(" ") + string(extra_cflags);
- }
-
- if (split) {
- cflags += " -D__SPLIT__";
- }
-
-# ifdef WITH_NANOVDB
- cflags += " -DWITH_NANOVDB";
-# endif
-
- return cflags;
-}
-
-string CUDADevice::compile_kernel(const DeviceRequestedFeatures &requested_features,
- const char *name,
- const char *base,
- bool force_ptx)
-{
- /* Compute kernel name. */
- int major, minor;
- cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
- cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
-
- /* Attempt to use kernel provided with Blender. */
- if (!use_adaptive_compilation()) {
- if (!force_ptx) {
- const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor));
- VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
- if (path_exists(cubin)) {
- VLOG(1) << "Using precompiled kernel.";
- return cubin;
- }
- }
-
- /* The driver can JIT-compile PTX generated for older generations, so find the closest one. */
- int ptx_major = major, ptx_minor = minor;
- while (ptx_major >= 3) {
- const string ptx = path_get(
- string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor));
- VLOG(1) << "Testing for pre-compiled kernel " << ptx << ".";
- if (path_exists(ptx)) {
- VLOG(1) << "Using precompiled kernel.";
- return ptx;
- }
-
- if (ptx_minor > 0) {
- ptx_minor--;
- }
- else {
- ptx_major--;
- ptx_minor = 9;
- }
- }
- }
-
- /* Try to use locally compiled kernel. */
- string source_path = path_get("source");
- const string source_md5 = path_files_md5_hash(source_path);
-
- /* We include cflags in the MD5, so that changing the CUDA toolkit or other
- * compiler command line arguments ensures the cubin gets rebuilt.
- */
- string common_cflags = compile_kernel_get_common_cflags(
- requested_features, strstr(name, "filter") != NULL, strstr(name, "split") != NULL);
- const string kernel_md5 = util_md5_string(source_md5 + common_cflags);
-
- const char *const kernel_ext = force_ptx ? "ptx" : "cubin";
- const char *const kernel_arch = force_ptx ? "compute" : "sm";
- const string cubin_file = string_printf(
- "cycles_%s_%s_%d%d_%s.%s", name, kernel_arch, major, minor, kernel_md5.c_str(), kernel_ext);
- const string cubin = path_cache_get(path_join("kernels", cubin_file));
- VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
- if (path_exists(cubin)) {
- VLOG(1) << "Using locally compiled kernel.";
- return cubin;
- }
-
-# ifdef _WIN32
- if (!use_adaptive_compilation() && have_precompiled_kernels()) {
- if (major < 3) {
- set_error(
- string_printf("CUDA backend requires compute capability 3.0 or up, but found %d.%d. "
- "Your GPU is not supported.",
- major,
- minor));
- }
- else {
- set_error(
- string_printf("CUDA binary kernel for this graphics card compute "
- "capability (%d.%d) not found.",
- major,
- minor));
- }
- return string();
- }
-# endif
-
- /* Compile. */
- const char *const nvcc = cuewCompilerPath();
- if (nvcc == NULL) {
- set_error(
- "CUDA nvcc compiler not found. "
- "Install CUDA toolkit in default location.");
- return string();
- }
-
- const int nvcc_cuda_version = cuewCompilerVersion();
- VLOG(1) << "Found nvcc " << nvcc << ", CUDA version " << nvcc_cuda_version << ".";
- if (nvcc_cuda_version < 101) {
- printf(
- "Unsupported CUDA version %d.%d detected, "
- "you need CUDA 10.1 or newer.\n",
- nvcc_cuda_version / 10,
- nvcc_cuda_version % 10);
- return string();
- }
- else if (!(nvcc_cuda_version == 101 || nvcc_cuda_version == 102 || nvcc_cuda_version == 111 ||
- nvcc_cuda_version == 112 || nvcc_cuda_version == 113 || nvcc_cuda_version == 114)) {
- printf(
- "CUDA version %d.%d detected, build may succeed but only "
- "CUDA 10.1 to 11.4 are officially supported.\n",
- nvcc_cuda_version / 10,
- nvcc_cuda_version % 10);
- }
-
- double starttime = time_dt();
-
- path_create_directories(cubin);
-
- source_path = path_join(path_join(source_path, "kernel"),
- path_join("kernels", path_join(base, string_printf("%s.cu", name))));
-
- string command = string_printf(
- "\"%s\" "
- "-arch=%s_%d%d "
- "--%s \"%s\" "
- "-o \"%s\" "
- "%s",
- nvcc,
- kernel_arch,
- major,
- minor,
- kernel_ext,
- source_path.c_str(),
- cubin.c_str(),
- common_cflags.c_str());
-
- printf("Compiling CUDA kernel ...\n%s\n", command.c_str());
-
-# ifdef _WIN32
- command = "call " + command;
-# endif
- if (system(command.c_str()) != 0) {
- set_error(
- "Failed to execute compilation command, "
- "see console for details.");
- return string();
- }
-
- /* Verify if compilation succeeded */
- if (!path_exists(cubin)) {
- set_error(
- "CUDA kernel compilation failed, "
- "see console for details.");
- return string();
- }
-
- printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
-
- return cubin;
-}
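
The fallback loop above walks compute capabilities downward until a shipped PTX file is found, relying on the driver to JIT-compile it for newer hardware. A standalone sketch of the probe order (the file names are assumptions for illustration):

  #include <cstdio>

  int main()
  {
    int major = 8, minor = 6; /* e.g. a compute 8.6 device */
    while (major >= 3) {
      printf("lib/kernel_compute_%d%d.ptx\n", major, minor);
      if (minor > 0) {
        minor--;
      }
      else {
        major--;
        minor = 9;
      }
    }
    return 0;
  }

For a compute 8.6 device this probes compute_86, compute_85, and so on down to compute_30, matching the search in compile_kernel().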
-
-bool CUDADevice::load_kernels(const DeviceRequestedFeatures &requested_features)
-{
- /* TODO(sergey): Support kernels re-load for CUDA devices.
- *
- * Currently re-loading kernel will invalidate memory pointers,
- * causing problems in cuCtxSynchronize.
- */
- if (cuFilterModule && cuModule) {
- VLOG(1) << "Skipping kernel reload, not currently supported.";
- return true;
- }
-
- /* check if cuda init succeeded */
- if (cuContext == 0)
- return false;
-
- /* check if GPU is supported */
- if (!support_device(requested_features))
- return false;
-
- /* get kernel */
- const char *kernel_name = use_split_kernel() ? "kernel_split" : "kernel";
- string cubin = compile_kernel(requested_features, kernel_name);
- if (cubin.empty())
- return false;
-
- const char *filter_name = "filter";
- string filter_cubin = compile_kernel(requested_features, filter_name);
- if (filter_cubin.empty())
- return false;
-
- /* open module */
- CUDAContextScope scope(this);
-
- string cubin_data;
- CUresult result;
-
- if (path_read_text(cubin, cubin_data))
- result = cuModuleLoadData(&cuModule, cubin_data.c_str());
- else
- result = CUDA_ERROR_FILE_NOT_FOUND;
-
- if (result != CUDA_SUCCESS)
- set_error(string_printf(
- "Failed to load CUDA kernel from '%s' (%s)", cubin.c_str(), cuewErrorString(result)));
-
- if (path_read_text(filter_cubin, cubin_data))
- result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str());
- else
- result = CUDA_ERROR_FILE_NOT_FOUND;
-
- if (result != CUDA_SUCCESS)
- set_error(string_printf("Failed to load CUDA kernel from '%s' (%s)",
- filter_cubin.c_str(),
- cuewErrorString(result)));
-
- if (result == CUDA_SUCCESS) {
- reserve_local_memory(requested_features);
- }
-
- load_functions();
-
- return (result == CUDA_SUCCESS);
-}
-
-void CUDADevice::load_functions()
-{
- /* TODO: load all functions here. */
- if (functions.loaded) {
- return;
- }
- functions.loaded = true;
-
- cuda_assert(cuModuleGetFunction(
- &functions.adaptive_stopping, cuModule, "kernel_cuda_adaptive_stopping"));
- cuda_assert(cuModuleGetFunction(
- &functions.adaptive_filter_x, cuModule, "kernel_cuda_adaptive_filter_x"));
- cuda_assert(cuModuleGetFunction(
- &functions.adaptive_filter_y, cuModule, "kernel_cuda_adaptive_filter_y"));
- cuda_assert(cuModuleGetFunction(
- &functions.adaptive_scale_samples, cuModule, "kernel_cuda_adaptive_scale_samples"));
-
- cuda_assert(cuFuncSetCacheConfig(functions.adaptive_stopping, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(functions.adaptive_filter_x, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(functions.adaptive_filter_y, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(functions.adaptive_scale_samples, CU_FUNC_CACHE_PREFER_L1));
-
- int unused_min_blocks;
- cuda_assert(cuOccupancyMaxPotentialBlockSize(&unused_min_blocks,
- &functions.adaptive_num_threads_per_block,
- functions.adaptive_scale_samples,
- NULL,
- 0,
- 0));
-}
-
-void CUDADevice::reserve_local_memory(const DeviceRequestedFeatures &requested_features)
-{
- if (use_split_kernel()) {
- /* The split kernel mostly uses global memory and adaptive compilation, so it is
- * currently difficult to predict how much needs to be reserved. */
- return;
- }
-
- /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory
- * needed for kernel launches, so that we can reliably figure out when
- * to allocate scene data in mapped host memory. */
- CUDAContextScope scope(this);
-
- size_t total = 0, free_before = 0, free_after = 0;
- cuMemGetInfo(&free_before, &total);
-
- /* Get kernel function. */
- CUfunction cuRender;
-
- if (requested_features.use_baking) {
- cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake"));
- }
- else if (requested_features.use_integrator_branched) {
- cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace"));
- }
- else {
- cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace"));
- }
-
- cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1));
-
- int min_blocks, num_threads_per_block;
- cuda_assert(
- cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0));
-
- /* Launch the kernel; using just 1 block appears sufficient to reserve
- * memory for all multiprocessors. It would still be good to do this in
- * parallel in the multi-GPU case to make it faster. */
- CUdeviceptr d_work_tiles = 0;
- uint total_work_size = 0;
-
- void *args[] = {&d_work_tiles, &total_work_size};
-
- cuda_assert(cuLaunchKernel(cuRender, 1, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
-
- cuda_assert(cuCtxSynchronize());
-
- cuMemGetInfo(&free_after, &total);
- VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after)
- << " bytes. (" << string_human_readable_size(free_before - free_after) << ")";
-
-# if 0
- /* For testing mapped host memory, fill up device memory. */
- const size_t keep_mb = 1024;
-
- while (free_after > keep_mb * 1024 * 1024LL) {
- CUdeviceptr tmp;
- cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL));
- cuMemGetInfo(&free_after, &total);
- }
-# endif
-}
-
-void CUDADevice::init_host_memory()
-{
- /* Limit amount of host mapped memory, because allocating too much can
- * cause system instability. Leave at least half or 4 GB of system
- * memory free, whichever is smaller. */
- size_t default_limit = 4 * 1024 * 1024 * 1024LL;
- size_t system_ram = system_physical_ram();
-
- if (system_ram > 0) {
- if (system_ram / 2 > default_limit) {
- map_host_limit = system_ram - default_limit;
- }
- else {
- map_host_limit = system_ram / 2;
- }
- }
- else {
- VLOG(1) << "Mapped host memory disabled, failed to get system RAM";
- map_host_limit = 0;
- }
-
- /* Amount of device memory to keep free after texture memory
- * and working memory allocations, respectively. We set the working
- * memory headroom lower so that some space is still left after all
- * texture memory allocations. */
- device_working_headroom = 32 * 1024 * 1024LL; // 32MB
- device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
-
- VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
- << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
-}
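
The limit above leaves the smaller of half the system RAM or 4 GB free for the rest of the system. A standalone worked example of that arithmetic (the numbers are illustrative):

  #include <cstddef>
  #include <cstdio>

  int main()
  {
    const size_t GiB = 1024ULL * 1024 * 1024;
    const size_t default_limit = 4 * GiB;
    const size_t system_ram = 16 * GiB;
    const size_t map_host_limit = (system_ram / 2 > default_limit) ? system_ram - default_limit :
                                                                     system_ram / 2;
    /* Prints 12: a 16 GiB system maps at most 12 GiB, keeping 4 GiB free. */
    printf("%zu GiB\n", map_host_limit / GiB);
    return 0;
  }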
-
-void CUDADevice::load_texture_info()
-{
- if (need_texture_info) {
- /* Unset flag before copying, so this does not loop indefinitely if the copy below calls
- * into 'move_textures_to_host' (which calls 'load_texture_info' again). */
- need_texture_info = false;
- texture_info.copy_to_device();
- }
-}
-
-void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
-{
- /* Break out of recursive call, which can happen when moving memory on a multi device. */
- static bool any_device_moving_textures_to_host = false;
- if (any_device_moving_textures_to_host) {
- return;
- }
-
- /* Signal to reallocate textures in host memory only. */
- move_texture_to_host = true;
-
- while (size > 0) {
- /* Find suitable memory allocation to move. */
- device_memory *max_mem = NULL;
- size_t max_size = 0;
- bool max_is_image = false;
-
- thread_scoped_lock lock(cuda_mem_map_mutex);
- foreach (CUDAMemMap::value_type &pair, cuda_mem_map) {
- device_memory &mem = *pair.first;
- CUDAMem *cmem = &pair.second;
-
- /* Can only move textures allocated on this device (and not those from peer devices),
- * and we need to ignore memory that is already on the host. */
- if (!mem.is_resident(this) || cmem->use_mapped_host) {
- continue;
- }
-
- bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
- (&mem != &texture_info);
- bool is_image = is_texture && (mem.data_height > 1);
-
- /* Can't move this type of memory. */
- if (!is_texture || cmem->array) {
- continue;
- }
-
- /* For other textures, only move image textures. */
- if (for_texture && !is_image) {
- continue;
- }
-
- /* Try to move largest allocation, prefer moving images. */
- if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
- max_is_image = is_image;
- max_size = mem.device_size;
- max_mem = &mem;
- }
- }
- lock.unlock();
-
- /* Move to host memory. This part is mutex protected since
- * multiple CUDA devices could be moving the memory. The
- * first one will do it, and the rest will adopt the pointer. */
- if (max_mem) {
- VLOG(1) << "Move memory from device to host: " << max_mem->name;
-
- static thread_mutex move_mutex;
- thread_scoped_lock lock(move_mutex);
-
- any_device_moving_textures_to_host = true;
-
- /* Potentially need to call back into the multi device, so pointer mapping
- * and peer devices are updated. This is also necessary since the device
- * pointer may just be a key here, so it cannot be accessed and freed directly.
- * Unfortunately it does mean that memory is reallocated on all other
- * devices as well, which is potentially dangerous when still in use (since
- * a thread rendering on another device would only be caught in this mutex
- * if it happens to do an allocation at the same time as well). */
- max_mem->device_copy_to();
- size = (max_size >= size) ? 0 : size - max_size;
-
- any_device_moving_textures_to_host = false;
- }
- else {
- break;
- }
- }
-
- /* Unset flag before texture info is reloaded, since it should stay in device memory. */
- move_texture_to_host = false;
-
- /* Update texture info array with new pointers. */
- load_texture_info();
-}
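
The selection above prefers image textures over flat data and, within the same class, the largest allocation, so a single move frees as much device memory as possible. A small sketch of that ordering rule (the types are illustrative, not from the deleted file):

  #include <cstddef>

  struct MoveCandidate {
    bool is_image;
    size_t device_size;
  };

  /* Returns true when 'a' should be moved to the host before 'b'. */
  static bool move_before(const MoveCandidate &a, const MoveCandidate &b)
  {
    return (a.is_image > b.is_image) ||
           (a.is_image == b.is_image && a.device_size > b.device_size);
  }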
-
-CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding)
-{
- CUDAContextScope scope(this);
-
- CUdeviceptr device_pointer = 0;
- size_t size = mem.memory_size() + pitch_padding;
-
- CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
- const char *status = "";
-
- /* First try allocating in device memory, respecting headroom. We make
- * an exception for texture info. It is small and frequently accessed,
- * so treat it as working memory.
- *
- * If there is not enough room for working memory, we will try to move
- * textures to host memory, assuming the performance impact would have
- * been worse for working memory. */
- bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
- bool is_image = is_texture && (mem.data_height > 1);
-
- size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
-
- size_t total = 0, free = 0;
- cuMemGetInfo(&free, &total);
-
- /* Move textures to host memory if needed. */
- if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
- move_textures_to_host(size + headroom - free, is_texture);
- cuMemGetInfo(&free, &total);
- }
-
- /* Allocate in device memory. */
- if (!move_texture_to_host && (size + headroom) < free) {
- mem_alloc_result = cuMemAlloc(&device_pointer, size);
- if (mem_alloc_result == CUDA_SUCCESS) {
- status = " in device memory";
- }
- }
-
- /* Fall back to mapped host memory if needed and possible. */
-
- void *shared_pointer = 0;
-
- if (mem_alloc_result != CUDA_SUCCESS && can_map_host && mem.type != MEM_DEVICE_ONLY) {
- if (mem.shared_pointer) {
- /* Another device already allocated host memory. */
- mem_alloc_result = CUDA_SUCCESS;
- shared_pointer = mem.shared_pointer;
- }
- else if (map_host_used + size < map_host_limit) {
- /* Allocate host memory ourselves. */
- mem_alloc_result = cuMemHostAlloc(
- &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
-
- assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) ||
- (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0));
- }
-
- if (mem_alloc_result == CUDA_SUCCESS) {
- cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0));
- map_host_used += size;
- status = " in host memory";
- }
- }
-
- if (mem_alloc_result != CUDA_SUCCESS) {
- if (mem.type == MEM_DEVICE_ONLY) {
- status = " failed, out of device memory";
- set_error("System is out of GPU memory");
- }
- else {
- status = " failed, out of device and host memory";
- set_error("System is out of GPU and shared host memory");
- }
- }
-
- if (mem.name) {
- VLOG(1) << "Buffer allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")" << status;
- }
-
- mem.device_pointer = (device_ptr)device_pointer;
- mem.device_size = size;
- stats.mem_alloc(size);
-
- if (!mem.device_pointer) {
- return NULL;
- }
-
- /* Insert into map of allocations. */
- thread_scoped_lock lock(cuda_mem_map_mutex);
- CUDAMem *cmem = &cuda_mem_map[&mem];
- if (shared_pointer != 0) {
- /* Replace host pointer with our host allocation. Only works if
- * CUDA memory layout is the same and has no pitch padding. Also
- * does not work if we move textures to host during a render,
- * since other devices might be using the memory. */
-
- if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
- mem.host_pointer != shared_pointer) {
- memcpy(shared_pointer, mem.host_pointer, size);
-
- /* A call to device_memory::host_free() should be preceded by
- * a call to device_memory::device_free() for host memory
- * allocated by a device to be handled properly. Two exceptions
- * are here and a call in OptiXDevice::generic_alloc(), where
- * the current host memory can be assumed to be allocated by
- * device_memory::host_alloc(), not by a device. */
-
- mem.host_free();
- mem.host_pointer = shared_pointer;
- }
- mem.shared_pointer = shared_pointer;
- mem.shared_counter++;
- cmem->use_mapped_host = true;
- }
- else {
- cmem->use_mapped_host = false;
- }
-
- return cmem;
-}
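
The headroom test above only places an allocation in device memory when the requested size plus the relevant headroom still fits in free memory; otherwise textures are moved to the host, or the allocation falls back to mapped host memory. A hedged numeric sketch:

  #include <cstddef>
  #include <cstdio>

  int main()
  {
    const size_t MiB = 1024ULL * 1024;
    const size_t free_mem = 2048 * MiB;        /* as reported by cuMemGetInfo */
    const size_t texture_headroom = 128 * MiB; /* device_texture_headroom above */
    const size_t request = 2000 * MiB;
    const bool fits = (request + texture_headroom) < free_mem;
    printf(fits ? "allocate on device\n" : "move textures / map host memory\n");
    return 0;
  }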
-
-void CUDADevice::generic_copy_to(device_memory &mem)
-{
- if (!mem.host_pointer || !mem.device_pointer) {
- return;
- }
-
- /* If use_mapped_host of mem is false, the current device only uses device memory allocated by
- * cuMemAlloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from
- * mem.host_pointer. */
- thread_scoped_lock lock(cuda_mem_map_mutex);
- if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
- const CUDAContextScope scope(this);
- cuda_assert(
- cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size()));
- }
-}
-
-void CUDADevice::generic_free(device_memory &mem)
-{
- if (mem.device_pointer) {
- CUDAContextScope scope(this);
- thread_scoped_lock lock(cuda_mem_map_mutex);
- const CUDAMem &cmem = cuda_mem_map[&mem];
-
- /* If cmem.use_mapped_host is true, reference counting is used
- * to safely free the mapped host memory. */
-
- if (cmem.use_mapped_host) {
- assert(mem.shared_pointer);
- if (mem.shared_pointer) {
- assert(mem.shared_counter > 0);
- if (--mem.shared_counter == 0) {
- if (mem.host_pointer == mem.shared_pointer) {
- mem.host_pointer = 0;
- }
- cuMemFreeHost(mem.shared_pointer);
- mem.shared_pointer = 0;
- }
- }
- map_host_used -= mem.device_size;
- }
- else {
- /* Free device memory. */
- cuda_assert(cuMemFree(mem.device_pointer));
- }
-
- stats.mem_free(mem.device_size);
- mem.device_pointer = 0;
- mem.device_size = 0;
-
- cuda_mem_map.erase(cuda_mem_map.find(&mem));
- }
-}
-
-void CUDADevice::mem_alloc(device_memory &mem)
-{
- if (mem.type == MEM_PIXELS && !background) {
- pixels_alloc(mem);
- }
- else if (mem.type == MEM_TEXTURE) {
- assert(!"mem_alloc not supported for textures.");
- }
- else if (mem.type == MEM_GLOBAL) {
- assert(!"mem_alloc not supported for global memory.");
- }
- else {
- generic_alloc(mem);
- }
-}
-
-void CUDADevice::mem_copy_to(device_memory &mem)
-{
- if (mem.type == MEM_PIXELS) {
- assert(!"mem_copy_to not supported for pixels.");
- }
- else if (mem.type == MEM_GLOBAL) {
- global_free(mem);
- global_alloc(mem);
- }
- else if (mem.type == MEM_TEXTURE) {
- tex_free((device_texture &)mem);
- tex_alloc((device_texture &)mem);
- }
- else {
- if (!mem.device_pointer) {
- generic_alloc(mem);
- }
- generic_copy_to(mem);
- }
-}
-
-void CUDADevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
-{
- if (mem.type == MEM_PIXELS && !background) {
- pixels_copy_from(mem, y, w, h);
- }
- else if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) {
- assert(!"mem_copy_from not supported for textures.");
- }
- else if (mem.host_pointer) {
- const size_t size = elem * w * h;
- const size_t offset = elem * y * w;
-
- if (mem.device_pointer) {
- const CUDAContextScope scope(this);
- cuda_assert(cuMemcpyDtoH(
- (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size));
- }
- else {
- memset((char *)mem.host_pointer + offset, 0, size);
- }
- }
-}
-
-void CUDADevice::mem_zero(device_memory &mem)
-{
- if (!mem.device_pointer) {
- mem_alloc(mem);
- }
- if (!mem.device_pointer) {
- return;
- }
-
- /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
- * regardless of mem.host_pointer and mem.shared_pointer. */
- thread_scoped_lock lock(cuda_mem_map_mutex);
- if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
- const CUDAContextScope scope(this);
- cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
- }
- else if (mem.host_pointer) {
- memset(mem.host_pointer, 0, mem.memory_size());
- }
-}
-
-void CUDADevice::mem_free(device_memory &mem)
-{
- if (mem.type == MEM_PIXELS && !background) {
- pixels_free(mem);
- }
- else if (mem.type == MEM_GLOBAL) {
- global_free(mem);
- }
- else if (mem.type == MEM_TEXTURE) {
- tex_free((device_texture &)mem);
- }
- else {
- generic_free(mem);
- }
-}
-
-device_ptr CUDADevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/)
-{
- return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
-}
-
-void CUDADevice::const_copy_to(const char *name, void *host, size_t size)
-{
- CUDAContextScope scope(this);
- CUdeviceptr mem;
- size_t bytes;
-
- cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name));
- // assert(bytes == size);
- cuda_assert(cuMemcpyHtoD(mem, host, size));
-}
-
-void CUDADevice::global_alloc(device_memory &mem)
-{
- if (mem.is_resident(this)) {
- generic_alloc(mem);
- generic_copy_to(mem);
- }
-
- const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
-}
-
-void CUDADevice::global_free(device_memory &mem)
-{
- if (mem.is_resident(this) && mem.device_pointer) {
- generic_free(mem);
- }
-}
-
-void CUDADevice::tex_alloc(device_texture &mem)
-{
- CUDAContextScope scope(this);
-
- /* General variables for both architectures */
- string bind_name = mem.name;
- size_t dsize = datatype_size(mem.data_type);
- size_t size = mem.memory_size();
-
- CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
- switch (mem.info.extension) {
- case EXTENSION_REPEAT:
- address_mode = CU_TR_ADDRESS_MODE_WRAP;
- break;
- case EXTENSION_EXTEND:
- address_mode = CU_TR_ADDRESS_MODE_CLAMP;
- break;
- case EXTENSION_CLIP:
- address_mode = CU_TR_ADDRESS_MODE_BORDER;
- break;
- default:
- assert(0);
- break;
- }
-
- CUfilter_mode filter_mode;
- if (mem.info.interpolation == INTERPOLATION_CLOSEST) {
- filter_mode = CU_TR_FILTER_MODE_POINT;
- }
- else {
- filter_mode = CU_TR_FILTER_MODE_LINEAR;
- }
-
- /* Image Texture Storage */
- CUarray_format_enum format;
- switch (mem.data_type) {
- case TYPE_UCHAR:
- format = CU_AD_FORMAT_UNSIGNED_INT8;
- break;
- case TYPE_UINT16:
- format = CU_AD_FORMAT_UNSIGNED_INT16;
- break;
- case TYPE_UINT:
- format = CU_AD_FORMAT_UNSIGNED_INT32;
- break;
- case TYPE_INT:
- format = CU_AD_FORMAT_SIGNED_INT32;
- break;
- case TYPE_FLOAT:
- format = CU_AD_FORMAT_FLOAT;
- break;
- case TYPE_HALF:
- format = CU_AD_FORMAT_HALF;
- break;
- default:
- assert(0);
- return;
- }
-
- CUDAMem *cmem = NULL;
- CUarray array_3d = NULL;
- size_t src_pitch = mem.data_width * dsize * mem.data_elements;
- size_t dst_pitch = src_pitch;
-
- if (!mem.is_resident(this)) {
- thread_scoped_lock lock(cuda_mem_map_mutex);
- cmem = &cuda_mem_map[&mem];
- cmem->texobject = 0;
-
- if (mem.data_depth > 1) {
- array_3d = (CUarray)mem.device_pointer;
- cmem->array = array_3d;
- }
- else if (mem.data_height > 0) {
- dst_pitch = align_up(src_pitch, pitch_alignment);
- }
- }
- else if (mem.data_depth > 1) {
- /* 3D texture using a CUDA array, since there is no API for linear memory. */
- CUDA_ARRAY3D_DESCRIPTOR desc;
-
- desc.Width = mem.data_width;
- desc.Height = mem.data_height;
- desc.Depth = mem.data_depth;
- desc.Format = format;
- desc.NumChannels = mem.data_elements;
- desc.Flags = 0;
-
- VLOG(1) << "Array 3D allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
-
- cuda_assert(cuArray3DCreate(&array_3d, &desc));
-
- if (!array_3d) {
- return;
- }
-
- CUDA_MEMCPY3D param;
- memset(&param, 0, sizeof(param));
- param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
- param.dstArray = array_3d;
- param.srcMemoryType = CU_MEMORYTYPE_HOST;
- param.srcHost = mem.host_pointer;
- param.srcPitch = src_pitch;
- param.WidthInBytes = param.srcPitch;
- param.Height = mem.data_height;
- param.Depth = mem.data_depth;
-
- cuda_assert(cuMemcpy3D(&param));
-
- mem.device_pointer = (device_ptr)array_3d;
- mem.device_size = size;
- stats.mem_alloc(size);
-
- thread_scoped_lock lock(cuda_mem_map_mutex);
- cmem = &cuda_mem_map[&mem];
- cmem->texobject = 0;
- cmem->array = array_3d;
- }
- else if (mem.data_height > 0) {
- /* 2D texture, using pitch aligned linear memory. */
- dst_pitch = align_up(src_pitch, pitch_alignment);
- size_t dst_size = dst_pitch * mem.data_height;
-
- cmem = generic_alloc(mem, dst_size - mem.memory_size());
- if (!cmem) {
- return;
- }
-
- CUDA_MEMCPY2D param;
- memset(&param, 0, sizeof(param));
- param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
- param.dstDevice = mem.device_pointer;
- param.dstPitch = dst_pitch;
- param.srcMemoryType = CU_MEMORYTYPE_HOST;
- param.srcHost = mem.host_pointer;
- param.srcPitch = src_pitch;
- param.WidthInBytes = param.srcPitch;
- param.Height = mem.data_height;
-
- cuda_assert(cuMemcpy2DUnaligned(&param));
- }
- else {
- /* 1D texture, using linear memory. */
- cmem = generic_alloc(mem);
- if (!cmem) {
- return;
- }
-
- cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
- }
-
- /* Resize once */
- const uint slot = mem.slot;
- if (slot >= texture_info.size()) {
- /* Allocate some slots in advance, to reduce the number
- * of re-allocations. */
- texture_info.resize(slot + 128);
- }
-
- /* Set the mapping and tag that we need to (re-)upload to the device. */
- texture_info[slot] = mem.info;
- need_texture_info = true;
-
- if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
- mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
- /* Kepler+, bindless textures. */
- CUDA_RESOURCE_DESC resDesc;
- memset(&resDesc, 0, sizeof(resDesc));
-
- if (array_3d) {
- resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
- resDesc.res.array.hArray = array_3d;
- resDesc.flags = 0;
- }
- else if (mem.data_height > 0) {
- resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
- resDesc.res.pitch2D.devPtr = mem.device_pointer;
- resDesc.res.pitch2D.format = format;
- resDesc.res.pitch2D.numChannels = mem.data_elements;
- resDesc.res.pitch2D.height = mem.data_height;
- resDesc.res.pitch2D.width = mem.data_width;
- resDesc.res.pitch2D.pitchInBytes = dst_pitch;
- }
- else {
- resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
- resDesc.res.linear.devPtr = mem.device_pointer;
- resDesc.res.linear.format = format;
- resDesc.res.linear.numChannels = mem.data_elements;
- resDesc.res.linear.sizeInBytes = mem.device_size;
- }
-
- CUDA_TEXTURE_DESC texDesc;
- memset(&texDesc, 0, sizeof(texDesc));
- texDesc.addressMode[0] = address_mode;
- texDesc.addressMode[1] = address_mode;
- texDesc.addressMode[2] = address_mode;
- texDesc.filterMode = filter_mode;
- texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
-
- thread_scoped_lock lock(cuda_mem_map_mutex);
- cmem = &cuda_mem_map[&mem];
-
- cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
-
- texture_info[slot].data = (uint64_t)cmem->texobject;
- }
- else {
- texture_info[slot].data = (uint64_t)mem.device_pointer;
- }
-}
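
For 2D textures the row pitch is rounded up to the device's texture pitch alignment so that the linear memory can be bound with cuTexObjectCreate(). A standalone illustration of that rounding (the alignment value of 32 is an assumption; the real value comes from CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT):

  #include <cstddef>
  #include <cstdio>

  static size_t align_up_sketch(size_t offset, size_t alignment)
  {
    return (offset + alignment - 1) & ~(alignment - 1);
  }

  int main()
  {
    const size_t src_pitch = 4001 * 4 * sizeof(unsigned char); /* 4001 RGBA byte texels per row */
    const size_t dst_pitch = align_up_sketch(src_pitch, 32);
    printf("%zu -> %zu bytes per row\n", src_pitch, dst_pitch); /* 16004 -> 16032 */
    return 0;
  }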
-
-void CUDADevice::tex_free(device_texture &mem)
-{
- if (mem.device_pointer) {
- CUDAContextScope scope(this);
- thread_scoped_lock lock(cuda_mem_map_mutex);
- const CUDAMem &cmem = cuda_mem_map[&mem];
-
- if (cmem.texobject) {
- /* Free bindless texture. */
- cuTexObjectDestroy(cmem.texobject);
- }
-
- if (!mem.is_resident(this)) {
- /* Do not free memory here, since it was allocated on a different device. */
- cuda_mem_map.erase(cuda_mem_map.find(&mem));
- }
- else if (cmem.array) {
- /* Free array. */
- cuArrayDestroy(cmem.array);
- stats.mem_free(mem.device_size);
- mem.device_pointer = 0;
- mem.device_size = 0;
-
- cuda_mem_map.erase(cuda_mem_map.find(&mem));
- }
- else {
- lock.unlock();
- generic_free(mem);
- }
- }
-}
-
-# define CUDA_GET_BLOCKSIZE(func, w, h) \
- int threads_per_block; \
- cuda_assert( \
- cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
- int threads = (int)sqrt((float)threads_per_block); \
- int xblocks = ((w) + threads - 1) / threads; \
- int yblocks = ((h) + threads - 1) / threads;
-
-# define CUDA_LAUNCH_KERNEL(func, args) \
- cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0));
-
-/* Similar to the above, but for 1-dimensional blocks. */
-# define CUDA_GET_BLOCKSIZE_1D(func, w, h) \
- int threads_per_block; \
- cuda_assert( \
- cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
- int xblocks = ((w) + threads_per_block - 1) / threads_per_block; \
- int yblocks = h;
-
-# define CUDA_LAUNCH_KERNEL_1D(func, args) \
- cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads_per_block, 1, 1, 0, 0, args, 0));
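
The 2D helper above derives a square block from the kernel's maximum threads per block and then covers the image with enough blocks in each direction. A hedged worked example of that geometry:

  #include <cmath>
  #include <cstdio>

  int main()
  {
    const int threads_per_block = 1024; /* typical CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK */
    const int threads = (int)sqrt((float)threads_per_block); /* 32 */
    const int w = 1920, h = 1080;
    const int xblocks = (w + threads - 1) / threads; /* 60 */
    const int yblocks = (h + threads - 1) / threads; /* 34 */
    printf("%dx%d blocks of %dx%d threads\n", xblocks, yblocks, threads, threads);
    return 0;
  }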
-
-bool CUDADevice::denoising_non_local_means(device_ptr image_ptr,
- device_ptr guide_ptr,
- device_ptr variance_ptr,
- device_ptr out_ptr,
- DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- int stride = task->buffer.stride;
- int w = task->buffer.width;
- int h = task->buffer.h;
- int r = task->nlm_state.r;
- int f = task->nlm_state.f;
- float a = task->nlm_state.a;
- float k_2 = task->nlm_state.k_2;
-
- int pass_stride = task->buffer.pass_stride;
- int num_shifts = (2 * r + 1) * (2 * r + 1);
- int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;
- int frame_offset = 0;
-
- if (have_error())
- return false;
-
- CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer;
- CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
- CUdeviceptr weightAccum = difference + 2 * sizeof(float) * pass_stride * num_shifts;
- CUdeviceptr scale_ptr = 0;
-
- cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float) * pass_stride));
- cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float) * pass_stride));
-
- {
- CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput;
- cuda_assert(cuModuleGetFunction(
- &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
- cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
- cuda_assert(cuModuleGetFunction(
- &cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
- cuda_assert(cuModuleGetFunction(
- &cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
-
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));
-
- CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w * h, num_shifts);
-
- void *calc_difference_args[] = {&guide_ptr,
- &variance_ptr,
- &scale_ptr,
- &difference,
- &w,
- &h,
- &stride,
- &pass_stride,
- &r,
- &channel_offset,
- &frame_offset,
- &a,
- &k_2};
- void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
- void *calc_weight_args[] = {
- &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
- void *update_output_args[] = {&blurDifference,
- &image_ptr,
- &out_ptr,
- &weightAccum,
- &w,
- &h,
- &stride,
- &pass_stride,
- &channel_offset,
- &r,
- &f};
-
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args);
- }
-
- {
- CUfunction cuNLMNormalize;
- cuda_assert(
- cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
- cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
- void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride};
- CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h);
- CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
- cuda_assert(cuCtxSynchronize());
- }
-
- return !have_error();
-}
-
-bool CUDADevice::denoising_construct_transform(DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterConstructTransform;
- cuda_assert(cuModuleGetFunction(
- &cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED));
- CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, task->storage.w, task->storage.h);
-
- void *args[] = {&task->buffer.mem.device_pointer,
- &task->tile_info_mem.device_pointer,
- &task->storage.transform.device_pointer,
- &task->storage.rank.device_pointer,
- &task->filter_area,
- &task->rect,
- &task->radius,
- &task->pca_threshold,
- &task->buffer.pass_stride,
- &task->buffer.frame_stride,
- &task->buffer.use_time};
- CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-bool CUDADevice::denoising_accumulate(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr scale_ptr,
- int frame,
- DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- int r = task->radius;
- int f = 4;
- float a = 1.0f;
- float k_2 = task->nlm_k_2;
-
- int w = task->reconstruction_state.source_w;
- int h = task->reconstruction_state.source_h;
- int stride = task->buffer.stride;
- int frame_offset = frame * task->buffer.frame_stride;
- int t = task->tile_info->frames[frame];
-
- int pass_stride = task->buffer.pass_stride;
- int num_shifts = (2 * r + 1) * (2 * r + 1);
-
- if (have_error())
- return false;
-
- CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer;
- CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
-
- CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
- cuda_assert(cuModuleGetFunction(
- &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
- cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
- cuda_assert(
- cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
- cuda_assert(cuModuleGetFunction(
- &cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
-
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
-
- CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
- task->reconstruction_state.source_w * task->reconstruction_state.source_h,
- num_shifts);
-
- void *calc_difference_args[] = {&color_ptr,
- &color_variance_ptr,
- &scale_ptr,
- &difference,
- &w,
- &h,
- &stride,
- &pass_stride,
- &r,
- &pass_stride,
- &frame_offset,
- &a,
- &k_2};
- void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
- void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
- void *construct_gramian_args[] = {&t,
- &blurDifference,
- &task->buffer.mem.device_pointer,
- &task->storage.transform.device_pointer,
- &task->storage.rank.device_pointer,
- &task->storage.XtWX.device_pointer,
- &task->storage.XtWY.device_pointer,
- &task->reconstruction_state.filter_window,
- &w,
- &h,
- &stride,
- &pass_stride,
- &r,
- &f,
- &frame_offset,
- &task->buffer.use_time};
-
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-bool CUDADevice::denoising_solve(device_ptr output_ptr, DenoisingTask *task)
-{
- CUfunction cuFinalize;
- cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
- cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
- void *finalize_args[] = {&output_ptr,
- &task->storage.rank.device_pointer,
- &task->storage.XtWX.device_pointer,
- &task->storage.XtWY.device_pointer,
- &task->filter_area,
- &task->reconstruction_state.buffer_params.x,
- &task->render_buffer.samples};
- CUDA_GET_BLOCKSIZE(
- cuFinalize, task->reconstruction_state.source_w, task->reconstruction_state.source_h);
- CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-bool CUDADevice::denoising_combine_halves(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- int r,
- int4 rect,
- DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterCombineHalves;
- cuda_assert(cuModuleGetFunction(
- &cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(
- cuFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- void *args[] = {&mean_ptr, &variance_ptr, &a_ptr, &b_ptr, &rect, &r};
- CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-bool CUDADevice::denoising_divide_shadow(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr sample_variance_ptr,
- device_ptr sv_variance_ptr,
- device_ptr buffer_variance_ptr,
- DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterDivideShadow;
- cuda_assert(cuModuleGetFunction(
- &cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(
- cuFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- void *args[] = {&task->render_buffer.samples,
- &task->tile_info_mem.device_pointer,
- &a_ptr,
- &b_ptr,
- &sample_variance_ptr,
- &sv_variance_ptr,
- &buffer_variance_ptr,
- &task->rect,
- &task->render_buffer.pass_stride,
- &task->render_buffer.offset};
- CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-bool CUDADevice::denoising_get_feature(int mean_offset,
- int variance_offset,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- float scale,
- DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterGetFeature;
- cuda_assert(
- cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(cuFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- void *args[] = {&task->render_buffer.samples,
- &task->tile_info_mem.device_pointer,
- &mean_offset,
- &variance_offset,
- &mean_ptr,
- &variance_ptr,
- &scale,
- &task->rect,
- &task->render_buffer.pass_stride,
- &task->render_buffer.offset};
- CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-bool CUDADevice::denoising_write_feature(int out_offset,
- device_ptr from_ptr,
- device_ptr buffer_ptr,
- DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterWriteFeature;
- cuda_assert(cuModuleGetFunction(
- &cuFilterWriteFeature, cuFilterModule, "kernel_cuda_filter_write_feature"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, task->filter_area.z, task->filter_area.w);
-
- void *args[] = {&task->render_buffer.samples,
- &task->reconstruction_state.buffer_params,
- &task->filter_area,
- &from_ptr,
- &buffer_ptr,
- &out_offset,
- &task->rect};
- CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-bool CUDADevice::denoising_detect_outliers(device_ptr image_ptr,
- device_ptr variance_ptr,
- device_ptr depth_ptr,
- device_ptr output_ptr,
- DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterDetectOutliers;
- cuda_assert(cuModuleGetFunction(
- &cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(
- cuFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- void *args[] = {
- &image_ptr, &variance_ptr, &depth_ptr, &output_ptr, &task->rect, &task->buffer.pass_stride};
-
- CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-void CUDADevice::denoise(RenderTile &rtile, DenoisingTask &denoising)
-{
- denoising.functions.construct_transform = function_bind(
- &CUDADevice::denoising_construct_transform, this, &denoising);
- denoising.functions.accumulate = function_bind(
- &CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
- denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising);
- denoising.functions.divide_shadow = function_bind(
- &CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.non_local_means = function_bind(
- &CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
- denoising.functions.combine_halves = function_bind(
- &CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
- denoising.functions.get_feature = function_bind(
- &CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.write_feature = function_bind(
- &CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising);
- denoising.functions.detect_outliers = function_bind(
- &CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
-
- denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
- denoising.render_buffer.samples = rtile.sample;
- denoising.buffer.gpu_temporary_mem = true;
-
- denoising.run_denoising(rtile);
-}
-
-void CUDADevice::adaptive_sampling_filter(uint filter_sample,
- WorkTile *wtile,
- CUdeviceptr d_wtile,
- CUstream stream)
-{
- const int num_threads_per_block = functions.adaptive_num_threads_per_block;
-
- /* These are a series of tiny kernels because there is no grid synchronization
- * from within a kernel, so multiple kernel launches are required. */
- uint total_work_size = wtile->h * wtile->w;
- void *args2[] = {&d_wtile, &filter_sample, &total_work_size};
- uint num_blocks = divide_up(total_work_size, num_threads_per_block);
- cuda_assert(cuLaunchKernel(functions.adaptive_stopping,
- num_blocks,
- 1,
- 1,
- num_threads_per_block,
- 1,
- 1,
- 0,
- stream,
- args2,
- 0));
- total_work_size = wtile->h;
- num_blocks = divide_up(total_work_size, num_threads_per_block);
- cuda_assert(cuLaunchKernel(functions.adaptive_filter_x,
- num_blocks,
- 1,
- 1,
- num_threads_per_block,
- 1,
- 1,
- 0,
- stream,
- args2,
- 0));
- total_work_size = wtile->w;
- num_blocks = divide_up(total_work_size, num_threads_per_block);
- cuda_assert(cuLaunchKernel(functions.adaptive_filter_y,
- num_blocks,
- 1,
- 1,
- num_threads_per_block,
- 1,
- 1,
- 0,
- stream,
- args2,
- 0));
-}
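
Each of the three launches above covers a different domain: adaptive_stopping runs per pixel, adaptive_filter_x once per row (h work items), and adaptive_filter_y once per column (w work items), all sized with divide_up(). A hedged numeric example for a 256x256 tile:

  #include <cstdio>

  static unsigned divide_up_sketch(unsigned x, unsigned y)
  {
    return (x + y - 1) / y;
  }

  int main()
  {
    const unsigned w = 256, h = 256, threads_per_block = 128;
    printf("stopping: %u blocks, filter_x: %u, filter_y: %u\n",
           divide_up_sketch(w * h, threads_per_block),
           divide_up_sketch(h, threads_per_block),
           divide_up_sketch(w, threads_per_block));
    return 0;
  }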
-
-void CUDADevice::adaptive_sampling_post(RenderTile &rtile,
- WorkTile *wtile,
- CUdeviceptr d_wtile,
- CUstream stream)
-{
- const int num_threads_per_block = functions.adaptive_num_threads_per_block;
- uint total_work_size = wtile->h * wtile->w;
-
- void *args[] = {&d_wtile, &rtile.start_sample, &rtile.sample, &total_work_size};
- uint num_blocks = divide_up(total_work_size, num_threads_per_block);
- cuda_assert(cuLaunchKernel(functions.adaptive_scale_samples,
- num_blocks,
- 1,
- 1,
- num_threads_per_block,
- 1,
- 1,
- 0,
- stream,
- args,
- 0));
-}
-
-void CUDADevice::render(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles)
-{
- scoped_timer timer(&rtile.buffers->render_time);
-
- if (have_error())
- return;
-
- CUDAContextScope scope(this);
- CUfunction cuRender;
-
- /* Get kernel function. */
- if (rtile.task == RenderTile::BAKE) {
- cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake"));
- }
- else if (task.integrator_branched) {
- cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace"));
- }
- else {
- cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace"));
- }
-
- if (have_error()) {
- return;
- }
-
- cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1));
-
- /* Allocate work tile. */
- work_tiles.alloc(1);
-
- WorkTile *wtile = work_tiles.data();
- wtile->x = rtile.x;
- wtile->y = rtile.y;
- wtile->w = rtile.w;
- wtile->h = rtile.h;
- wtile->offset = rtile.offset;
- wtile->stride = rtile.stride;
- wtile->buffer = (float *)(CUdeviceptr)rtile.buffer;
-
- /* Prepare work size. More step samples render faster, but for now we
- * remain conservative for GPUs connected to a display to avoid driver
- * timeouts and display freezing. */
- int min_blocks, num_threads_per_block;
- cuda_assert(
- cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0));
- if (!info.display_device) {
- min_blocks *= 8;
- }
-
- uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
-
- /* Render all samples. */
- int start_sample = rtile.start_sample;
- int end_sample = rtile.start_sample + rtile.num_samples;
-
- for (int sample = start_sample; sample < end_sample;) {
- /* Setup and copy work tile to device. */
- wtile->start_sample = sample;
- wtile->num_samples = step_samples;
- if (task.adaptive_sampling.use) {
- wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples);
- }
- wtile->num_samples = min(wtile->num_samples, end_sample - sample);
- work_tiles.copy_to_device();
-
- CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
- uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
- uint num_blocks = divide_up(total_work_size, num_threads_per_block);
-
- /* Launch kernel. */
- void *args[] = {&d_work_tiles, &total_work_size};
-
- cuda_assert(
- cuLaunchKernel(cuRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
-
- /* Run the adaptive sampling kernels at selected samples aligned to step samples. */
- uint filter_sample = sample + wtile->num_samples - 1;
- if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
- adaptive_sampling_filter(filter_sample, wtile, d_work_tiles);
- }
-
- cuda_assert(cuCtxSynchronize());
-
- /* Update progress. */
- sample += wtile->num_samples;
- rtile.sample = sample;
- task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
-
- if (task.get_cancel()) {
- if (task.need_finish_queue == false)
- break;
- }
- }
-
- /* Finalize adaptive sampling. */
- if (task.adaptive_sampling.use) {
- CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
- adaptive_sampling_post(rtile, wtile, d_work_tiles);
- cuda_assert(cuCtxSynchronize());
- task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
- }
-}
-
-void CUDADevice::film_convert(DeviceTask &task,
- device_ptr buffer,
- device_ptr rgba_byte,
- device_ptr rgba_half)
-{
- if (have_error())
- return;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilmConvert;
- CUdeviceptr d_rgba = map_pixels((rgba_byte) ? rgba_byte : rgba_half);
- CUdeviceptr d_buffer = (CUdeviceptr)buffer;
-
- /* get kernel function */
- if (rgba_half) {
- cuda_assert(
- cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float"));
- }
- else {
- cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte"));
- }
-
- float sample_scale = 1.0f / (task.sample + 1);
-
- /* pass in parameters */
- void *args[] = {&d_rgba,
- &d_buffer,
- &sample_scale,
- &task.x,
- &task.y,
- &task.w,
- &task.h,
- &task.offset,
- &task.stride};
-
- /* launch kernel */
- int threads_per_block;
- cuda_assert(cuFuncGetAttribute(
- &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert));
-
- int xthreads = (int)sqrt(threads_per_block);
- int ythreads = (int)sqrt(threads_per_block);
- int xblocks = (task.w + xthreads - 1) / xthreads;
- int yblocks = (task.h + ythreads - 1) / ythreads;
-
- cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1));
-
- cuda_assert(cuLaunchKernel(cuFilmConvert,
- xblocks,
- yblocks,
- 1, /* blocks */
- xthreads,
- ythreads,
- 1, /* threads */
- 0,
- 0,
- args,
- 0));
-
- unmap_pixels((rgba_byte) ? rgba_byte : rgba_half);
-
- cuda_assert(cuCtxSynchronize());
-}
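/* Editorial sketch, not part of this patch: film_convert() above derives a square 2D
 * block shape from the kernel's maximum threads per block. For example, with
 * threads_per_block = 1024 it uses 32x32 threads, so a 1920x1080 task launches
 * ceil(1920 / 32) x ceil(1080 / 32) = 60 x 34 blocks. The helper below is hypothetical
 * and only repeats that arithmetic. */
static void film_convert_launch_dims(int threads_per_block, int w, int h, int *xblocks, int *yblocks)
{
  const int xthreads = (int)sqrt(threads_per_block); /* 32 when threads_per_block is 1024. */
  const int ythreads = xthreads;
  *xblocks = (w + xthreads - 1) / xthreads; /* Round up so the whole width is covered. */
  *yblocks = (h + ythreads - 1) / ythreads;
}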
-
-void CUDADevice::shader(DeviceTask &task)
-{
- if (have_error())
- return;
-
- CUDAContextScope scope(this);
-
- CUfunction cuShader;
- CUdeviceptr d_input = (CUdeviceptr)task.shader_input;
- CUdeviceptr d_output = (CUdeviceptr)task.shader_output;
-
- /* get kernel function */
- if (task.shader_eval_type == SHADER_EVAL_DISPLACE) {
- cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace"));
- }
- else {
- cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background"));
- }
-
- /* do tasks in smaller chunks, so we can cancel them */
- const int shader_chunk_size = 65536;
- const int start = task.shader_x;
- const int end = task.shader_x + task.shader_w;
- int offset = task.offset;
-
- bool canceled = false;
- for (int sample = 0; sample < task.num_samples && !canceled; sample++) {
- for (int shader_x = start; shader_x < end; shader_x += shader_chunk_size) {
- int shader_w = min(shader_chunk_size, end - shader_x);
-
- /* pass in parameters */
- void *args[8];
- int arg = 0;
- args[arg++] = &d_input;
- args[arg++] = &d_output;
- args[arg++] = &task.shader_eval_type;
- if (task.shader_eval_type >= SHADER_EVAL_BAKE) {
- args[arg++] = &task.shader_filter;
- }
- args[arg++] = &shader_x;
- args[arg++] = &shader_w;
- args[arg++] = &offset;
- args[arg++] = &sample;
-
- /* launch kernel */
- int threads_per_block;
- cuda_assert(cuFuncGetAttribute(
- &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader));
-
- int xblocks = (shader_w + threads_per_block - 1) / threads_per_block;
-
- cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuLaunchKernel(cuShader,
- xblocks,
- 1,
- 1, /* blocks */
- threads_per_block,
- 1,
- 1, /* threads */
- 0,
- 0,
- args,
- 0));
-
- cuda_assert(cuCtxSynchronize());
-
- if (task.get_cancel()) {
- canceled = true;
- break;
- }
- }
-
- task.update_progress(NULL);
- }
-}
-
-CUdeviceptr CUDADevice::map_pixels(device_ptr mem)
-{
- if (!background) {
- PixelMem pmem = pixel_mem_map[mem];
- CUdeviceptr buffer;
-
- size_t bytes;
- cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0));
- cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource));
-
- return buffer;
- }
-
- return (CUdeviceptr)mem;
-}
-
-void CUDADevice::unmap_pixels(device_ptr mem)
-{
- if (!background) {
- PixelMem pmem = pixel_mem_map[mem];
-
- cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0));
- }
-}
-
-void CUDADevice::pixels_alloc(device_memory &mem)
-{
- PixelMem pmem;
-
- pmem.w = mem.data_width;
- pmem.h = mem.data_height;
-
- CUDAContextScope scope(this);
-
- glGenBuffers(1, &pmem.cuPBO);
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
- if (mem.data_type == TYPE_HALF)
- glBufferData(
- GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(GLhalf) * 4, NULL, GL_DYNAMIC_DRAW);
- else
- glBufferData(
- GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(uint8_t) * 4, NULL, GL_DYNAMIC_DRAW);
-
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-
- glActiveTexture(GL_TEXTURE0);
- glGenTextures(1, &pmem.cuTexId);
- glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
- if (mem.data_type == TYPE_HALF)
- glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL);
- else
- glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
- glBindTexture(GL_TEXTURE_2D, 0);
-
- CUresult result = cuGraphicsGLRegisterBuffer(
- &pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
-
- if (result == CUDA_SUCCESS) {
- mem.device_pointer = pmem.cuTexId;
- pixel_mem_map[mem.device_pointer] = pmem;
-
- mem.device_size = mem.memory_size();
- stats.mem_alloc(mem.device_size);
-
- return;
- }
- else {
- /* failed to register buffer, fallback to no interop */
- glDeleteBuffers(1, &pmem.cuPBO);
- glDeleteTextures(1, &pmem.cuTexId);
-
- background = true;
- }
-}
-
-void CUDADevice::pixels_copy_from(device_memory &mem, int y, int w, int h)
-{
- PixelMem pmem = pixel_mem_map[mem.device_pointer];
-
- CUDAContextScope scope(this);
-
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
- uchar *pixels = (uchar *)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
- size_t offset = sizeof(uchar) * 4 * y * w;
- memcpy((uchar *)mem.host_pointer + offset, pixels + offset, sizeof(uchar) * 4 * w * h);
- glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-}
-
-void CUDADevice::pixels_free(device_memory &mem)
-{
- if (mem.device_pointer) {
- PixelMem pmem = pixel_mem_map[mem.device_pointer];
-
- CUDAContextScope scope(this);
-
- cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource));
- glDeleteBuffers(1, &pmem.cuPBO);
- glDeleteTextures(1, &pmem.cuTexId);
-
- pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
- mem.device_pointer = 0;
-
- stats.mem_free(mem.device_size);
- mem.device_size = 0;
- }
-}
-
-void CUDADevice::draw_pixels(device_memory &mem,
- int y,
- int w,
- int h,
- int width,
- int height,
- int dx,
- int dy,
- int dw,
- int dh,
- bool transparent,
- const DeviceDrawParams &draw_params)
-{
- assert(mem.type == MEM_PIXELS);
-
- if (!background) {
- const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL);
- PixelMem pmem = pixel_mem_map[mem.device_pointer];
- float *vpointer;
-
- CUDAContextScope scope(this);
-
- /* for multi devices, this uses the inefficient approach of allocating
- * all pixels on the device even though we only render to a subset */
- size_t offset = 4 * y * w;
-
- if (mem.data_type == TYPE_HALF)
- offset *= sizeof(GLhalf);
- else
- offset *= sizeof(uint8_t);
-
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
- glActiveTexture(GL_TEXTURE0);
- glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
- if (mem.data_type == TYPE_HALF) {
- glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_HALF_FLOAT, (void *)offset);
- }
- else {
- glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void *)offset);
- }
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-
- if (transparent) {
- glEnable(GL_BLEND);
- glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
- }
-
- GLint shader_program;
- if (use_fallback_shader) {
- if (!bind_fallback_display_space_shader(dw, dh)) {
- return;
- }
- shader_program = fallback_shader_program;
- }
- else {
- draw_params.bind_display_space_shader_cb();
- glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program);
- }
-
- if (!vertex_buffer) {
- glGenBuffers(1, &vertex_buffer);
- }
-
- glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer);
- /* invalidate old contents -
- * avoids stalling if buffer is still waiting in queue to be rendered */
- glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);
-
- vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
-
- if (vpointer) {
- /* texture coordinate - vertex pair */
- vpointer[0] = 0.0f;
- vpointer[1] = 0.0f;
- vpointer[2] = dx;
- vpointer[3] = dy;
-
- vpointer[4] = (float)w / (float)pmem.w;
- vpointer[5] = 0.0f;
- vpointer[6] = (float)width + dx;
- vpointer[7] = dy;
-
- vpointer[8] = (float)w / (float)pmem.w;
- vpointer[9] = (float)h / (float)pmem.h;
- vpointer[10] = (float)width + dx;
- vpointer[11] = (float)height + dy;
-
- vpointer[12] = 0.0f;
- vpointer[13] = (float)h / (float)pmem.h;
- vpointer[14] = dx;
- vpointer[15] = (float)height + dy;
-
- glUnmapBuffer(GL_ARRAY_BUFFER);
- }
-
- GLuint vertex_array_object;
- GLuint position_attribute, texcoord_attribute;
-
- glGenVertexArrays(1, &vertex_array_object);
- glBindVertexArray(vertex_array_object);
-
- texcoord_attribute = glGetAttribLocation(shader_program, "texCoord");
- position_attribute = glGetAttribLocation(shader_program, "pos");
-
- glEnableVertexAttribArray(texcoord_attribute);
- glEnableVertexAttribArray(position_attribute);
-
- glVertexAttribPointer(
- texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0);
- glVertexAttribPointer(position_attribute,
- 2,
- GL_FLOAT,
- GL_FALSE,
- 4 * sizeof(float),
- (const GLvoid *)(sizeof(float) * 2));
-
- glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
-
- if (use_fallback_shader) {
- glUseProgram(0);
- }
- else {
- draw_params.unbind_display_space_shader_cb();
- }
-
- if (transparent) {
- glDisable(GL_BLEND);
- }
-
- glBindTexture(GL_TEXTURE_2D, 0);
-
- return;
- }
-
- Device::draw_pixels(mem, y, w, h, width, height, dx, dy, dw, dh, transparent, draw_params);
-}
-
-void CUDADevice::thread_run(DeviceTask &task)
-{
- CUDAContextScope scope(this);
-
- if (task.type == DeviceTask::RENDER) {
- DeviceRequestedFeatures requested_features;
- if (use_split_kernel()) {
- if (split_kernel == NULL) {
- split_kernel = new CUDASplitKernel(this);
- split_kernel->load_kernels(requested_features);
- }
- }
-
- device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
-
- /* keep rendering tiles until done */
- RenderTile tile;
- DenoisingTask denoising(this, task);
-
- while (task.acquire_tile(this, tile, task.tile_types)) {
- if (tile.task == RenderTile::PATH_TRACE) {
- if (use_split_kernel()) {
- device_only_memory<uchar> void_buffer(this, "void_buffer");
- split_kernel->path_trace(task, tile, void_buffer, void_buffer);
- }
- else {
- render(task, tile, work_tiles);
- }
- }
- else if (tile.task == RenderTile::BAKE) {
- render(task, tile, work_tiles);
- }
- else if (tile.task == RenderTile::DENOISE) {
- tile.sample = tile.start_sample + tile.num_samples;
-
- denoise(tile, denoising);
-
- task.update_progress(&tile, tile.w * tile.h);
- }
-
- task.release_tile(tile);
-
- if (task.get_cancel()) {
- if (task.need_finish_queue == false)
- break;
- }
- }
-
- work_tiles.free();
- }
- else if (task.type == DeviceTask::SHADER) {
- shader(task);
-
- cuda_assert(cuCtxSynchronize());
- }
- else if (task.type == DeviceTask::DENOISE_BUFFER) {
- RenderTile tile;
- tile.x = task.x;
- tile.y = task.y;
- tile.w = task.w;
- tile.h = task.h;
- tile.buffer = task.buffer;
- tile.sample = task.sample + task.num_samples;
- tile.num_samples = task.num_samples;
- tile.start_sample = task.sample;
- tile.offset = task.offset;
- tile.stride = task.stride;
- tile.buffers = task.buffers;
-
- DenoisingTask denoising(this, task);
- denoise(tile, denoising);
- task.update_progress(&tile, tile.w * tile.h);
- }
-}
-
-void CUDADevice::task_add(DeviceTask &task)
-{
- CUDAContextScope scope(this);
-
- /* Load texture info. */
- load_texture_info();
-
- /* Synchronize all memory copies before executing task. */
- cuda_assert(cuCtxSynchronize());
-
- if (task.type == DeviceTask::FILM_CONVERT) {
- /* must be done in main thread due to opengl access */
- film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
- }
- else {
- task_pool.push([=] {
- DeviceTask task_copy = task;
- thread_run(task_copy);
- });
- }
-}
-
-void CUDADevice::task_wait()
-{
- task_pool.wait();
-}
-
-void CUDADevice::task_cancel()
-{
- task_pool.cancel();
-}
-
-/* redefine the cuda_assert macro so it can be used outside of the CUDADevice class
- * now that the definition of that class is complete
- */
-# undef cuda_assert
-# define cuda_assert(stmt) \
- { \
- CUresult result = stmt; \
- if (result != CUDA_SUCCESS) { \
- const char *name = cuewErrorString(result); \
- device->set_error( \
- string_printf("%s in %s (device_cuda_impl.cpp:%d)", name, #stmt, __LINE__)); \
- } \
- } \
- (void)0
-
-/* CUDA context scope. */
-
-CUDAContextScope::CUDAContextScope(CUDADevice *device) : device(device)
-{
- cuda_assert(cuCtxPushCurrent(device->cuContext));
-}
-
-CUDAContextScope::~CUDAContextScope()
-{
- cuda_assert(cuCtxPopCurrent(NULL));
-}
-
-/* split kernel */
-
-class CUDASplitKernelFunction : public SplitKernelFunction {
- CUDADevice *device;
- CUfunction func;
-
- public:
- CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func)
- {
- }
-
- /* enqueue the kernel, returns false if there is an error */
- bool enqueue(const KernelDimensions &dim, device_memory & /*kg*/, device_memory & /*data*/)
- {
- return enqueue(dim, NULL);
- }
-
- /* enqueue the kernel, returns false if there is an error */
- bool enqueue(const KernelDimensions &dim, void *args[])
- {
- if (device->have_error())
- return false;
-
- CUDAContextScope scope(device);
-
- /* we ignore dim.local_size for now, as this is faster */
- int threads_per_block;
- cuda_assert(
- cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
-
- int xblocks = (dim.global_size[0] * dim.global_size[1] + threads_per_block - 1) /
- threads_per_block;
-
- cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));
-
- cuda_assert(cuLaunchKernel(func,
- xblocks,
- 1,
- 1, /* blocks */
- threads_per_block,
- 1,
- 1, /* threads */
- 0,
- 0,
- args,
- 0));
-
- return !device->have_error();
- }
-};
-
-CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device)
-{
-}
-
-uint64_t CUDASplitKernel::state_buffer_size(device_memory & /*kg*/,
- device_memory & /*data*/,
- size_t num_threads)
-{
- CUDAContextScope scope(device);
-
- device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE);
- size_buffer.alloc(1);
- size_buffer.zero_to_device();
-
- uint threads = num_threads;
- CUdeviceptr d_size = (CUdeviceptr)size_buffer.device_pointer;
-
- struct args_t {
- uint *num_threads;
- CUdeviceptr *size;
- };
-
- args_t args = {&threads, &d_size};
-
- CUfunction state_buffer_size;
- cuda_assert(
- cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size"));
-
- cuda_assert(cuLaunchKernel(state_buffer_size, 1, 1, 1, 1, 1, 1, 0, 0, (void **)&args, 0));
-
- size_buffer.copy_from_device(0, 1, 1);
- size_t size = size_buffer[0];
- size_buffer.free();
-
- return size;
-}
-
-bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim,
- RenderTile &rtile,
- int num_global_elements,
- device_memory & /*kernel_globals*/,
- device_memory & /*kernel_data*/,
- device_memory &split_data,
- device_memory &ray_state,
- device_memory &queue_index,
- device_memory &use_queues_flag,
- device_memory &work_pool_wgs)
-{
- CUDAContextScope scope(device);
-
- CUdeviceptr d_split_data = (CUdeviceptr)split_data.device_pointer;
- CUdeviceptr d_ray_state = (CUdeviceptr)ray_state.device_pointer;
- CUdeviceptr d_queue_index = (CUdeviceptr)queue_index.device_pointer;
- CUdeviceptr d_use_queues_flag = (CUdeviceptr)use_queues_flag.device_pointer;
- CUdeviceptr d_work_pool_wgs = (CUdeviceptr)work_pool_wgs.device_pointer;
-
- CUdeviceptr d_buffer = (CUdeviceptr)rtile.buffer;
-
- int end_sample = rtile.start_sample + rtile.num_samples;
- int queue_size = dim.global_size[0] * dim.global_size[1];
-
- struct args_t {
- CUdeviceptr *split_data_buffer;
- int *num_elements;
- CUdeviceptr *ray_state;
- int *start_sample;
- int *end_sample;
- int *sx;
- int *sy;
- int *sw;
- int *sh;
- int *offset;
- int *stride;
- CUdeviceptr *queue_index;
- int *queuesize;
- CUdeviceptr *use_queues_flag;
- CUdeviceptr *work_pool_wgs;
- int *num_samples;
- CUdeviceptr *buffer;
- };
-
- args_t args = {&d_split_data,
- &num_global_elements,
- &d_ray_state,
- &rtile.start_sample,
- &end_sample,
- &rtile.x,
- &rtile.y,
- &rtile.w,
- &rtile.h,
- &rtile.offset,
- &rtile.stride,
- &d_queue_index,
- &queue_size,
- &d_use_queues_flag,
- &d_work_pool_wgs,
- &rtile.num_samples,
- &d_buffer};
-
- CUfunction data_init;
- cuda_assert(
- cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init"));
- if (device->have_error()) {
- return false;
- }
-
- CUDASplitKernelFunction(device, data_init).enqueue(dim, (void **)&args);
-
- return !device->have_error();
-}
-
-SplitKernelFunction *CUDASplitKernel::get_split_kernel_function(const string &kernel_name,
- const DeviceRequestedFeatures &)
-{
- const CUDAContextScope scope(device);
-
- CUfunction func;
- const CUresult result = cuModuleGetFunction(
- &func, device->cuModule, (string("kernel_cuda_") + kernel_name).data());
- if (result != CUDA_SUCCESS) {
- device->set_error(string_printf("Could not find kernel \"kernel_cuda_%s\" in module (%s)",
- kernel_name.data(),
- cuewErrorString(result)));
- return NULL;
- }
-
- return new CUDASplitKernelFunction(device, func);
-}
-
-int2 CUDASplitKernel::split_kernel_local_size()
-{
- return make_int2(32, 1);
-}
-
-int2 CUDASplitKernel::split_kernel_global_size(device_memory &kg,
- device_memory &data,
- DeviceTask & /*task*/)
-{
- CUDAContextScope scope(device);
- size_t free;
- size_t total;
-
- cuda_assert(cuMemGetInfo(&free, &total));
-
- VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(free)
- << " bytes. (" << string_human_readable_size(free) << ").";
-
- size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2);
- size_t side = round_down((int)sqrt(num_elements), 32);
- int2 global_size = make_int2(side, round_down(num_elements / side, 16));
- VLOG(1) << "Global size: " << global_size << ".";
- return global_size;
-}
-
-CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/device/cuda/device_impl.cpp b/intern/cycles/device/cuda/device_impl.cpp
new file mode 100644
index 00000000000..37fab8f8293
--- /dev/null
+++ b/intern/cycles/device/cuda/device_impl.cpp
@@ -0,0 +1,1370 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+# include <climits>
+# include <limits.h>
+# include <stdio.h>
+# include <stdlib.h>
+# include <string.h>
+
+# include "device/cuda/device_impl.h"
+
+# include "render/buffers.h"
+
+# include "util/util_debug.h"
+# include "util/util_foreach.h"
+# include "util/util_logging.h"
+# include "util/util_map.h"
+# include "util/util_md5.h"
+# include "util/util_opengl.h"
+# include "util/util_path.h"
+# include "util/util_string.h"
+# include "util/util_system.h"
+# include "util/util_time.h"
+# include "util/util_types.h"
+# include "util/util_windows.h"
+
+CCL_NAMESPACE_BEGIN
+
+class CUDADevice;
+
+bool CUDADevice::have_precompiled_kernels()
+{
+ string cubins_path = path_get("lib");
+ return path_exists(cubins_path);
+}
+
+bool CUDADevice::show_samples() const
+{
+ /* The CUDADevice only processes one tile at a time, so showing samples is fine. */
+ return true;
+}
+
+BVHLayoutMask CUDADevice::get_bvh_layout_mask() const
+{
+ return BVH_LAYOUT_BVH2;
+}
+
+void CUDADevice::set_error(const string &error)
+{
+ Device::set_error(error);
+
+ if (first_error) {
+ fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
+ fprintf(stderr,
+ "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n");
+ first_error = false;
+ }
+}
+
+CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+ : Device(info, stats, profiler), texture_info(this, "__texture_info", MEM_GLOBAL)
+{
+ first_error = true;
+
+ cuDevId = info.num;
+ cuDevice = 0;
+ cuContext = 0;
+
+ cuModule = 0;
+
+ need_texture_info = false;
+
+ device_texture_headroom = 0;
+ device_working_headroom = 0;
+ move_texture_to_host = false;
+ map_host_limit = 0;
+ map_host_used = 0;
+ can_map_host = 0;
+ pitch_alignment = 0;
+
+ /* Initialize CUDA. */
+ CUresult result = cuInit(0);
+ if (result != CUDA_SUCCESS) {
+ set_error(string_printf("Failed to initialize CUDA runtime (%s)", cuewErrorString(result)));
+ return;
+ }
+
+ /* Setup device and context. */
+ result = cuDeviceGet(&cuDevice, cuDevId);
+ if (result != CUDA_SUCCESS) {
+ set_error(string_printf("Failed to get CUDA device handle from ordinal (%s)",
+ cuewErrorString(result)));
+ return;
+ }
+
+ /* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
+ * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
+ * so we can predict which memory to map to host. */
+ cuda_assert(
+ cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
+
+ cuda_assert(cuDeviceGetAttribute(
+ &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
+
+ unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX;
+ if (can_map_host) {
+ ctx_flags |= CU_CTX_MAP_HOST;
+ init_host_memory();
+ }
+
+ /* Create context. */
+ result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
+
+ if (result != CUDA_SUCCESS) {
+ set_error(string_printf("Failed to create CUDA context (%s)", cuewErrorString(result)));
+ return;
+ }
+
+ int major, minor;
+ cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
+ cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
+ cuDevArchitecture = major * 100 + minor * 10;
+
+ /* Pop context set by cuCtxCreate. */
+ cuCtxPopCurrent(NULL);
+}
+
+CUDADevice::~CUDADevice()
+{
+ texture_info.free();
+
+ cuda_assert(cuCtxDestroy(cuContext));
+}
+
+bool CUDADevice::support_device(const uint /*kernel_features*/)
+{
+ int major, minor;
+ cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
+ cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
+
+ /* We only support sm_30 and above */
+ if (major < 3) {
+ set_error(string_printf(
+ "CUDA backend requires compute capability 3.0 or up, but found %d.%d.", major, minor));
+ return false;
+ }
+
+ return true;
+}
+
+bool CUDADevice::check_peer_access(Device *peer_device)
+{
+ if (peer_device == this) {
+ return false;
+ }
+ if (peer_device->info.type != DEVICE_CUDA && peer_device->info.type != DEVICE_OPTIX) {
+ return false;
+ }
+
+ CUDADevice *const peer_device_cuda = static_cast<CUDADevice *>(peer_device);
+
+ int can_access = 0;
+ cuda_assert(cuDeviceCanAccessPeer(&can_access, cuDevice, peer_device_cuda->cuDevice));
+ if (can_access == 0) {
+ return false;
+ }
+
+ // Ensure array access over the link is possible as well (for 3D textures)
+ cuda_assert(cuDeviceGetP2PAttribute(&can_access,
+ CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED,
+ cuDevice,
+ peer_device_cuda->cuDevice));
+ if (can_access == 0) {
+ return false;
+ }
+
+ // Enable peer access in both directions
+ {
+ const CUDAContextScope scope(this);
+ CUresult result = cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0);
+ if (result != CUDA_SUCCESS) {
+ set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
+ cuewErrorString(result)));
+ return false;
+ }
+ }
+ {
+ const CUDAContextScope scope(peer_device_cuda);
+ CUresult result = cuCtxEnablePeerAccess(cuContext, 0);
+ if (result != CUDA_SUCCESS) {
+ set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
+ cuewErrorString(result)));
+ return false;
+ }
+ }
+
+ return true;
+}
+
+bool CUDADevice::use_adaptive_compilation()
+{
+ return DebugFlags().cuda.adaptive_compile;
+}
+
+/* Common NVCC flags which stay the same regardless of shading model or
+ * kernel sources MD5, and only depend on compiler or compilation settings.
+ */
+string CUDADevice::compile_kernel_get_common_cflags(const uint kernel_features)
+{
+ const int machine = system_cpu_bits();
+ const string source_path = path_get("source");
+ const string include_path = source_path;
+ string cflags = string_printf(
+ "-m%d "
+ "--ptxas-options=\"-v\" "
+ "--use_fast_math "
+ "-DNVCC "
+ "-I\"%s\"",
+ machine,
+ include_path.c_str());
+ if (use_adaptive_compilation()) {
+ cflags += " -D__KERNEL_FEATURES__=" + to_string(kernel_features);
+ }
+ const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
+ if (extra_cflags) {
+ cflags += string(" ") + string(extra_cflags);
+ }
+
+# ifdef WITH_NANOVDB
+ cflags += " -DWITH_NANOVDB";
+# endif
+
+ return cflags;
+}
+
+string CUDADevice::compile_kernel(const uint kernel_features,
+ const char *name,
+ const char *base,
+ bool force_ptx)
+{
+ /* Compute kernel name. */
+ int major, minor;
+ cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
+ cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
+
+ /* Attempt to use kernel provided with Blender. */
+ if (!use_adaptive_compilation()) {
+ if (!force_ptx) {
+ const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor));
+ VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
+ if (path_exists(cubin)) {
+ VLOG(1) << "Using precompiled kernel.";
+ return cubin;
+ }
+ }
+
+ /* The driver can JIT-compile PTX generated for older generations, so find the closest one. */
+ int ptx_major = major, ptx_minor = minor;
+ while (ptx_major >= 3) {
+ const string ptx = path_get(
+ string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor));
+ VLOG(1) << "Testing for pre-compiled kernel " << ptx << ".";
+ if (path_exists(ptx)) {
+ VLOG(1) << "Using precompiled kernel.";
+ return ptx;
+ }
+
+ if (ptx_minor > 0) {
+ ptx_minor--;
+ }
+ else {
+ ptx_major--;
+ ptx_minor = 9;
+ }
+ }
+ }
+
+ /* Try to use locally compiled kernel. */
+ string source_path = path_get("source");
+ const string source_md5 = path_files_md5_hash(source_path);
+
+ /* We include cflags in the MD5 so that changing the CUDA toolkit or other
+ * compiler command line arguments makes sure the cubin gets re-built.
+ */
+ string common_cflags = compile_kernel_get_common_cflags(kernel_features);
+ const string kernel_md5 = util_md5_string(source_md5 + common_cflags);
+
+ const char *const kernel_ext = force_ptx ? "ptx" : "cubin";
+ const char *const kernel_arch = force_ptx ? "compute" : "sm";
+ const string cubin_file = string_printf(
+ "cycles_%s_%s_%d%d_%s.%s", name, kernel_arch, major, minor, kernel_md5.c_str(), kernel_ext);
+ const string cubin = path_cache_get(path_join("kernels", cubin_file));
+ VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
+ if (path_exists(cubin)) {
+ VLOG(1) << "Using locally compiled kernel.";
+ return cubin;
+ }
+
+# ifdef _WIN32
+ if (!use_adaptive_compilation() && have_precompiled_kernels()) {
+ if (major < 3) {
+ set_error(
+ string_printf("CUDA backend requires compute capability 3.0 or up, but found %d.%d. "
+ "Your GPU is not supported.",
+ major,
+ minor));
+ }
+ else {
+ set_error(
+ string_printf("CUDA binary kernel for this graphics card compute "
+ "capability (%d.%d) not found.",
+ major,
+ minor));
+ }
+ return string();
+ }
+# endif
+
+ /* Compile. */
+ const char *const nvcc = cuewCompilerPath();
+ if (nvcc == NULL) {
+ set_error(
+ "CUDA nvcc compiler not found. "
+ "Install CUDA toolkit in default location.");
+ return string();
+ }
+
+ const int nvcc_cuda_version = cuewCompilerVersion();
+ VLOG(1) << "Found nvcc " << nvcc << ", CUDA version " << nvcc_cuda_version << ".";
+ if (nvcc_cuda_version < 101) {
+ printf(
+ "Unsupported CUDA version %d.%d detected, "
+ "you need CUDA 10.1 or newer.\n",
+ nvcc_cuda_version / 10,
+ nvcc_cuda_version % 10);
+ return string();
+ }
+ else if (!(nvcc_cuda_version == 101 || nvcc_cuda_version == 102 || nvcc_cuda_version == 111 ||
+ nvcc_cuda_version == 112 || nvcc_cuda_version == 113 || nvcc_cuda_version == 114)) {
+ printf(
+ "CUDA version %d.%d detected, build may succeed but only "
+ "CUDA 10.1 to 11.4 are officially supported.\n",
+ nvcc_cuda_version / 10,
+ nvcc_cuda_version % 10);
+ }
+
+ double starttime = time_dt();
+
+ path_create_directories(cubin);
+
+ source_path = path_join(path_join(source_path, "kernel"),
+ path_join("device", path_join(base, string_printf("%s.cu", name))));
+
+ string command = string_printf(
+ "\"%s\" "
+ "-arch=%s_%d%d "
+ "--%s \"%s\" "
+ "-o \"%s\" "
+ "%s",
+ nvcc,
+ kernel_arch,
+ major,
+ minor,
+ kernel_ext,
+ source_path.c_str(),
+ cubin.c_str(),
+ common_cflags.c_str());
+
+ printf("Compiling CUDA kernel ...\n%s\n", command.c_str());
+
+# ifdef _WIN32
+ command = "call " + command;
+# endif
+ if (system(command.c_str()) != 0) {
+ set_error(
+ "Failed to execute compilation command, "
+ "see console for details.");
+ return string();
+ }
+
+ /* Verify that compilation succeeded. */
+ if (!path_exists(cubin)) {
+ set_error(
+ "CUDA kernel compilation failed, "
+ "see console for details.");
+ return string();
+ }
+
+ printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
+
+ return cubin;
+}
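/* Editorial sketch, not part of this patch: the PTX fallback loop in compile_kernel()
 * above walks compute capabilities downwards, so a device reporting sm_7.5 probes
 * compute_75, compute_74, ..., compute_70, compute_69, ... down to compute_30 and uses
 * the first PTX file that exists. find_closest_ptx() is a hypothetical standalone
 * version of that search, reusing path_get()/path_exists()/string_printf() from above. */
static string find_closest_ptx(const char *name, int major, int minor)
{
  while (major >= 3) {
    const string ptx = path_get(string_printf("lib/%s_compute_%d%d.ptx", name, major, minor));
    if (path_exists(ptx)) {
      return ptx; /* Closest PTX the driver can still JIT-compile for this GPU. */
    }
    if (minor > 0) {
      minor--;
    }
    else {
      major--;
      minor = 9;
    }
  }
  return string(); /* No usable PTX found. */
}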
+
+bool CUDADevice::load_kernels(const uint kernel_features)
+{
+ /* TODO(sergey): Support kernel re-loading for CUDA devices.
+ *
+ * Currently re-loading the kernel will invalidate memory pointers,
+ * causing problems in cuCtxSynchronize.
+ */
+ if (cuModule) {
+ VLOG(1) << "Skipping kernel reload, not currently supported.";
+ return true;
+ }
+
+ /* check if cuda init succeeded */
+ if (cuContext == 0)
+ return false;
+
+ /* check if GPU is supported */
+ if (!support_device(kernel_features))
+ return false;
+
+ /* get kernel */
+ const char *kernel_name = "kernel";
+ string cubin = compile_kernel(kernel_features, kernel_name);
+ if (cubin.empty())
+ return false;
+
+ /* open module */
+ CUDAContextScope scope(this);
+
+ string cubin_data;
+ CUresult result;
+
+ if (path_read_text(cubin, cubin_data))
+ result = cuModuleLoadData(&cuModule, cubin_data.c_str());
+ else
+ result = CUDA_ERROR_FILE_NOT_FOUND;
+
+ if (result != CUDA_SUCCESS)
+ set_error(string_printf(
+ "Failed to load CUDA kernel from '%s' (%s)", cubin.c_str(), cuewErrorString(result)));
+
+ if (result == CUDA_SUCCESS) {
+ kernels.load(this);
+ reserve_local_memory(kernel_features);
+ }
+
+ return (result == CUDA_SUCCESS);
+}
+
+void CUDADevice::reserve_local_memory(const uint /* kernel_features */)
+{
+ /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory
+ * needed for kernel launches, so that we can reliably figure out when
+ * to allocate scene data in mapped host memory. */
+ size_t total = 0, free_before = 0, free_after = 0;
+
+ {
+ CUDAContextScope scope(this);
+ cuMemGetInfo(&free_before, &total);
+ }
+
+ {
+ /* Use the biggest kernel for estimation. */
+ const DeviceKernel test_kernel = DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE;
+
+ /* Launch the kernel; using just 1 block appears sufficient to reserve memory for all
+ * multiprocessors. It would still be good to do this in parallel for the multi-GPU
+ * case to make it faster. */
+ CUDADeviceQueue queue(this);
+
+ void *d_path_index = nullptr;
+ void *d_render_buffer = nullptr;
+ int d_work_size = 0;
+ void *args[] = {&d_path_index, &d_render_buffer, &d_work_size};
+
+ queue.init_execution();
+ queue.enqueue(test_kernel, 1, args);
+ queue.synchronize();
+ }
+
+ {
+ CUDAContextScope scope(this);
+ cuMemGetInfo(&free_after, &total);
+ }
+
+ VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after)
+ << " bytes. (" << string_human_readable_size(free_before - free_after) << ")";
+
+# if 0
+ /* For testing mapped host memory, fill up device memory. */
+ const size_t keep_mb = 1024;
+
+ while (free_after > keep_mb * 1024 * 1024LL) {
+ CUdeviceptr tmp;
+ cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL));
+ cuMemGetInfo(&free_after, &total);
+ }
+# endif
+}
+
+void CUDADevice::init_host_memory()
+{
+ /* Limit amount of host mapped memory, because allocating too much can
+ * cause system instability. Leave at least half or 4 GB of system
+ * memory free, whichever is smaller. */
+ size_t default_limit = 4 * 1024 * 1024 * 1024LL;
+ size_t system_ram = system_physical_ram();
+
+ if (system_ram > 0) {
+ if (system_ram / 2 > default_limit) {
+ map_host_limit = system_ram - default_limit;
+ }
+ else {
+ map_host_limit = system_ram / 2;
+ }
+ }
+ else {
+ VLOG(1) << "Mapped host memory disabled, failed to get system RAM";
+ map_host_limit = 0;
+ }
+
+ /* Amount of device memory to keep free after texture memory
+ * and working memory allocations respectively. We set the working
+ * memory limit headroom lower so that some space is left after all
+ * texture memory allocations. */
+ device_working_headroom = 32 * 1024 * 1024LL; // 32MB
+ device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
+
+ VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
+ << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
+}
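/* Editorial sketch, not part of this patch: the limit computed in init_host_memory()
 * above keeps min(half of system RAM, 4 GiB) free. With 16 GiB of RAM, half (8 GiB)
 * exceeds the 4 GiB default, so map_host_limit = 16 - 4 = 12 GiB; with 6 GiB of RAM,
 * half (3 GiB) does not, so map_host_limit = 3 GiB. The hypothetical helper below
 * isolates that computation. */
static size_t compute_map_host_limit(size_t system_ram)
{
  const size_t default_limit = 4 * 1024 * 1024 * 1024LL;
  if (system_ram == 0) {
    return 0; /* Unknown RAM size: disable mapped host memory. */
  }
  return (system_ram / 2 > default_limit) ? system_ram - default_limit : system_ram / 2;
}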
+
+void CUDADevice::load_texture_info()
+{
+ if (need_texture_info) {
+ /* Unset flag before copying, so this does not loop indefinitely if the copy below calls
+ * into 'move_textures_to_host' (which calls 'load_texture_info' again). */
+ need_texture_info = false;
+ texture_info.copy_to_device();
+ }
+}
+
+void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
+{
+ /* Break out of recursive call, which can happen when moving memory on a multi device. */
+ static bool any_device_moving_textures_to_host = false;
+ if (any_device_moving_textures_to_host) {
+ return;
+ }
+
+ /* Signal to reallocate textures in host memory only. */
+ move_texture_to_host = true;
+
+ while (size > 0) {
+ /* Find suitable memory allocation to move. */
+ device_memory *max_mem = NULL;
+ size_t max_size = 0;
+ bool max_is_image = false;
+
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ foreach (CUDAMemMap::value_type &pair, cuda_mem_map) {
+ device_memory &mem = *pair.first;
+ CUDAMem *cmem = &pair.second;
+
+ /* Can only move textures allocated on this device (and not those from peer devices).
+ * We also need to ignore memory that is already on the host. */
+ if (!mem.is_resident(this) || cmem->use_mapped_host) {
+ continue;
+ }
+
+ bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
+ (&mem != &texture_info);
+ bool is_image = is_texture && (mem.data_height > 1);
+
+ /* Can't move this type of memory. */
+ if (!is_texture || cmem->array) {
+ continue;
+ }
+
+ /* For other textures, only move image textures. */
+ if (for_texture && !is_image) {
+ continue;
+ }
+
+ /* Try to move largest allocation, prefer moving images. */
+ if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
+ max_is_image = is_image;
+ max_size = mem.device_size;
+ max_mem = &mem;
+ }
+ }
+ lock.unlock();
+
+ /* Move to host memory. This part is mutex protected since
+ * multiple CUDA devices could be moving the memory. The
+ * first one will do it, and the rest will adopt the pointer. */
+ if (max_mem) {
+ VLOG(1) << "Move memory from device to host: " << max_mem->name;
+
+ static thread_mutex move_mutex;
+ thread_scoped_lock lock(move_mutex);
+
+ any_device_moving_textures_to_host = true;
+
+ /* Potentially need to call back into the multi device, so pointer mapping
+ * and peer devices are updated. This is also necessary since the device
+ * pointer may just be a key here, so it cannot be accessed and freed directly.
+ * Unfortunately it does mean that memory is reallocated on all other
+ * devices as well, which is potentially dangerous when still in use (since
+ * a thread rendering on another device would only be caught in this mutex
+ * if it happens to do an allocation at the same time as well). */
+ max_mem->device_copy_to();
+ size = (max_size >= size) ? 0 : size - max_size;
+
+ any_device_moving_textures_to_host = false;
+ }
+ else {
+ break;
+ }
+ }
+
+ /* Unset flag before texture info is reloaded, since it should stay in device memory. */
+ move_texture_to_host = false;
+
+ /* Update texture info array with new pointers. */
+ load_texture_info();
+}
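/* Editorial sketch, not part of this patch: the selection loop in
 * move_textures_to_host() above prefers image textures over 1D data and, among
 * candidates of the same kind, picks the largest allocation. The hypothetical
 * comparator below expresses the same ordering explicitly. */
struct MoveCandidate {
  bool is_image;
  size_t size;
};

static bool is_better_move_candidate(const MoveCandidate &a, const MoveCandidate &best)
{
  /* Images win over non-images; otherwise the bigger allocation wins. */
  return (a.is_image && !best.is_image) || (a.is_image == best.is_image && a.size > best.size);
}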
+
+CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding)
+{
+ CUDAContextScope scope(this);
+
+ CUdeviceptr device_pointer = 0;
+ size_t size = mem.memory_size() + pitch_padding;
+
+ CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
+ const char *status = "";
+
+ /* First try allocating in device memory, respecting headroom. We make
+ * an exception for texture info. It is small and frequently accessed,
+ * so treat it as working memory.
+ *
+ * If there is not enough room for working memory, we will try to move
+ * textures to host memory, assuming the performance impact would have
+ * been worse for working memory. */
+ bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
+ bool is_image = is_texture && (mem.data_height > 1);
+
+ size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
+
+ size_t total = 0, free = 0;
+ cuMemGetInfo(&free, &total);
+
+ /* Move textures to host memory if needed. */
+ if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
+ move_textures_to_host(size + headroom - free, is_texture);
+ cuMemGetInfo(&free, &total);
+ }
+
+ /* Allocate in device memory. */
+ if (!move_texture_to_host && (size + headroom) < free) {
+ mem_alloc_result = cuMemAlloc(&device_pointer, size);
+ if (mem_alloc_result == CUDA_SUCCESS) {
+ status = " in device memory";
+ }
+ }
+
+ /* Fall back to mapped host memory if needed and possible. */
+
+ void *shared_pointer = 0;
+
+ if (mem_alloc_result != CUDA_SUCCESS && can_map_host) {
+ if (mem.shared_pointer) {
+ /* Another device already allocated host memory. */
+ mem_alloc_result = CUDA_SUCCESS;
+ shared_pointer = mem.shared_pointer;
+ }
+ else if (map_host_used + size < map_host_limit) {
+ /* Allocate host memory ourselves. */
+ mem_alloc_result = cuMemHostAlloc(
+ &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
+
+ assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) ||
+ (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0));
+ }
+
+ if (mem_alloc_result == CUDA_SUCCESS) {
+ cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0));
+ map_host_used += size;
+ status = " in host memory";
+ }
+ }
+
+ if (mem_alloc_result != CUDA_SUCCESS) {
+ status = " failed, out of device and host memory";
+ set_error("System is out of GPU and shared host memory");
+ }
+
+ if (mem.name) {
+ VLOG(1) << "Buffer allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")" << status;
+ }
+
+ mem.device_pointer = (device_ptr)device_pointer;
+ mem.device_size = size;
+ stats.mem_alloc(size);
+
+ if (!mem.device_pointer) {
+ return NULL;
+ }
+
+ /* Insert into map of allocations. */
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ CUDAMem *cmem = &cuda_mem_map[&mem];
+ if (shared_pointer != 0) {
+ /* Replace host pointer with our host allocation. Only works if
+ * CUDA memory layout is the same and has no pitch padding. Also
+ * does not work if we move textures to host during a render,
+ * since other devices might be using the memory. */
+
+ if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
+ mem.host_pointer != shared_pointer) {
+ memcpy(shared_pointer, mem.host_pointer, size);
+
+ /* A call to device_memory::host_free() should be preceded by
+ * a call to device_memory::device_free() for host memory
+ * allocated by a device to be handled properly. Two exceptions
+ * are here and a call in OptiXDevice::generic_alloc(), where
+ * the current host memory can be assumed to be allocated by
+ * device_memory::host_alloc(), not by a device. */
+
+ mem.host_free();
+ mem.host_pointer = shared_pointer;
+ }
+ mem.shared_pointer = shared_pointer;
+ mem.shared_counter++;
+ cmem->use_mapped_host = true;
+ }
+ else {
+ cmem->use_mapped_host = false;
+ }
+
+ return cmem;
+}
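/* Editorial sketch, not part of this patch: generic_alloc() above is effectively a
 * three-tier fallback: device memory (respecting headroom, after possibly moving
 * textures to the host), then mapped host memory, then failure. The enum and function
 * below are hypothetical and drop locking, statistics and shared-pointer bookkeeping. */
enum AllocTier { ALLOC_DEVICE, ALLOC_MAPPED_HOST, ALLOC_FAILED };

static AllocTier choose_alloc_tier(size_t size,
                                   size_t headroom,
                                   size_t free_device,
                                   bool can_map_host,
                                   size_t map_host_used,
                                   size_t map_host_limit)
{
  if (size + headroom < free_device) {
    return ALLOC_DEVICE; /* Fits in device memory with headroom to spare. */
  }
  if (can_map_host && map_host_used + size < map_host_limit) {
    return ALLOC_MAPPED_HOST; /* Page-locked host memory mapped into the device. */
  }
  return ALLOC_FAILED; /* Out of both device and shared host memory. */
}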
+
+void CUDADevice::generic_copy_to(device_memory &mem)
+{
+ if (!mem.host_pointer || !mem.device_pointer) {
+ return;
+ }
+
+ /* If use_mapped_host of mem is false, the current device only uses device memory allocated by
+ * cuMemAlloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from
+ * mem.host_pointer. */
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
+ const CUDAContextScope scope(this);
+ cuda_assert(
+ cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size()));
+ }
+}
+
+void CUDADevice::generic_free(device_memory &mem)
+{
+ if (mem.device_pointer) {
+ CUDAContextScope scope(this);
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ const CUDAMem &cmem = cuda_mem_map[&mem];
+
+ /* If cmem.use_mapped_host is true, reference counting is used
+ * to safely free the mapped host memory. */
+
+ if (cmem.use_mapped_host) {
+ assert(mem.shared_pointer);
+ if (mem.shared_pointer) {
+ assert(mem.shared_counter > 0);
+ if (--mem.shared_counter == 0) {
+ if (mem.host_pointer == mem.shared_pointer) {
+ mem.host_pointer = 0;
+ }
+ cuMemFreeHost(mem.shared_pointer);
+ mem.shared_pointer = 0;
+ }
+ }
+ map_host_used -= mem.device_size;
+ }
+ else {
+ /* Free device memory. */
+ cuda_assert(cuMemFree(mem.device_pointer));
+ }
+
+ stats.mem_free(mem.device_size);
+ mem.device_pointer = 0;
+ mem.device_size = 0;
+
+ cuda_mem_map.erase(cuda_mem_map.find(&mem));
+ }
+}
+
+void CUDADevice::mem_alloc(device_memory &mem)
+{
+ if (mem.type == MEM_TEXTURE) {
+ assert(!"mem_alloc not supported for textures.");
+ }
+ else if (mem.type == MEM_GLOBAL) {
+ assert(!"mem_alloc not supported for global memory.");
+ }
+ else {
+ generic_alloc(mem);
+ }
+}
+
+void CUDADevice::mem_copy_to(device_memory &mem)
+{
+ if (mem.type == MEM_GLOBAL) {
+ global_free(mem);
+ global_alloc(mem);
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ tex_free((device_texture &)mem);
+ tex_alloc((device_texture &)mem);
+ }
+ else {
+ if (!mem.device_pointer) {
+ generic_alloc(mem);
+ }
+ generic_copy_to(mem);
+ }
+}
+
+void CUDADevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
+{
+ if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) {
+ assert(!"mem_copy_from not supported for textures.");
+ }
+ else if (mem.host_pointer) {
+ const size_t size = elem * w * h;
+ const size_t offset = elem * y * w;
+
+ if (mem.device_pointer) {
+ const CUDAContextScope scope(this);
+ cuda_assert(cuMemcpyDtoH(
+ (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size));
+ }
+ else {
+ memset((char *)mem.host_pointer + offset, 0, size);
+ }
+ }
+}
+
+void CUDADevice::mem_zero(device_memory &mem)
+{
+ if (!mem.device_pointer) {
+ mem_alloc(mem);
+ }
+ if (!mem.device_pointer) {
+ return;
+ }
+
+ /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
+ * regardless of mem.host_pointer and mem.shared_pointer. */
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
+ const CUDAContextScope scope(this);
+ cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
+ }
+ else if (mem.host_pointer) {
+ memset(mem.host_pointer, 0, mem.memory_size());
+ }
+}
+
+void CUDADevice::mem_free(device_memory &mem)
+{
+ if (mem.type == MEM_GLOBAL) {
+ global_free(mem);
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ tex_free((device_texture &)mem);
+ }
+ else {
+ generic_free(mem);
+ }
+}
+
+device_ptr CUDADevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/)
+{
+ return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
+}
+
+void CUDADevice::const_copy_to(const char *name, void *host, size_t size)
+{
+ CUDAContextScope scope(this);
+ CUdeviceptr mem;
+ size_t bytes;
+
+ cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name));
+ // assert(bytes == size);
+ cuda_assert(cuMemcpyHtoD(mem, host, size));
+}
+
+void CUDADevice::global_alloc(device_memory &mem)
+{
+ if (mem.is_resident(this)) {
+ generic_alloc(mem);
+ generic_copy_to(mem);
+ }
+
+ const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
+}
+
+void CUDADevice::global_free(device_memory &mem)
+{
+ if (mem.is_resident(this) && mem.device_pointer) {
+ generic_free(mem);
+ }
+}
+
+void CUDADevice::tex_alloc(device_texture &mem)
+{
+ CUDAContextScope scope(this);
+
+ /* General variables for both architectures */
+ string bind_name = mem.name;
+ size_t dsize = datatype_size(mem.data_type);
+ size_t size = mem.memory_size();
+
+ CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
+ switch (mem.info.extension) {
+ case EXTENSION_REPEAT:
+ address_mode = CU_TR_ADDRESS_MODE_WRAP;
+ break;
+ case EXTENSION_EXTEND:
+ address_mode = CU_TR_ADDRESS_MODE_CLAMP;
+ break;
+ case EXTENSION_CLIP:
+ address_mode = CU_TR_ADDRESS_MODE_BORDER;
+ break;
+ default:
+ assert(0);
+ break;
+ }
+
+ CUfilter_mode filter_mode;
+ if (mem.info.interpolation == INTERPOLATION_CLOSEST) {
+ filter_mode = CU_TR_FILTER_MODE_POINT;
+ }
+ else {
+ filter_mode = CU_TR_FILTER_MODE_LINEAR;
+ }
+
+ /* Image Texture Storage */
+ CUarray_format_enum format;
+ switch (mem.data_type) {
+ case TYPE_UCHAR:
+ format = CU_AD_FORMAT_UNSIGNED_INT8;
+ break;
+ case TYPE_UINT16:
+ format = CU_AD_FORMAT_UNSIGNED_INT16;
+ break;
+ case TYPE_UINT:
+ format = CU_AD_FORMAT_UNSIGNED_INT32;
+ break;
+ case TYPE_INT:
+ format = CU_AD_FORMAT_SIGNED_INT32;
+ break;
+ case TYPE_FLOAT:
+ format = CU_AD_FORMAT_FLOAT;
+ break;
+ case TYPE_HALF:
+ format = CU_AD_FORMAT_HALF;
+ break;
+ default:
+ assert(0);
+ return;
+ }
+
+ CUDAMem *cmem = NULL;
+ CUarray array_3d = NULL;
+ size_t src_pitch = mem.data_width * dsize * mem.data_elements;
+ size_t dst_pitch = src_pitch;
+
+ if (!mem.is_resident(this)) {
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ cmem = &cuda_mem_map[&mem];
+ cmem->texobject = 0;
+
+ if (mem.data_depth > 1) {
+ array_3d = (CUarray)mem.device_pointer;
+ cmem->array = array_3d;
+ }
+ else if (mem.data_height > 0) {
+ dst_pitch = align_up(src_pitch, pitch_alignment);
+ }
+ }
+ else if (mem.data_depth > 1) {
+ /* 3D texture using array, there is no API for linear memory. */
+ CUDA_ARRAY3D_DESCRIPTOR desc;
+
+ desc.Width = mem.data_width;
+ desc.Height = mem.data_height;
+ desc.Depth = mem.data_depth;
+ desc.Format = format;
+ desc.NumChannels = mem.data_elements;
+ desc.Flags = 0;
+
+ VLOG(1) << "Array 3D allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+
+ cuda_assert(cuArray3DCreate(&array_3d, &desc));
+
+ if (!array_3d) {
+ return;
+ }
+
+ CUDA_MEMCPY3D param;
+ memset(&param, 0, sizeof(param));
+ param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+ param.dstArray = array_3d;
+ param.srcMemoryType = CU_MEMORYTYPE_HOST;
+ param.srcHost = mem.host_pointer;
+ param.srcPitch = src_pitch;
+ param.WidthInBytes = param.srcPitch;
+ param.Height = mem.data_height;
+ param.Depth = mem.data_depth;
+
+ cuda_assert(cuMemcpy3D(&param));
+
+ mem.device_pointer = (device_ptr)array_3d;
+ mem.device_size = size;
+ stats.mem_alloc(size);
+
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ cmem = &cuda_mem_map[&mem];
+ cmem->texobject = 0;
+ cmem->array = array_3d;
+ }
+ else if (mem.data_height > 0) {
+ /* 2D texture, using pitch aligned linear memory. */
+ dst_pitch = align_up(src_pitch, pitch_alignment);
+ size_t dst_size = dst_pitch * mem.data_height;
+
+ cmem = generic_alloc(mem, dst_size - mem.memory_size());
+ if (!cmem) {
+ return;
+ }
+
+ CUDA_MEMCPY2D param;
+ memset(&param, 0, sizeof(param));
+ param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+ param.dstDevice = mem.device_pointer;
+ param.dstPitch = dst_pitch;
+ param.srcMemoryType = CU_MEMORYTYPE_HOST;
+ param.srcHost = mem.host_pointer;
+ param.srcPitch = src_pitch;
+ param.WidthInBytes = param.srcPitch;
+ param.Height = mem.data_height;
+
+ cuda_assert(cuMemcpy2DUnaligned(&param));
+ }
+ else {
+ /* 1D texture, using linear memory. */
+ cmem = generic_alloc(mem);
+ if (!cmem) {
+ return;
+ }
+
+ cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
+ }
+
+ /* Resize once */
+ const uint slot = mem.slot;
+ if (slot >= texture_info.size()) {
+ /* Allocate some slots in advance, to reduce the number
+ * of re-allocations. */
+ texture_info.resize(slot + 128);
+ }
+
+ /* Set mapping and tag that we need to (re-)upload to the device */
+ texture_info[slot] = mem.info;
+ need_texture_info = true;
+
+ if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
+ mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
+ /* Kepler+, bindless textures. */
+ CUDA_RESOURCE_DESC resDesc;
+ memset(&resDesc, 0, sizeof(resDesc));
+
+ if (array_3d) {
+ resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
+ resDesc.res.array.hArray = array_3d;
+ resDesc.flags = 0;
+ }
+ else if (mem.data_height > 0) {
+ resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
+ resDesc.res.pitch2D.devPtr = mem.device_pointer;
+ resDesc.res.pitch2D.format = format;
+ resDesc.res.pitch2D.numChannels = mem.data_elements;
+ resDesc.res.pitch2D.height = mem.data_height;
+ resDesc.res.pitch2D.width = mem.data_width;
+ resDesc.res.pitch2D.pitchInBytes = dst_pitch;
+ }
+ else {
+ resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
+ resDesc.res.linear.devPtr = mem.device_pointer;
+ resDesc.res.linear.format = format;
+ resDesc.res.linear.numChannels = mem.data_elements;
+ resDesc.res.linear.sizeInBytes = mem.device_size;
+ }
+
+ CUDA_TEXTURE_DESC texDesc;
+ memset(&texDesc, 0, sizeof(texDesc));
+ texDesc.addressMode[0] = address_mode;
+ texDesc.addressMode[1] = address_mode;
+ texDesc.addressMode[2] = address_mode;
+ texDesc.filterMode = filter_mode;
+ texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
+
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ cmem = &cuda_mem_map[&mem];
+
+ cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
+
+ texture_info[slot].data = (uint64_t)cmem->texobject;
+ }
+ else {
+ texture_info[slot].data = (uint64_t)mem.device_pointer;
+ }
+}
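/* Editorial sketch, not part of this patch: the 2D texture path in tex_alloc() above
 * pads every row to the device's texture pitch alignment. For example, a 250-pixel-wide
 * uchar4 image has src_pitch = 250 * 4 = 1000 bytes; with pitch_alignment = 32 the
 * padded pitch is 1024 bytes, wasting 24 bytes per row but keeping rows aligned.
 * padded_pitch() below is a hypothetical restatement of that rounding. */
static size_t padded_pitch(size_t width, size_t dsize, size_t elements, size_t pitch_alignment)
{
  const size_t src_pitch = width * dsize * elements;
  /* Round up to the next multiple of pitch_alignment. */
  return ((src_pitch + pitch_alignment - 1) / pitch_alignment) * pitch_alignment;
}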
+
+void CUDADevice::tex_free(device_texture &mem)
+{
+ if (mem.device_pointer) {
+ CUDAContextScope scope(this);
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ const CUDAMem &cmem = cuda_mem_map[&mem];
+
+ if (cmem.texobject) {
+ /* Free bindless texture. */
+ cuTexObjectDestroy(cmem.texobject);
+ }
+
+ if (!mem.is_resident(this)) {
+ /* Do not free memory here, since it was allocated on a different device. */
+ cuda_mem_map.erase(cuda_mem_map.find(&mem));
+ }
+ else if (cmem.array) {
+ /* Free array. */
+ cuArrayDestroy(cmem.array);
+ stats.mem_free(mem.device_size);
+ mem.device_pointer = 0;
+ mem.device_size = 0;
+
+ cuda_mem_map.erase(cuda_mem_map.find(&mem));
+ }
+ else {
+ lock.unlock();
+ generic_free(mem);
+ }
+ }
+}
+
+# if 0
+void CUDADevice::render(DeviceTask &task,
+ RenderTile &rtile,
+ device_vector<KernelWorkTile> &work_tiles)
+{
+ scoped_timer timer(&rtile.buffers->render_time);
+
+ if (have_error())
+ return;
+
+ CUDAContextScope scope(this);
+ CUfunction cuRender;
+
+ /* Get kernel function. */
+ if (rtile.task == RenderTile::BAKE) {
+ cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake"));
+ }
+ else {
+ cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace"));
+ }
+
+ if (have_error()) {
+ return;
+ }
+
+ cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1));
+
+ /* Allocate work tile. */
+ work_tiles.alloc(1);
+
+ KernelWorkTile *wtile = work_tiles.data();
+ wtile->x = rtile.x;
+ wtile->y = rtile.y;
+ wtile->w = rtile.w;
+ wtile->h = rtile.h;
+ wtile->offset = rtile.offset;
+ wtile->stride = rtile.stride;
+ wtile->buffer = (float *)(CUdeviceptr)rtile.buffer;
+
+ /* Prepare work size. More step samples render faster, but for now we
+ * remain conservative for GPUs connected to a display to avoid driver
+ * timeouts and display freezing. */
+ int min_blocks, num_threads_per_block;
+ cuda_assert(
+ cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0));
+ if (!info.display_device) {
+ min_blocks *= 8;
+ }
+
+ uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
+
+ /* Render all samples. */
+ uint start_sample = rtile.start_sample;
+ uint end_sample = rtile.start_sample + rtile.num_samples;
+
+ for (int sample = start_sample; sample < end_sample;) {
+ /* Setup and copy work tile to device. */
+ wtile->start_sample = sample;
+ wtile->num_samples = step_samples;
+ if (task.adaptive_sampling.use) {
+ wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples);
+ }
+ wtile->num_samples = min(wtile->num_samples, end_sample - sample);
+ work_tiles.copy_to_device();
+
+ CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
+ uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
+ uint num_blocks = divide_up(total_work_size, num_threads_per_block);
+
+ /* Launch kernel. */
+ void *args[] = {&d_work_tiles, &total_work_size};
+
+ cuda_assert(
+ cuLaunchKernel(cuRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
+
+ /* Run the adaptive sampling kernels at selected samples aligned to step samples. */
+ uint filter_sample = sample + wtile->num_samples - 1;
+ if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
+ adaptive_sampling_filter(filter_sample, wtile, d_work_tiles);
+ }
+
+ cuda_assert(cuCtxSynchronize());
+
+ /* Update progress. */
+ sample += wtile->num_samples;
+ rtile.sample = sample;
+ task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
+
+ if (task.get_cancel()) {
+ if (task.need_finish_queue == false)
+ break;
+ }
+ }
+
+ /* Finalize adaptive sampling. */
+ if (task.adaptive_sampling.use) {
+ CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
+ adaptive_sampling_post(rtile, wtile, d_work_tiles);
+ cuda_assert(cuCtxSynchronize());
+ task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
+ }
+}
+
+void CUDADevice::thread_run(DeviceTask &task)
+{
+ CUDAContextScope scope(this);
+
+ if (task.type == DeviceTask::RENDER) {
+ device_vector<KernelWorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
+
+ /* keep rendering tiles until done */
+ RenderTile tile;
+ DenoisingTask denoising(this, task);
+
+ while (task.acquire_tile(this, tile, task.tile_types)) {
+ if (tile.task == RenderTile::PATH_TRACE) {
+ render(task, tile, work_tiles);
+ }
+ else if (tile.task == RenderTile::BAKE) {
+ render(task, tile, work_tiles);
+ }
+
+ task.release_tile(tile);
+
+ if (task.get_cancel()) {
+ if (task.need_finish_queue == false)
+ break;
+ }
+ }
+
+ work_tiles.free();
+ }
+}
+# endif
+
+unique_ptr<DeviceQueue> CUDADevice::gpu_queue_create()
+{
+ return make_unique<CUDADeviceQueue>(this);
+}
+
+bool CUDADevice::should_use_graphics_interop()
+{
+ /* Check whether this device is part of the OpenGL context.
+ *
+ * Using a CUDA device for graphics interoperability which is not part of the OpenGL context is
+ * possible, but from empirical measurements it can be considerably slower than using a naive
+ * pixel copy. */
+
+ CUDAContextScope scope(this);
+
+ int num_all_devices = 0;
+ cuda_assert(cuDeviceGetCount(&num_all_devices));
+
+ if (num_all_devices == 0) {
+ return false;
+ }
+
+ vector<CUdevice> gl_devices(num_all_devices);
+ uint num_gl_devices;
+ cuGLGetDevices(&num_gl_devices, gl_devices.data(), num_all_devices, CU_GL_DEVICE_LIST_ALL);
+
+ for (CUdevice gl_device : gl_devices) {
+ if (gl_device == cuDevice) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+int CUDADevice::get_num_multiprocessors()
+{
+ return get_device_default_attribute(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 0);
+}
+
+int CUDADevice::get_max_num_threads_per_multiprocessor()
+{
+ return get_device_default_attribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, 0);
+}
+
+bool CUDADevice::get_device_attribute(CUdevice_attribute attribute, int *value)
+{
+ CUDAContextScope scope(this);
+
+ return cuDeviceGetAttribute(value, attribute, cuDevice) == CUDA_SUCCESS;
+}
+
+int CUDADevice::get_device_default_attribute(CUdevice_attribute attribute, int default_value)
+{
+ int value = 0;
+ if (!get_device_attribute(attribute, &value)) {
+ return default_value;
+ }
+ return value;
+}
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/cuda/device_impl.h b/intern/cycles/device/cuda/device_impl.h
new file mode 100644
index 00000000000..6b27db54ab4
--- /dev/null
+++ b/intern/cycles/device/cuda/device_impl.h
@@ -0,0 +1,155 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+# include "device/cuda/kernel.h"
+# include "device/cuda/queue.h"
+# include "device/cuda/util.h"
+# include "device/device.h"
+
+# include "util/util_map.h"
+
+# ifdef WITH_CUDA_DYNLOAD
+# include "cuew.h"
+# else
+# include "util/util_opengl.h"
+# include <cuda.h>
+# include <cudaGL.h>
+# endif
+
+CCL_NAMESPACE_BEGIN
+
+class DeviceQueue;
+
+class CUDADevice : public Device {
+
+ friend class CUDAContextScope;
+
+ public:
+ CUdevice cuDevice;
+ CUcontext cuContext;
+ CUmodule cuModule;
+ size_t device_texture_headroom;
+ size_t device_working_headroom;
+ bool move_texture_to_host;
+ size_t map_host_used;
+ size_t map_host_limit;
+ int can_map_host;
+ int pitch_alignment;
+ int cuDevId;
+ int cuDevArchitecture;
+ bool first_error;
+
+ struct CUDAMem {
+ CUDAMem() : texobject(0), array(0), use_mapped_host(false)
+ {
+ }
+
+ CUtexObject texobject;
+ CUarray array;
+
+ /* If true, mapped host memory in shared_pointer is being used. */
+ bool use_mapped_host;
+ };
+ typedef map<device_memory *, CUDAMem> CUDAMemMap;
+ CUDAMemMap cuda_mem_map;
+ thread_mutex cuda_mem_map_mutex;
+
+ /* Bindless Textures */
+ device_vector<TextureInfo> texture_info;
+ bool need_texture_info;
+
+ CUDADeviceKernels kernels;
+
+ static bool have_precompiled_kernels();
+
+ virtual bool show_samples() const override;
+
+ virtual BVHLayoutMask get_bvh_layout_mask() const override;
+
+ void set_error(const string &error) override;
+
+ CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+ virtual ~CUDADevice();
+
+ bool support_device(const uint /*kernel_features*/);
+
+ bool check_peer_access(Device *peer_device) override;
+
+ bool use_adaptive_compilation();
+
+ virtual string compile_kernel_get_common_cflags(const uint kernel_features);
+
+ string compile_kernel(const uint kernel_features,
+ const char *name,
+ const char *base = "cuda",
+ bool force_ptx = false);
+
+ virtual bool load_kernels(const uint kernel_features) override;
+
+ void reserve_local_memory(const uint kernel_features);
+
+ void init_host_memory();
+
+ void load_texture_info();
+
+ void move_textures_to_host(size_t size, bool for_texture);
+
+ CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
+
+ void generic_copy_to(device_memory &mem);
+
+ void generic_free(device_memory &mem);
+
+ void mem_alloc(device_memory &mem) override;
+
+ void mem_copy_to(device_memory &mem) override;
+
+ void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override;
+
+ void mem_zero(device_memory &mem) override;
+
+ void mem_free(device_memory &mem) override;
+
+ device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override;
+
+ virtual void const_copy_to(const char *name, void *host, size_t size) override;
+
+ void global_alloc(device_memory &mem);
+
+ void global_free(device_memory &mem);
+
+ void tex_alloc(device_texture &mem);
+
+ void tex_free(device_texture &mem);
+
+ virtual bool should_use_graphics_interop() override;
+
+ virtual unique_ptr<DeviceQueue> gpu_queue_create() override;
+
+ int get_num_multiprocessors();
+ int get_max_num_threads_per_multiprocessor();
+
+ protected:
+ bool get_device_attribute(CUdevice_attribute attribute, int *value);
+ int get_device_default_attribute(CUdevice_attribute attribute, int default_value);
+};
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/cuda/graphics_interop.cpp b/intern/cycles/device/cuda/graphics_interop.cpp
new file mode 100644
index 00000000000..e8ca8b90eae
--- /dev/null
+++ b/intern/cycles/device/cuda/graphics_interop.cpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+# include "device/cuda/graphics_interop.h"
+
+# include "device/cuda/device_impl.h"
+# include "device/cuda/util.h"
+
+CCL_NAMESPACE_BEGIN
+
+CUDADeviceGraphicsInterop::CUDADeviceGraphicsInterop(CUDADeviceQueue *queue)
+ : queue_(queue), device_(static_cast<CUDADevice *>(queue->device))
+{
+}
+
+CUDADeviceGraphicsInterop::~CUDADeviceGraphicsInterop()
+{
+ CUDAContextScope scope(device_);
+
+ if (cu_graphics_resource_) {
+ cuda_device_assert(device_, cuGraphicsUnregisterResource(cu_graphics_resource_));
+ }
+}
+
+void CUDADeviceGraphicsInterop::set_destination(
+ const DeviceGraphicsInteropDestination &destination)
+{
+ const int64_t new_buffer_area = int64_t(destination.buffer_width) * destination.buffer_height;
+
+ need_clear_ = destination.need_clear;
+
+ if (opengl_pbo_id_ == destination.opengl_pbo_id && buffer_area_ == new_buffer_area) {
+ return;
+ }
+
+ CUDAContextScope scope(device_);
+
+ if (cu_graphics_resource_) {
+ cuda_device_assert(device_, cuGraphicsUnregisterResource(cu_graphics_resource_));
+ }
+
+ const CUresult result = cuGraphicsGLRegisterBuffer(
+ &cu_graphics_resource_, destination.opengl_pbo_id, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
+ if (result != CUDA_SUCCESS) {
+ LOG(ERROR) << "Error registering OpenGL buffer: " << cuewErrorString(result);
+ }
+
+ opengl_pbo_id_ = destination.opengl_pbo_id;
+ buffer_area_ = new_buffer_area;
+}
+
+device_ptr CUDADeviceGraphicsInterop::map()
+{
+ if (!cu_graphics_resource_) {
+ return 0;
+ }
+
+ CUDAContextScope scope(device_);
+
+ CUdeviceptr cu_buffer;
+ size_t bytes;
+
+ cuda_device_assert(device_, cuGraphicsMapResources(1, &cu_graphics_resource_, queue_->stream()));
+ cuda_device_assert(
+ device_, cuGraphicsResourceGetMappedPointer(&cu_buffer, &bytes, cu_graphics_resource_));
+
+ if (need_clear_) {
+ cuda_device_assert(
+ device_, cuMemsetD8Async(static_cast<CUdeviceptr>(cu_buffer), 0, bytes, queue_->stream()));
+
+ need_clear_ = false;
+ }
+
+ return static_cast<device_ptr>(cu_buffer);
+}
+
+void CUDADeviceGraphicsInterop::unmap()
+{
+ CUDAContextScope scope(device_);
+
+ cuda_device_assert(device_,
+ cuGraphicsUnmapResources(1, &cu_graphics_resource_, queue_->stream()));
+}
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/cuda/graphics_interop.h b/intern/cycles/device/cuda/graphics_interop.h
new file mode 100644
index 00000000000..8a70c8aa71d
--- /dev/null
+++ b/intern/cycles/device/cuda/graphics_interop.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+# include "device/device_graphics_interop.h"
+
+# ifdef WITH_CUDA_DYNLOAD
+# include "cuew.h"
+# else
+# include <cuda.h>
+# endif
+
+CCL_NAMESPACE_BEGIN
+
+class CUDADevice;
+class CUDADeviceQueue;
+
+class CUDADeviceGraphicsInterop : public DeviceGraphicsInterop {
+ public:
+ explicit CUDADeviceGraphicsInterop(CUDADeviceQueue *queue);
+
+ CUDADeviceGraphicsInterop(const CUDADeviceGraphicsInterop &other) = delete;
+ CUDADeviceGraphicsInterop(CUDADeviceGraphicsInterop &&other) noexcept = delete;
+
+ ~CUDADeviceGraphicsInterop();
+
+ CUDADeviceGraphicsInterop &operator=(const CUDADeviceGraphicsInterop &other) = delete;
+ CUDADeviceGraphicsInterop &operator=(CUDADeviceGraphicsInterop &&other) = delete;
+
+ virtual void set_destination(const DeviceGraphicsInteropDestination &destination) override;
+
+ virtual device_ptr map() override;
+ virtual void unmap() override;
+
+ protected:
+ CUDADeviceQueue *queue_ = nullptr;
+ CUDADevice *device_ = nullptr;
+
+ /* OpenGL PBO which is currently registered as the destination for the CUDA buffer. */
+ uint opengl_pbo_id_ = 0;
+ /* Buffer area in pixels of the corresponding PBO. */
+ int64_t buffer_area_ = 0;
+
+ /* The destination was requested to be cleared. */
+ bool need_clear_ = false;
+
+ CUgraphicsResource cu_graphics_resource_ = nullptr;
+};
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/cuda/kernel.cpp b/intern/cycles/device/cuda/kernel.cpp
new file mode 100644
index 00000000000..a4a7bfabce0
--- /dev/null
+++ b/intern/cycles/device/cuda/kernel.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+# include "device/cuda/kernel.h"
+# include "device/cuda/device_impl.h"
+
+CCL_NAMESPACE_BEGIN
+
+void CUDADeviceKernels::load(CUDADevice *device)
+{
+ CUmodule cuModule = device->cuModule;
+
+ for (int i = 0; i < (int)DEVICE_KERNEL_NUM; i++) {
+ CUDADeviceKernel &kernel = kernels_[i];
+
+ /* No mega-kernel used for GPU. */
+ if (i == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
+ continue;
+ }
+
+ const std::string function_name = std::string("kernel_gpu_") +
+ device_kernel_as_string((DeviceKernel)i);
+ cuda_device_assert(device,
+ cuModuleGetFunction(&kernel.function, cuModule, function_name.c_str()));
+
+ if (kernel.function) {
+ cuda_device_assert(device, cuFuncSetCacheConfig(kernel.function, CU_FUNC_CACHE_PREFER_L1));
+
+ cuda_device_assert(
+ device,
+ cuOccupancyMaxPotentialBlockSize(
+ &kernel.min_blocks, &kernel.num_threads_per_block, kernel.function, NULL, 0, 0));
+ }
+ else {
+ LOG(ERROR) << "Unable to load kernel " << function_name;
+ }
+ }
+
+ loaded = true;
+}
+
+const CUDADeviceKernel &CUDADeviceKernels::get(DeviceKernel kernel) const
+{
+ return kernels_[(int)kernel];
+}
+
+bool CUDADeviceKernels::available(DeviceKernel kernel) const
+{
+ return kernels_[(int)kernel].function != nullptr;
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_CUDA */
diff --git a/intern/cycles/device/cuda/kernel.h b/intern/cycles/device/cuda/kernel.h
new file mode 100644
index 00000000000..b489547a350
--- /dev/null
+++ b/intern/cycles/device/cuda/kernel.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_CUDA
+
+# include "device/device_kernel.h"
+
+# ifdef WITH_CUDA_DYNLOAD
+# include "cuew.h"
+# else
+# include <cuda.h>
+# endif
+
+CCL_NAMESPACE_BEGIN
+
+class CUDADevice;
+
+/* CUDA kernel and associated occupancy information. */
+class CUDADeviceKernel {
+ public:
+ CUfunction function = nullptr;
+
+ int num_threads_per_block = 0;
+ int min_blocks = 0;
+};
+
+/* Cache of CUDA kernels for each DeviceKernel. */
+class CUDADeviceKernels {
+ public:
+ void load(CUDADevice *device);
+ const CUDADeviceKernel &get(DeviceKernel kernel) const;
+ bool available(DeviceKernel kernel) const;
+
+ protected:
+ CUDADeviceKernel kernels_[DEVICE_KERNEL_NUM];
+ bool loaded = false;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_CUDA */
diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp
new file mode 100644
index 00000000000..b7f86c10553
--- /dev/null
+++ b/intern/cycles/device/cuda/queue.cpp
@@ -0,0 +1,220 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+# include "device/cuda/queue.h"
+
+# include "device/cuda/device_impl.h"
+# include "device/cuda/graphics_interop.h"
+# include "device/cuda/kernel.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* CUDADeviceQueue */
+
+CUDADeviceQueue::CUDADeviceQueue(CUDADevice *device)
+ : DeviceQueue(device), cuda_device_(device), cuda_stream_(nullptr)
+{
+ const CUDAContextScope scope(cuda_device_);
+ cuda_device_assert(cuda_device_, cuStreamCreate(&cuda_stream_, CU_STREAM_NON_BLOCKING));
+}
+
+CUDADeviceQueue::~CUDADeviceQueue()
+{
+ const CUDAContextScope scope(cuda_device_);
+ cuStreamDestroy(cuda_stream_);
+}
+
+int CUDADeviceQueue::num_concurrent_states(const size_t state_size) const
+{
+ int num_states = max(cuda_device_->get_num_multiprocessors() *
+ cuda_device_->get_max_num_threads_per_multiprocessor() * 16,
+ 1048576);
+
+ const char *factor_str = getenv("CYCLES_CONCURRENT_STATES_FACTOR");
+ if (factor_str) {
+ num_states = max((int)(num_states * atof(factor_str)), 1024);
+ }
+
+ VLOG(3) << "GPU queue concurrent states: " << num_states << ", using up to "
+ << string_human_readable_size(num_states * state_size);
+
+ return num_states;
+}
+
+int CUDADeviceQueue::num_concurrent_busy_states() const
+{
+ const int max_num_threads = cuda_device_->get_num_multiprocessors() *
+ cuda_device_->get_max_num_threads_per_multiprocessor();
+
+ if (max_num_threads == 0) {
+ return 65536;
+ }
+
+ return 4 * max_num_threads;
+}
+
+void CUDADeviceQueue::init_execution()
+{
+ /* Synchronize all textures and memory copies before executing the task. */
+ CUDAContextScope scope(cuda_device_);
+ cuda_device_->load_texture_info();
+ cuda_device_assert(cuda_device_, cuCtxSynchronize());
+
+ debug_init_execution();
+}
+
+bool CUDADeviceQueue::kernel_available(DeviceKernel kernel) const
+{
+ return cuda_device_->kernels.available(kernel);
+}
+
+bool CUDADeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[])
+{
+ if (cuda_device_->have_error()) {
+ return false;
+ }
+
+ debug_enqueue(kernel, work_size);
+
+ const CUDAContextScope scope(cuda_device_);
+ const CUDADeviceKernel &cuda_kernel = cuda_device_->kernels.get(kernel);
+
+ /* Compute kernel launch parameters. */
+ const int num_threads_per_block = cuda_kernel.num_threads_per_block;
+ const int num_blocks = divide_up(work_size, num_threads_per_block);
+
+ int shared_mem_bytes = 0;
+
+ switch (kernel) {
+ case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
+ /* See parallel_active_index.h for why this amount of shared memory is needed. */
+ shared_mem_bytes = (num_threads_per_block + 1) * sizeof(int);
+ break;
+
+ default:
+ break;
+ }
+
+ /* Launch kernel. */
+ cuda_device_assert(cuda_device_,
+ cuLaunchKernel(cuda_kernel.function,
+ num_blocks,
+ 1,
+ 1,
+ num_threads_per_block,
+ 1,
+ 1,
+ shared_mem_bytes,
+ cuda_stream_,
+ args,
+ 0));
+
+ return !(cuda_device_->have_error());
+}
+
+bool CUDADeviceQueue::synchronize()
+{
+ if (cuda_device_->have_error()) {
+ return false;
+ }
+
+ const CUDAContextScope scope(cuda_device_);
+ cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_));
+ debug_synchronize();
+
+ return !(cuda_device_->have_error());
+}
+
+void CUDADeviceQueue::zero_to_device(device_memory &mem)
+{
+ assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
+
+ if (mem.memory_size() == 0) {
+ return;
+ }
+
+ /* Allocate on demand. */
+ if (mem.device_pointer == 0) {
+ cuda_device_->mem_alloc(mem);
+ }
+
+ /* Zero memory on device. */
+ assert(mem.device_pointer != 0);
+
+ const CUDAContextScope scope(cuda_device_);
+ cuda_device_assert(
+ cuda_device_,
+ cuMemsetD8Async((CUdeviceptr)mem.device_pointer, 0, mem.memory_size(), cuda_stream_));
+}
+
+void CUDADeviceQueue::copy_to_device(device_memory &mem)
+{
+ assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
+
+ if (mem.memory_size() == 0) {
+ return;
+ }
+
+ /* Allocate on demand. */
+ if (mem.device_pointer == 0) {
+ cuda_device_->mem_alloc(mem);
+ }
+
+ assert(mem.device_pointer != 0);
+ assert(mem.host_pointer != nullptr);
+
+ /* Copy memory to device. */
+ const CUDAContextScope scope(cuda_device_);
+ cuda_device_assert(
+ cuda_device_,
+ cuMemcpyHtoDAsync(
+ (CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size(), cuda_stream_));
+}
+
+void CUDADeviceQueue::copy_from_device(device_memory &mem)
+{
+ assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
+
+ if (mem.memory_size() == 0) {
+ return;
+ }
+
+ assert(mem.device_pointer != 0);
+ assert(mem.host_pointer != nullptr);
+
+ /* Copy memory from device. */
+ const CUDAContextScope scope(cuda_device_);
+ cuda_device_assert(
+ cuda_device_,
+ cuMemcpyDtoHAsync(
+ mem.host_pointer, (CUdeviceptr)mem.device_pointer, mem.memory_size(), cuda_stream_));
+}
+
+unique_ptr<DeviceGraphicsInterop> CUDADeviceQueue::graphics_interop_create()
+{
+ return make_unique<CUDADeviceGraphicsInterop>(this);
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_CUDA */
diff --git a/intern/cycles/device/cuda/queue.h b/intern/cycles/device/cuda/queue.h
new file mode 100644
index 00000000000..62e3aa3d6c2
--- /dev/null
+++ b/intern/cycles/device/cuda/queue.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_CUDA
+
+# include "device/device_kernel.h"
+# include "device/device_memory.h"
+# include "device/device_queue.h"
+
+# include "device/cuda/util.h"
+
+CCL_NAMESPACE_BEGIN
+
+class CUDADevice;
+class device_memory;
+
+/* Base class for CUDA queues. */
+class CUDADeviceQueue : public DeviceQueue {
+ public:
+ CUDADeviceQueue(CUDADevice *device);
+ ~CUDADeviceQueue();
+
+ virtual int num_concurrent_states(const size_t state_size) const override;
+ virtual int num_concurrent_busy_states() const override;
+
+ virtual void init_execution() override;
+
+ virtual bool kernel_available(DeviceKernel kernel) const override;
+
+ virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) override;
+
+ virtual bool synchronize() override;
+
+ virtual void zero_to_device(device_memory &mem) override;
+ virtual void copy_to_device(device_memory &mem) override;
+ virtual void copy_from_device(device_memory &mem) override;
+
+ virtual CUstream stream()
+ {
+ return cuda_stream_;
+ }
+
+ virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create() override;
+
+ protected:
+ CUDADevice *cuda_device_;
+ CUstream cuda_stream_;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_CUDA */
diff --git a/intern/cycles/device/cuda/util.cpp b/intern/cycles/device/cuda/util.cpp
new file mode 100644
index 00000000000..8f657cc10fe
--- /dev/null
+++ b/intern/cycles/device/cuda/util.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+# include "device/cuda/util.h"
+# include "device/cuda/device_impl.h"
+
+CCL_NAMESPACE_BEGIN
+
+CUDAContextScope::CUDAContextScope(CUDADevice *device) : device(device)
+{
+ cuda_device_assert(device, cuCtxPushCurrent(device->cuContext));
+}
+
+CUDAContextScope::~CUDAContextScope()
+{
+ cuda_device_assert(device, cuCtxPopCurrent(NULL));
+}
+
+# ifndef WITH_CUDA_DYNLOAD
+const char *cuewErrorString(CUresult result)
+{
+ /* We can only give the error code here without major code duplication; that
+ * should be enough, since dynamic loading is only disabled by folks
+ * who know what they're doing anyway.
+ *
+ * NOTE: Avoid calling from several threads.
+ */
+ static string error;
+ error = string_printf("%d", result);
+ return error.c_str();
+}
+
+const char *cuewCompilerPath()
+{
+ return CYCLES_CUDA_NVCC_EXECUTABLE;
+}
+
+int cuewCompilerVersion()
+{
+ return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10);
+}
+# endif
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_CUDA */
diff --git a/intern/cycles/device/cuda/util.h b/intern/cycles/device/cuda/util.h
new file mode 100644
index 00000000000..a0898094c08
--- /dev/null
+++ b/intern/cycles/device/cuda/util.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_CUDA
+
+# ifdef WITH_CUDA_DYNLOAD
+# include "cuew.h"
+# else
+# include <cuda.h>
+# endif
+
+CCL_NAMESPACE_BEGIN
+
+class CUDADevice;
+
+/* Utility to push/pop CUDA context. */
+class CUDAContextScope {
+ public:
+ CUDAContextScope(CUDADevice *device);
+ ~CUDAContextScope();
+
+ private:
+ CUDADevice *device;
+};
+
+/* Utility for checking return values of CUDA function calls. */
+# define cuda_device_assert(cuda_device, stmt) \
+ { \
+ CUresult result = stmt; \
+ if (result != CUDA_SUCCESS) { \
+ const char *name = cuewErrorString(result); \
+ cuda_device->set_error( \
+ string_printf("%s in %s (%s:%d)", name, #stmt, __FILE__, __LINE__)); \
+ } \
+ } \
+ (void)0
+
+# define cuda_assert(stmt) cuda_device_assert(this, stmt)
+
+# ifndef WITH_CUDA_DYNLOAD
+/* Transparently implement some functions, so the majority of the file does not need
+ * to worry about the difference between dynamically loaded and linked CUDA at all. */
+const char *cuewErrorString(CUresult result);
+const char *cuewCompilerPath();
+int cuewCompilerVersion();
+# endif /* WITH_CUDA_DYNLOAD */
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_CUDA */
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index ed53fbb54ae..6ccedcf54ef 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -20,7 +20,13 @@
#include "bvh/bvh2.h"
#include "device/device.h"
-#include "device/device_intern.h"
+#include "device/device_queue.h"
+
+#include "device/cpu/device.h"
+#include "device/cuda/device.h"
+#include "device/dummy/device.h"
+#include "device/multi/device.h"
+#include "device/optix/device.h"
#include "util/util_foreach.h"
#include "util/util_half.h"
@@ -38,332 +44,15 @@ CCL_NAMESPACE_BEGIN
bool Device::need_types_update = true;
bool Device::need_devices_update = true;
thread_mutex Device::device_mutex;
-vector<DeviceInfo> Device::opencl_devices;
vector<DeviceInfo> Device::cuda_devices;
vector<DeviceInfo> Device::optix_devices;
vector<DeviceInfo> Device::cpu_devices;
-vector<DeviceInfo> Device::network_devices;
uint Device::devices_initialized_mask = 0;
-/* Device Requested Features */
-
-std::ostream &operator<<(std::ostream &os, const DeviceRequestedFeatures &requested_features)
-{
- os << "Experimental features: " << (requested_features.experimental ? "On" : "Off") << std::endl;
- os << "Max nodes group: " << requested_features.max_nodes_group << std::endl;
- /* TODO(sergey): Decode bitflag into list of names. */
- os << "Nodes features: " << requested_features.nodes_features << std::endl;
- os << "Use Hair: " << string_from_bool(requested_features.use_hair) << std::endl;
- os << "Use Object Motion: " << string_from_bool(requested_features.use_object_motion)
- << std::endl;
- os << "Use Camera Motion: " << string_from_bool(requested_features.use_camera_motion)
- << std::endl;
- os << "Use Baking: " << string_from_bool(requested_features.use_baking) << std::endl;
- os << "Use Subsurface: " << string_from_bool(requested_features.use_subsurface) << std::endl;
- os << "Use Volume: " << string_from_bool(requested_features.use_volume) << std::endl;
- os << "Use Branched Integrator: " << string_from_bool(requested_features.use_integrator_branched)
- << std::endl;
- os << "Use Patch Evaluation: " << string_from_bool(requested_features.use_patch_evaluation)
- << std::endl;
- os << "Use Transparent Shadows: " << string_from_bool(requested_features.use_transparent)
- << std::endl;
- os << "Use Principled BSDF: " << string_from_bool(requested_features.use_principled)
- << std::endl;
- os << "Use Denoising: " << string_from_bool(requested_features.use_denoising) << std::endl;
- os << "Use Displacement: " << string_from_bool(requested_features.use_true_displacement)
- << std::endl;
- os << "Use Background Light: " << string_from_bool(requested_features.use_background_light)
- << std::endl;
- return os;
-}
-
/* Device */
Device::~Device() noexcept(false)
{
- if (!background) {
- if (vertex_buffer != 0) {
- glDeleteBuffers(1, &vertex_buffer);
- }
- if (fallback_shader_program != 0) {
- glDeleteProgram(fallback_shader_program);
- }
- }
-}
-
-/* TODO move shaders to standalone .glsl file. */
-const char *FALLBACK_VERTEX_SHADER =
- "#version 330\n"
- "uniform vec2 fullscreen;\n"
- "in vec2 texCoord;\n"
- "in vec2 pos;\n"
- "out vec2 texCoord_interp;\n"
- "\n"
- "vec2 normalize_coordinates()\n"
- "{\n"
- " return (vec2(2.0) * (pos / fullscreen)) - vec2(1.0);\n"
- "}\n"
- "\n"
- "void main()\n"
- "{\n"
- " gl_Position = vec4(normalize_coordinates(), 0.0, 1.0);\n"
- " texCoord_interp = texCoord;\n"
- "}\n\0";
-
-const char *FALLBACK_FRAGMENT_SHADER =
- "#version 330\n"
- "uniform sampler2D image_texture;\n"
- "in vec2 texCoord_interp;\n"
- "out vec4 fragColor;\n"
- "\n"
- "void main()\n"
- "{\n"
- " fragColor = texture(image_texture, texCoord_interp);\n"
- "}\n\0";
-
-static void shader_print_errors(const char *task, const char *log, const char *code)
-{
- LOG(ERROR) << "Shader: " << task << " error:";
- LOG(ERROR) << "===== shader string ====";
-
- stringstream stream(code);
- string partial;
-
- int line = 1;
- while (getline(stream, partial, '\n')) {
- if (line < 10) {
- LOG(ERROR) << " " << line << " " << partial;
- }
- else {
- LOG(ERROR) << line << " " << partial;
- }
- line++;
- }
- LOG(ERROR) << log;
-}
-
-static int bind_fallback_shader(void)
-{
- GLint status;
- GLchar log[5000];
- GLsizei length = 0;
- GLuint program = 0;
-
- struct Shader {
- const char *source;
- GLenum type;
- } shaders[2] = {{FALLBACK_VERTEX_SHADER, GL_VERTEX_SHADER},
- {FALLBACK_FRAGMENT_SHADER, GL_FRAGMENT_SHADER}};
-
- program = glCreateProgram();
-
- for (int i = 0; i < 2; i++) {
- GLuint shader = glCreateShader(shaders[i].type);
-
- string source_str = shaders[i].source;
- const char *c_str = source_str.c_str();
-
- glShaderSource(shader, 1, &c_str, NULL);
- glCompileShader(shader);
-
- glGetShaderiv(shader, GL_COMPILE_STATUS, &status);
-
- if (!status) {
- glGetShaderInfoLog(shader, sizeof(log), &length, log);
- shader_print_errors("compile", log, c_str);
- return 0;
- }
-
- glAttachShader(program, shader);
- }
-
- /* Link output. */
- glBindFragDataLocation(program, 0, "fragColor");
-
- /* Link and error check. */
- glLinkProgram(program);
-
- glGetProgramiv(program, GL_LINK_STATUS, &status);
- if (!status) {
- glGetShaderInfoLog(program, sizeof(log), &length, log);
- shader_print_errors("linking", log, FALLBACK_VERTEX_SHADER);
- shader_print_errors("linking", log, FALLBACK_FRAGMENT_SHADER);
- return 0;
- }
-
- return program;
-}
-
-bool Device::bind_fallback_display_space_shader(const float width, const float height)
-{
- if (fallback_status == FALLBACK_SHADER_STATUS_ERROR) {
- return false;
- }
-
- if (fallback_status == FALLBACK_SHADER_STATUS_NONE) {
- fallback_shader_program = bind_fallback_shader();
- fallback_status = FALLBACK_SHADER_STATUS_ERROR;
-
- if (fallback_shader_program == 0) {
- return false;
- }
-
- glUseProgram(fallback_shader_program);
- image_texture_location = glGetUniformLocation(fallback_shader_program, "image_texture");
- if (image_texture_location < 0) {
- LOG(ERROR) << "Shader doesn't contain the 'image_texture' uniform.";
- return false;
- }
-
- fullscreen_location = glGetUniformLocation(fallback_shader_program, "fullscreen");
- if (fullscreen_location < 0) {
- LOG(ERROR) << "Shader doesn't contain the 'fullscreen' uniform.";
- return false;
- }
-
- fallback_status = FALLBACK_SHADER_STATUS_SUCCESS;
- }
-
- /* Run this every time. */
- glUseProgram(fallback_shader_program);
- glUniform1i(image_texture_location, 0);
- glUniform2f(fullscreen_location, width, height);
- return true;
-}
-
-void Device::draw_pixels(device_memory &rgba,
- int y,
- int w,
- int h,
- int width,
- int height,
- int dx,
- int dy,
- int dw,
- int dh,
- bool transparent,
- const DeviceDrawParams &draw_params)
-{
- const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL);
-
- assert(rgba.type == MEM_PIXELS);
- mem_copy_from(rgba, y, w, h, rgba.memory_elements_size(1));
-
- GLuint texid;
- glActiveTexture(GL_TEXTURE0);
- glGenTextures(1, &texid);
- glBindTexture(GL_TEXTURE_2D, texid);
-
- if (rgba.data_type == TYPE_HALF) {
- GLhalf *data_pointer = (GLhalf *)rgba.host_pointer;
- data_pointer += 4 * y * w;
- glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, w, h, 0, GL_RGBA, GL_HALF_FLOAT, data_pointer);
- }
- else {
- uint8_t *data_pointer = (uint8_t *)rgba.host_pointer;
- data_pointer += 4 * y * w;
- glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, w, h, 0, GL_RGBA, GL_UNSIGNED_BYTE, data_pointer);
- }
-
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
-
- if (transparent) {
- glEnable(GL_BLEND);
- glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
- }
-
- GLint shader_program;
- if (use_fallback_shader) {
- if (!bind_fallback_display_space_shader(dw, dh)) {
- return;
- }
- shader_program = fallback_shader_program;
- }
- else {
- draw_params.bind_display_space_shader_cb();
- glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program);
- }
-
- if (!vertex_buffer) {
- glGenBuffers(1, &vertex_buffer);
- }
-
- glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer);
- /* invalidate old contents - avoids stalling if buffer is still waiting in queue to be rendered
- */
- glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);
-
- float *vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
-
- if (vpointer) {
- /* texture coordinate - vertex pair */
- vpointer[0] = 0.0f;
- vpointer[1] = 0.0f;
- vpointer[2] = dx;
- vpointer[3] = dy;
-
- vpointer[4] = 1.0f;
- vpointer[5] = 0.0f;
- vpointer[6] = (float)width + dx;
- vpointer[7] = dy;
-
- vpointer[8] = 1.0f;
- vpointer[9] = 1.0f;
- vpointer[10] = (float)width + dx;
- vpointer[11] = (float)height + dy;
-
- vpointer[12] = 0.0f;
- vpointer[13] = 1.0f;
- vpointer[14] = dx;
- vpointer[15] = (float)height + dy;
-
- if (vertex_buffer) {
- glUnmapBuffer(GL_ARRAY_BUFFER);
- }
- }
-
- GLuint vertex_array_object;
- GLuint position_attribute, texcoord_attribute;
-
- glGenVertexArrays(1, &vertex_array_object);
- glBindVertexArray(vertex_array_object);
-
- texcoord_attribute = glGetAttribLocation(shader_program, "texCoord");
- position_attribute = glGetAttribLocation(shader_program, "pos");
-
- glEnableVertexAttribArray(texcoord_attribute);
- glEnableVertexAttribArray(position_attribute);
-
- glVertexAttribPointer(
- texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0);
- glVertexAttribPointer(position_attribute,
- 2,
- GL_FLOAT,
- GL_FALSE,
- 4 * sizeof(float),
- (const GLvoid *)(sizeof(float) * 2));
-
- glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
-
- if (vertex_buffer) {
- glBindBuffer(GL_ARRAY_BUFFER, 0);
- }
-
- if (use_fallback_shader) {
- glUseProgram(0);
- }
- else {
- draw_params.unbind_display_space_shader_cb();
- }
-
- glDeleteVertexArrays(1, &vertex_array_object);
- glBindTexture(GL_TEXTURE_2D, 0);
- glDeleteTextures(1, &texid);
-
- if (transparent) {
- glDisable(GL_BLEND);
- }
}
void Device::build_bvh(BVH *bvh, Progress &progress, bool refit)
@@ -379,14 +68,14 @@ void Device::build_bvh(BVH *bvh, Progress &progress, bool refit)
}
}
-Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
+Device *Device::create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
{
#ifdef WITH_MULTI
if (!info.multi_devices.empty()) {
/* Always create a multi device when info contains multiple devices.
* This is done so that the type can still be e.g. DEVICE_CPU to indicate
* that it is a homogeneous collection of devices, which simplifies checks. */
- return device_multi_create(info, stats, profiler, background);
+ return device_multi_create(info, stats, profiler);
}
#endif
@@ -394,29 +83,18 @@ Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool
switch (info.type) {
case DEVICE_CPU:
- device = device_cpu_create(info, stats, profiler, background);
+ device = device_cpu_create(info, stats, profiler);
break;
#ifdef WITH_CUDA
case DEVICE_CUDA:
if (device_cuda_init())
- device = device_cuda_create(info, stats, profiler, background);
+ device = device_cuda_create(info, stats, profiler);
break;
#endif
#ifdef WITH_OPTIX
case DEVICE_OPTIX:
if (device_optix_init())
- device = device_optix_create(info, stats, profiler, background);
- break;
-#endif
-#ifdef WITH_NETWORK
- case DEVICE_NETWORK:
- device = device_network_create(info, stats, profiler, "127.0.0.1");
- break;
-#endif
-#ifdef WITH_OPENCL
- case DEVICE_OPENCL:
- if (device_opencl_init())
- device = device_opencl_create(info, stats, profiler, background);
+ device = device_optix_create(info, stats, profiler);
break;
#endif
default:
@@ -424,7 +102,7 @@ Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool
}
if (device == NULL) {
- device = device_dummy_create(info, stats, profiler, background);
+ device = device_dummy_create(info, stats, profiler);
}
return device;
@@ -438,10 +116,6 @@ DeviceType Device::type_from_string(const char *name)
return DEVICE_CUDA;
else if (strcmp(name, "OPTIX") == 0)
return DEVICE_OPTIX;
- else if (strcmp(name, "OPENCL") == 0)
- return DEVICE_OPENCL;
- else if (strcmp(name, "NETWORK") == 0)
- return DEVICE_NETWORK;
else if (strcmp(name, "MULTI") == 0)
return DEVICE_MULTI;
@@ -456,10 +130,6 @@ string Device::string_from_type(DeviceType type)
return "CUDA";
else if (type == DEVICE_OPTIX)
return "OPTIX";
- else if (type == DEVICE_OPENCL)
- return "OPENCL";
- else if (type == DEVICE_NETWORK)
- return "NETWORK";
else if (type == DEVICE_MULTI)
return "MULTI";
@@ -476,12 +146,6 @@ vector<DeviceType> Device::available_types()
#ifdef WITH_OPTIX
types.push_back(DEVICE_OPTIX);
#endif
-#ifdef WITH_OPENCL
- types.push_back(DEVICE_OPENCL);
-#endif
-#ifdef WITH_NETWORK
- types.push_back(DEVICE_NETWORK);
-#endif
return types;
}
@@ -493,20 +157,6 @@ vector<DeviceInfo> Device::available_devices(uint mask)
thread_scoped_lock lock(device_mutex);
vector<DeviceInfo> devices;
-#ifdef WITH_OPENCL
- if (mask & DEVICE_MASK_OPENCL) {
- if (!(devices_initialized_mask & DEVICE_MASK_OPENCL)) {
- if (device_opencl_init()) {
- device_opencl_info(opencl_devices);
- }
- devices_initialized_mask |= DEVICE_MASK_OPENCL;
- }
- foreach (DeviceInfo &info, opencl_devices) {
- devices.push_back(info);
- }
- }
-#endif
-
#if defined(WITH_CUDA) || defined(WITH_OPTIX)
if (mask & (DEVICE_MASK_CUDA | DEVICE_MASK_OPTIX)) {
if (!(devices_initialized_mask & DEVICE_MASK_CUDA)) {
@@ -547,18 +197,6 @@ vector<DeviceInfo> Device::available_devices(uint mask)
}
}
-#ifdef WITH_NETWORK
- if (mask & DEVICE_MASK_NETWORK) {
- if (!(devices_initialized_mask & DEVICE_MASK_NETWORK)) {
- device_network_info(network_devices);
- devices_initialized_mask |= DEVICE_MASK_NETWORK;
- }
- foreach (DeviceInfo &info, network_devices) {
- devices.push_back(info);
- }
- }
-#endif
-
return devices;
}
@@ -580,15 +218,6 @@ string Device::device_capabilities(uint mask)
capabilities += device_cpu_capabilities() + "\n";
}
-#ifdef WITH_OPENCL
- if (mask & DEVICE_MASK_OPENCL) {
- if (device_opencl_init()) {
- capabilities += "\nOpenCL device capabilities:\n";
- capabilities += device_opencl_capabilities();
- }
- }
-#endif
-
#ifdef WITH_CUDA
if (mask & DEVICE_MASK_CUDA) {
if (device_cuda_init()) {
@@ -613,16 +242,13 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
}
DeviceInfo info;
- info.type = subdevices.front().type;
+ info.type = DEVICE_NONE;
info.id = "MULTI";
info.description = "Multi Device";
info.num = 0;
info.has_half_images = true;
info.has_nanovdb = true;
- info.has_volume_decoupled = true;
- info.has_branched_path = true;
- info.has_adaptive_stop_per_sample = true;
info.has_osl = true;
info.has_profiling = true;
info.has_peer_memory = false;
@@ -660,16 +286,16 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
info.id += device.id;
/* Set device type to MULTI if subdevices are not of a common type. */
- if (device.type != info.type) {
+ if (info.type == DEVICE_NONE) {
+ info.type = device.type;
+ }
+ else if (device.type != info.type) {
info.type = DEVICE_MULTI;
}
/* Accumulate device info. */
info.has_half_images &= device.has_half_images;
info.has_nanovdb &= device.has_nanovdb;
- info.has_volume_decoupled &= device.has_volume_decoupled;
- info.has_branched_path &= device.has_branched_path;
- info.has_adaptive_stop_per_sample &= device.has_adaptive_stop_per_sample;
info.has_osl &= device.has_osl;
info.has_profiling &= device.has_profiling;
info.has_peer_memory |= device.has_peer_memory;
@@ -689,60 +315,32 @@ void Device::free_memory()
devices_initialized_mask = 0;
cuda_devices.free_memory();
optix_devices.free_memory();
- opencl_devices.free_memory();
cpu_devices.free_memory();
- network_devices.free_memory();
}
-/* DeviceInfo */
-
-void DeviceInfo::add_denoising_devices(DenoiserType denoiser_type)
+unique_ptr<DeviceQueue> Device::gpu_queue_create()
{
- assert(denoising_devices.empty());
-
- if (denoiser_type == DENOISER_OPTIX && type != DEVICE_OPTIX) {
- vector<DeviceInfo> optix_devices = Device::available_devices(DEVICE_MASK_OPTIX);
- if (!optix_devices.empty()) {
- /* Convert to a special multi device with separate denoising devices. */
- if (multi_devices.empty()) {
- multi_devices.push_back(*this);
- }
-
- /* Try to use the same physical devices for denoising. */
- for (const DeviceInfo &cuda_device : multi_devices) {
- if (cuda_device.type == DEVICE_CUDA) {
- for (const DeviceInfo &optix_device : optix_devices) {
- if (cuda_device.num == optix_device.num) {
- id += optix_device.id;
- denoising_devices.push_back(optix_device);
- break;
- }
- }
- }
- }
-
- if (denoising_devices.empty()) {
- /* Simply use the first available OptiX device. */
- const DeviceInfo optix_device = optix_devices.front();
- id += optix_device.id; /* Uniquely identify this special multi device. */
- denoising_devices.push_back(optix_device);
- }
+ LOG(FATAL) << "Device does not support queues.";
+ return nullptr;
+}
- denoisers = denoiser_type;
- }
- }
- else if (denoiser_type == DENOISER_OPENIMAGEDENOISE && type != DEVICE_CPU) {
- /* Convert to a special multi device with separate denoising devices. */
- if (multi_devices.empty()) {
- multi_devices.push_back(*this);
- }
+const CPUKernels *Device::get_cpu_kernels() const
+{
+ LOG(FATAL) << "Device does not support CPU kernels.";
+ return nullptr;
+}
- /* Add CPU denoising devices. */
- DeviceInfo cpu_device = Device::available_devices(DEVICE_MASK_CPU).front();
- denoising_devices.push_back(cpu_device);
+void Device::get_cpu_kernel_thread_globals(
+ vector<CPUKernelThreadGlobals> & /*kernel_thread_globals*/)
+{
+ LOG(FATAL) << "Device does not support CPU kernels.";
+}
- denoisers = denoiser_type;
- }
+void *Device::get_cpu_osl_memory()
+{
+ return nullptr;
}
+/* DeviceInfo */
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index ecf79bcdfa6..399d5eb91df 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -21,31 +21,34 @@
#include "bvh/bvh_params.h"
+#include "device/device_denoise.h"
#include "device/device_memory.h"
-#include "device/device_task.h"
+#include "util/util_function.h"
#include "util/util_list.h"
+#include "util/util_logging.h"
#include "util/util_stats.h"
#include "util/util_string.h"
#include "util/util_texture.h"
#include "util/util_thread.h"
#include "util/util_types.h"
+#include "util/util_unique_ptr.h"
#include "util/util_vector.h"
CCL_NAMESPACE_BEGIN
class BVH;
+class DeviceQueue;
class Progress;
-class RenderTile;
+class CPUKernels;
+class CPUKernelThreadGlobals;
/* Device Types */
enum DeviceType {
DEVICE_NONE = 0,
DEVICE_CPU,
- DEVICE_OPENCL,
DEVICE_CUDA,
- DEVICE_NETWORK,
DEVICE_MULTI,
DEVICE_OPTIX,
DEVICE_DUMMY,
@@ -53,20 +56,11 @@ enum DeviceType {
enum DeviceTypeMask {
DEVICE_MASK_CPU = (1 << DEVICE_CPU),
- DEVICE_MASK_OPENCL = (1 << DEVICE_OPENCL),
DEVICE_MASK_CUDA = (1 << DEVICE_CUDA),
DEVICE_MASK_OPTIX = (1 << DEVICE_OPTIX),
- DEVICE_MASK_NETWORK = (1 << DEVICE_NETWORK),
DEVICE_MASK_ALL = ~0
};
-enum DeviceKernelStatus {
- DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE,
- DEVICE_KERNEL_USING_FEATURE_KERNEL,
- DEVICE_KERNEL_FEATURE_KERNEL_INVALID,
- DEVICE_KERNEL_UNKNOWN,
-};
-
#define DEVICE_MASK(type) (DeviceTypeMask)(1 << type)
class DeviceInfo {
@@ -75,20 +69,16 @@ class DeviceInfo {
string description;
string id; /* used for user preferences, should stay fixed with changing hardware config */
int num;
- bool display_device; /* GPU is used as a display device. */
- bool has_half_images; /* Support half-float textures. */
- bool has_nanovdb; /* Support NanoVDB volumes. */
- bool has_volume_decoupled; /* Decoupled volume shading. */
- bool has_branched_path; /* Supports branched path tracing. */
- bool has_adaptive_stop_per_sample; /* Per-sample adaptive sampling stopping. */
- bool has_osl; /* Support Open Shading Language. */
- bool use_split_kernel; /* Use split or mega kernel. */
- bool has_profiling; /* Supports runtime collection of profiling info. */
- bool has_peer_memory; /* GPU has P2P access to memory of another GPU. */
- DenoiserTypeMask denoisers; /* Supported denoiser types. */
+ bool display_device; /* GPU is used as a display device. */
+ bool has_nanovdb; /* Support NanoVDB volumes. */
+ bool has_half_images; /* Support half-float textures. */
+ bool has_osl; /* Support Open Shading Language. */
+ bool has_profiling; /* Supports runtime collection of profiling info. */
+ bool has_peer_memory; /* GPU has P2P access to memory of another GPU. */
+ bool has_gpu_queue; /* Device supports GPU queue. */
+ DenoiserTypeMask denoisers; /* Supported denoiser types. */
int cpu_threads;
vector<DeviceInfo> multi_devices;
- vector<DeviceInfo> denoising_devices;
string error_msg;
DeviceInfo()
@@ -100,227 +90,35 @@ class DeviceInfo {
display_device = false;
has_half_images = false;
has_nanovdb = false;
- has_volume_decoupled = false;
- has_branched_path = true;
- has_adaptive_stop_per_sample = false;
has_osl = false;
- use_split_kernel = false;
has_profiling = false;
has_peer_memory = false;
+ has_gpu_queue = false;
denoisers = DENOISER_NONE;
}
- bool operator==(const DeviceInfo &info)
+ bool operator==(const DeviceInfo &info) const
{
/* Multiple Devices with the same ID would be very bad. */
assert(id != info.id ||
(type == info.type && num == info.num && description == info.description));
return id == info.id;
}
-
- /* Add additional devices needed for the specified denoiser. */
- void add_denoising_devices(DenoiserType denoiser_type);
-};
-
-class DeviceRequestedFeatures {
- public:
- /* Use experimental feature set. */
- bool experimental;
-
- /* Selective nodes compilation. */
-
- /* Identifier of a node group up to which all the nodes needs to be
- * compiled in. Nodes from higher group indices will be ignores.
- */
- int max_nodes_group;
-
- /* Features bitfield indicating which features from the requested group
- * will be compiled in. Nodes which corresponds to features which are not
- * in this bitfield will be ignored even if they're in the requested group.
- */
- int nodes_features;
-
- /* BVH/sampling kernel features. */
- bool use_hair;
- bool use_hair_thick;
- bool use_object_motion;
- bool use_camera_motion;
-
- /* Denotes whether baking functionality is needed. */
- bool use_baking;
-
- /* Use subsurface scattering materials. */
- bool use_subsurface;
-
- /* Use volume materials. */
- bool use_volume;
-
- /* Use branched integrator. */
- bool use_integrator_branched;
-
- /* Use OpenSubdiv patch evaluation */
- bool use_patch_evaluation;
-
- /* Use Transparent shadows */
- bool use_transparent;
-
- /* Use various shadow tricks, such as shadow catcher. */
- bool use_shadow_tricks;
-
- /* Per-uber shader usage flags. */
- bool use_principled;
-
- /* Denoising features. */
- bool use_denoising;
-
- /* Use raytracing in shaders. */
- bool use_shader_raytrace;
-
- /* Use true displacement */
- bool use_true_displacement;
-
- /* Use background lights */
- bool use_background_light;
-
- DeviceRequestedFeatures()
- {
- /* TODO(sergey): Find more meaningful defaults. */
- max_nodes_group = 0;
- nodes_features = 0;
- use_hair = false;
- use_hair_thick = false;
- use_object_motion = false;
- use_camera_motion = false;
- use_baking = false;
- use_subsurface = false;
- use_volume = false;
- use_integrator_branched = false;
- use_patch_evaluation = false;
- use_transparent = false;
- use_shadow_tricks = false;
- use_principled = false;
- use_denoising = false;
- use_shader_raytrace = false;
- use_true_displacement = false;
- use_background_light = false;
- }
-
- bool modified(const DeviceRequestedFeatures &requested_features)
- {
- return !(max_nodes_group == requested_features.max_nodes_group &&
- nodes_features == requested_features.nodes_features &&
- use_hair == requested_features.use_hair &&
- use_hair_thick == requested_features.use_hair_thick &&
- use_object_motion == requested_features.use_object_motion &&
- use_camera_motion == requested_features.use_camera_motion &&
- use_baking == requested_features.use_baking &&
- use_subsurface == requested_features.use_subsurface &&
- use_volume == requested_features.use_volume &&
- use_integrator_branched == requested_features.use_integrator_branched &&
- use_patch_evaluation == requested_features.use_patch_evaluation &&
- use_transparent == requested_features.use_transparent &&
- use_shadow_tricks == requested_features.use_shadow_tricks &&
- use_principled == requested_features.use_principled &&
- use_denoising == requested_features.use_denoising &&
- use_shader_raytrace == requested_features.use_shader_raytrace &&
- use_true_displacement == requested_features.use_true_displacement &&
- use_background_light == requested_features.use_background_light);
- }
-
- /* Convert the requested features structure to a build options,
- * which could then be passed to compilers.
- */
- string get_build_options() const
- {
- string build_options = "";
- if (experimental) {
- build_options += "-D__KERNEL_EXPERIMENTAL__ ";
- }
- build_options += "-D__NODES_MAX_GROUP__=" + string_printf("%d", max_nodes_group);
- build_options += " -D__NODES_FEATURES__=" + string_printf("%d", nodes_features);
- if (!use_hair) {
- build_options += " -D__NO_HAIR__";
- }
- if (!use_object_motion) {
- build_options += " -D__NO_OBJECT_MOTION__";
- }
- if (!use_camera_motion) {
- build_options += " -D__NO_CAMERA_MOTION__";
- }
- if (!use_baking) {
- build_options += " -D__NO_BAKING__";
- }
- if (!use_volume) {
- build_options += " -D__NO_VOLUME__";
- }
- if (!use_subsurface) {
- build_options += " -D__NO_SUBSURFACE__";
- }
- if (!use_integrator_branched) {
- build_options += " -D__NO_BRANCHED_PATH__";
- }
- if (!use_patch_evaluation) {
- build_options += " -D__NO_PATCH_EVAL__";
- }
- if (!use_transparent && !use_volume) {
- build_options += " -D__NO_TRANSPARENT__";
- }
- if (!use_shadow_tricks) {
- build_options += " -D__NO_SHADOW_TRICKS__";
- }
- if (!use_principled) {
- build_options += " -D__NO_PRINCIPLED__";
- }
- if (!use_denoising) {
- build_options += " -D__NO_DENOISING__";
- }
- if (!use_shader_raytrace) {
- build_options += " -D__NO_SHADER_RAYTRACE__";
- }
- return build_options;
- }
};
-std::ostream &operator<<(std::ostream &os, const DeviceRequestedFeatures &requested_features);
-
/* Device */
-struct DeviceDrawParams {
- function<void()> bind_display_space_shader_cb;
- function<void()> unbind_display_space_shader_cb;
-};
-
class Device {
friend class device_sub_ptr;
protected:
- enum {
- FALLBACK_SHADER_STATUS_NONE = 0,
- FALLBACK_SHADER_STATUS_ERROR,
- FALLBACK_SHADER_STATUS_SUCCESS,
- };
-
- Device(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background)
- : background(background),
- vertex_buffer(0),
- fallback_status(FALLBACK_SHADER_STATUS_NONE),
- fallback_shader_program(0),
- info(info_),
- stats(stats_),
- profiler(profiler_)
+ Device(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_)
+ : info(info_), stats(stats_), profiler(profiler_)
{
}
- bool background;
string error_msg;
- /* used for real time display */
- unsigned int vertex_buffer;
- int fallback_status, fallback_shader_program;
- int image_texture_location, fullscreen_location;
-
- bool bind_fallback_display_space_shader(const float width, const float height);
-
virtual device_ptr mem_alloc_sub_ptr(device_memory & /*mem*/, int /*offset*/, int /*size*/)
{
/* Only required for devices that implement denoising. */
@@ -361,67 +159,31 @@ class Device {
Stats &stats;
Profiler &profiler;
- /* memory alignment */
- virtual int mem_sub_ptr_alignment()
- {
- return MIN_ALIGNMENT_CPU_DATA_TYPES;
- }
-
/* constant memory */
virtual void const_copy_to(const char *name, void *host, size_t size) = 0;
- /* open shading language, only for CPU device */
- virtual void *osl_memory()
- {
- return NULL;
- }
-
/* load/compile kernels, must be called before adding tasks */
- virtual bool load_kernels(const DeviceRequestedFeatures & /*requested_features*/)
+ virtual bool load_kernels(uint /*kernel_features*/)
{
return true;
}
- /* Wait for device to become available to upload data and receive tasks
- * This method is used by the OpenCL device to load the
- * optimized kernels or when not (yet) available load the
- * generic kernels (only during foreground rendering) */
- virtual bool wait_for_availability(const DeviceRequestedFeatures & /*requested_features*/)
- {
- return true;
- }
- /* Check if there are 'better' kernels available to be used
- * We can switch over to these kernels
- * This method is used to determine if we can switch the preview kernels
- * to regular kernels */
- virtual DeviceKernelStatus get_active_kernel_switch_state()
- {
- return DEVICE_KERNEL_USING_FEATURE_KERNEL;
- }
+ /* GPU device only functions.
+ * These may not be used on CPU or multi-devices. */
- /* tasks */
- virtual int get_split_task_count(DeviceTask &)
- {
- return 1;
- }
+ /* Create new queue for executing kernels in. */
+ virtual unique_ptr<DeviceQueue> gpu_queue_create();
+
+ /* CPU device only functions.
+ * These may not be used on GPU or multi-devices. */
- virtual void task_add(DeviceTask &task) = 0;
- virtual void task_wait() = 0;
- virtual void task_cancel() = 0;
-
- /* opengl drawing */
- virtual void draw_pixels(device_memory &mem,
- int y,
- int w,
- int h,
- int width,
- int height,
- int dx,
- int dy,
- int dw,
- int dh,
- bool transparent,
- const DeviceDrawParams &draw_params);
+ /* Get CPU kernel functions for native instruction set. */
+ virtual const CPUKernels *get_cpu_kernels() const;
+ /* Get kernel globals to pass to kernels. */
+ virtual void get_cpu_kernel_thread_globals(
+ vector<CPUKernelThreadGlobals> & /*kernel_thread_globals*/);
+ /* Get OpenShadingLanguage memory buffer. */
+ virtual void *get_cpu_osl_memory();
/* acceleration structure building */
virtual void build_bvh(BVH *bvh, Progress &progress, bool refit);
@@ -429,25 +191,11 @@ class Device {
/* OptiX specific destructor. */
virtual void release_optix_bvh(BVH * /*bvh*/){};
-#ifdef WITH_NETWORK
- /* networking */
- void server_run();
-#endif
-
/* multi device */
- virtual void map_tile(Device * /*sub_device*/, RenderTile & /*tile*/)
- {
- }
virtual int device_number(Device * /*sub_device*/)
{
return 0;
}
- virtual void map_neighbor_tiles(Device * /*sub_device*/, RenderTileNeighbors & /*neighbors*/)
- {
- }
- virtual void unmap_neighbor_tiles(Device * /*sub_device*/, RenderTileNeighbors & /*neighbors*/)
- {
- }
virtual bool is_resident(device_ptr /*key*/, Device *sub_device)
{
@@ -460,11 +208,47 @@ class Device {
return false;
}
+ /* Graphics resources interoperability.
+ *
+   * Interoperability here means that the device is capable of computing the result directly
+   * into an OpenGL (or other graphics library) buffer. */
+
+  /* Check whether the display is to be updated using graphics interoperability.
+   * The interoperability cannot be used if it is not supported by the device. But the device
+ * might also force disable the interoperability if it detects that it will be slower than
+ * copying pixels from the render buffer. */
+ virtual bool should_use_graphics_interop()
+ {
+ return false;
+ }
+
+ /* Buffer denoising. */
+
+ /* Returns true if task is fully handled. */
+ virtual bool denoise_buffer(const DeviceDenoiseTask & /*task*/)
+ {
+ LOG(ERROR) << "Request buffer denoising from a device which does not support it.";
+ return false;
+ }
+
+ virtual DeviceQueue *get_denoise_queue()
+ {
+ LOG(ERROR) << "Request denoising queue from a device which does not support it.";
+ return nullptr;
+ }
+
+ /* Sub-devices */
+
+  /* Run the given callback for every individual device which will be handling rendering.
+   * For a single device the callback is called for the device itself. For a multi-device the
+   * callback is only called for the sub-devices. */
+ virtual void foreach_device(const function<void(Device *)> &callback)
+ {
+ callback(this);
+ }
+
/* static */
- static Device *create(DeviceInfo &info,
- Stats &stats,
- Profiler &profiler,
- bool background = true);
+ static Device *create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
static DeviceType type_from_string(const char *name);
static string string_from_type(DeviceType type);
@@ -499,9 +283,7 @@ class Device {
static thread_mutex device_mutex;
static vector<DeviceInfo> cuda_devices;
static vector<DeviceInfo> optix_devices;
- static vector<DeviceInfo> opencl_devices;
static vector<DeviceInfo> cpu_devices;
- static vector<DeviceInfo> network_devices;
static uint devices_initialized_mask;
};
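
The refactored interface above is considerably slimmer: task scheduling, tile mapping and OpenGL
drawing are gone, and a device is now created, asked to load kernels, and then driven through
queues and the denoising/graphics-interop hooks. The sketch below shows how a caller might drive
it; it is illustrative only (not part of this patch), and the helper name and the
`kernel_features` bitmask are assumed to be provided by the calling session code.

/* Hypothetical caller-side sketch: create a device and prepare it for rendering. */
static Device *create_and_prepare_device(const DeviceInfo &info,
                                         Stats &stats,
                                         Profiler &profiler,
                                         const uint kernel_features)
{
  Device *device = Device::create(info, stats, profiler);

  /* Kernels must be loaded before any work is scheduled on the device. */
  if (!device->load_kernels(kernel_features)) {
    delete device;
    return nullptr;
  }

  /* Visit the device itself, or each sub-device when this is a multi-device. */
  device->foreach_device([](Device *sub_device) {
    VLOG(1) << "Path tracing will run on " << sub_device->info.description;
  });

  return device;
}
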
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
deleted file mode 100644
index 4a6e77d6eaa..00000000000
--- a/intern/cycles/device/device_cpu.cpp
+++ /dev/null
@@ -1,1680 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <stdlib.h>
-#include <string.h>
-
-/* So ImathMath is included before our kernel_cpu_compat. */
-#ifdef WITH_OSL
-/* So no context pollution happens from indirectly included windows.h */
-# include "util/util_windows.h"
-# include <OSL/oslexec.h>
-#endif
-
-#ifdef WITH_EMBREE
-# include <embree3/rtcore.h>
-#endif
-
-#include "device/device.h"
-#include "device/device_denoising.h"
-#include "device/device_intern.h"
-#include "device/device_split_kernel.h"
-
-// clang-format off
-#include "kernel/kernel.h"
-#include "kernel/kernel_compat_cpu.h"
-#include "kernel/kernel_types.h"
-#include "kernel/split/kernel_split_data.h"
-#include "kernel/kernel_globals.h"
-#include "kernel/kernel_adaptive_sampling.h"
-
-#include "kernel/filter/filter.h"
-
-#include "kernel/osl/osl_shader.h"
-#include "kernel/osl/osl_globals.h"
-// clang-format on
-
-#include "bvh/bvh_embree.h"
-
-#include "render/buffers.h"
-#include "render/coverage.h"
-
-#include "util/util_debug.h"
-#include "util/util_foreach.h"
-#include "util/util_function.h"
-#include "util/util_logging.h"
-#include "util/util_map.h"
-#include "util/util_opengl.h"
-#include "util/util_openimagedenoise.h"
-#include "util/util_optimization.h"
-#include "util/util_progress.h"
-#include "util/util_system.h"
-#include "util/util_task.h"
-#include "util/util_thread.h"
-
-CCL_NAMESPACE_BEGIN
-
-class CPUDevice;
-
-/* Has to be outside of the class to be shared across template instantiations. */
-static const char *logged_architecture = "";
-
-template<typename F> class KernelFunctions {
- public:
- KernelFunctions()
- {
- kernel = (F)NULL;
- }
-
- KernelFunctions(
- F kernel_default, F kernel_sse2, F kernel_sse3, F kernel_sse41, F kernel_avx, F kernel_avx2)
- {
- const char *architecture_name = "default";
- kernel = kernel_default;
-
- /* Silence potential warnings about unused variables
- * when compiling without some architectures. */
- (void)kernel_sse2;
- (void)kernel_sse3;
- (void)kernel_sse41;
- (void)kernel_avx;
- (void)kernel_avx2;
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
- if (DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) {
- architecture_name = "AVX2";
- kernel = kernel_avx2;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
- if (DebugFlags().cpu.has_avx() && system_cpu_support_avx()) {
- architecture_name = "AVX";
- kernel = kernel_avx;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
- if (DebugFlags().cpu.has_sse41() && system_cpu_support_sse41()) {
- architecture_name = "SSE4.1";
- kernel = kernel_sse41;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
- if (DebugFlags().cpu.has_sse3() && system_cpu_support_sse3()) {
- architecture_name = "SSE3";
- kernel = kernel_sse3;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
- if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) {
- architecture_name = "SSE2";
- kernel = kernel_sse2;
- }
-#else
- {
- /* Dummy to prevent the architecture if below become
- * conditional when WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
- * is not defined. */
- }
-#endif
-
- if (strcmp(architecture_name, logged_architecture) != 0) {
- VLOG(1) << "Will be using " << architecture_name << " kernels.";
- logged_architecture = architecture_name;
- }
- }
-
- inline F operator()() const
- {
- assert(kernel);
- return kernel;
- }
-
- protected:
- F kernel;
-};
-
-class CPUSplitKernel : public DeviceSplitKernel {
- CPUDevice *device;
-
- public:
- explicit CPUSplitKernel(CPUDevice *device);
-
- virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim,
- RenderTile &rtile,
- int num_global_elements,
- device_memory &kernel_globals,
- device_memory &kernel_data_,
- device_memory &split_data,
- device_memory &ray_state,
- device_memory &queue_index,
- device_memory &use_queues_flag,
- device_memory &work_pool_wgs);
-
- virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name,
- const DeviceRequestedFeatures &);
- virtual int2 split_kernel_local_size();
- virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask &task);
- virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads);
-};
-
-class CPUDevice : public Device {
- public:
- TaskPool task_pool;
- KernelGlobals kernel_globals;
-
- device_vector<TextureInfo> texture_info;
- bool need_texture_info;
-
-#ifdef WITH_OSL
- OSLGlobals osl_globals;
-#endif
-#ifdef WITH_OPENIMAGEDENOISE
- oidn::DeviceRef oidn_device;
- oidn::FilterRef oidn_filter;
-#endif
- thread_spin_lock oidn_task_lock;
-#ifdef WITH_EMBREE
- RTCScene embree_scene = NULL;
- RTCDevice embree_device;
-#endif
-
- bool use_split_kernel;
-
- DeviceRequestedFeatures requested_features;
-
- KernelFunctions<void (*)(KernelGlobals *, float *, int, int, int, int, int)> path_trace_kernel;
- KernelFunctions<void (*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)>
- convert_to_half_float_kernel;
- KernelFunctions<void (*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)>
- convert_to_byte_kernel;
- KernelFunctions<void (*)(KernelGlobals *, uint4 *, float4 *, int, int, int, int, int)>
- shader_kernel;
- KernelFunctions<void (*)(KernelGlobals *, float *, int, int, int, int, int)> bake_kernel;
-
- KernelFunctions<void (*)(
- int, TileInfo *, int, int, float *, float *, float *, float *, float *, int *, int, int)>
- filter_divide_shadow_kernel;
- KernelFunctions<void (*)(
- int, TileInfo *, int, int, int, int, float *, float *, float, int *, int, int)>
- filter_get_feature_kernel;
- KernelFunctions<void (*)(int, int, int, int *, float *, float *, int, int *)>
- filter_write_feature_kernel;
- KernelFunctions<void (*)(int, int, float *, float *, float *, float *, int *, int)>
- filter_detect_outliers_kernel;
- KernelFunctions<void (*)(int, int, float *, float *, float *, float *, int *, int)>
- filter_combine_halves_kernel;
-
- KernelFunctions<void (*)(
- int, int, float *, float *, float *, float *, int *, int, int, int, float, float)>
- filter_nlm_calc_difference_kernel;
- KernelFunctions<void (*)(float *, float *, int *, int, int)> filter_nlm_blur_kernel;
- KernelFunctions<void (*)(float *, float *, int *, int, int)> filter_nlm_calc_weight_kernel;
- KernelFunctions<void (*)(
- int, int, float *, float *, float *, float *, float *, int *, int, int, int)>
- filter_nlm_update_output_kernel;
- KernelFunctions<void (*)(float *, float *, int *, int)> filter_nlm_normalize_kernel;
-
- KernelFunctions<void (*)(
- float *, TileInfo *, int, int, int, float *, int *, int *, int, int, bool, int, float)>
- filter_construct_transform_kernel;
- KernelFunctions<void (*)(int,
- int,
- int,
- float *,
- float *,
- float *,
- int *,
- float *,
- float3 *,
- int *,
- int *,
- int,
- int,
- int,
- int,
- bool)>
- filter_nlm_construct_gramian_kernel;
- KernelFunctions<void (*)(int, int, int, float *, int *, float *, float3 *, int *, int)>
- filter_finalize_kernel;
-
- KernelFunctions<void (*)(KernelGlobals *,
- ccl_constant KernelData *,
- ccl_global void *,
- int,
- ccl_global char *,
- int,
- int,
- int,
- int,
- int,
- int,
- int,
- int,
- ccl_global int *,
- int,
- ccl_global char *,
- ccl_global unsigned int *,
- unsigned int,
- ccl_global float *)>
- data_init_kernel;
- unordered_map<string, KernelFunctions<void (*)(KernelGlobals *, KernelData *)>> split_kernels;
-
-#define KERNEL_FUNCTIONS(name) \
- KERNEL_NAME_EVAL(cpu, name), KERNEL_NAME_EVAL(cpu_sse2, name), \
- KERNEL_NAME_EVAL(cpu_sse3, name), KERNEL_NAME_EVAL(cpu_sse41, name), \
- KERNEL_NAME_EVAL(cpu_avx, name), KERNEL_NAME_EVAL(cpu_avx2, name)
-
- CPUDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
- : Device(info_, stats_, profiler_, background_),
- texture_info(this, "__texture_info", MEM_GLOBAL),
-#define REGISTER_KERNEL(name) name##_kernel(KERNEL_FUNCTIONS(name))
- REGISTER_KERNEL(path_trace),
- REGISTER_KERNEL(convert_to_half_float),
- REGISTER_KERNEL(convert_to_byte),
- REGISTER_KERNEL(shader),
- REGISTER_KERNEL(bake),
- REGISTER_KERNEL(filter_divide_shadow),
- REGISTER_KERNEL(filter_get_feature),
- REGISTER_KERNEL(filter_write_feature),
- REGISTER_KERNEL(filter_detect_outliers),
- REGISTER_KERNEL(filter_combine_halves),
- REGISTER_KERNEL(filter_nlm_calc_difference),
- REGISTER_KERNEL(filter_nlm_blur),
- REGISTER_KERNEL(filter_nlm_calc_weight),
- REGISTER_KERNEL(filter_nlm_update_output),
- REGISTER_KERNEL(filter_nlm_normalize),
- REGISTER_KERNEL(filter_construct_transform),
- REGISTER_KERNEL(filter_nlm_construct_gramian),
- REGISTER_KERNEL(filter_finalize),
- REGISTER_KERNEL(data_init)
-#undef REGISTER_KERNEL
- {
- if (info.cpu_threads == 0) {
- info.cpu_threads = TaskScheduler::num_threads();
- }
-
-#ifdef WITH_OSL
- kernel_globals.osl = &osl_globals;
-#endif
-#ifdef WITH_EMBREE
- embree_device = rtcNewDevice("verbose=0");
-#endif
- use_split_kernel = DebugFlags().cpu.split_kernel;
- if (use_split_kernel) {
- VLOG(1) << "Will be using split kernel.";
- }
- need_texture_info = false;
-
-#define REGISTER_SPLIT_KERNEL(name) \
- split_kernels[#name] = KernelFunctions<void (*)(KernelGlobals *, KernelData *)>( \
- KERNEL_FUNCTIONS(name))
- REGISTER_SPLIT_KERNEL(path_init);
- REGISTER_SPLIT_KERNEL(scene_intersect);
- REGISTER_SPLIT_KERNEL(lamp_emission);
- REGISTER_SPLIT_KERNEL(do_volume);
- REGISTER_SPLIT_KERNEL(queue_enqueue);
- REGISTER_SPLIT_KERNEL(indirect_background);
- REGISTER_SPLIT_KERNEL(shader_setup);
- REGISTER_SPLIT_KERNEL(shader_sort);
- REGISTER_SPLIT_KERNEL(shader_eval);
- REGISTER_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao);
- REGISTER_SPLIT_KERNEL(subsurface_scatter);
- REGISTER_SPLIT_KERNEL(direct_lighting);
- REGISTER_SPLIT_KERNEL(shadow_blocked_ao);
- REGISTER_SPLIT_KERNEL(shadow_blocked_dl);
- REGISTER_SPLIT_KERNEL(enqueue_inactive);
- REGISTER_SPLIT_KERNEL(next_iteration_setup);
- REGISTER_SPLIT_KERNEL(indirect_subsurface);
- REGISTER_SPLIT_KERNEL(buffer_update);
- REGISTER_SPLIT_KERNEL(adaptive_stopping);
- REGISTER_SPLIT_KERNEL(adaptive_filter_x);
- REGISTER_SPLIT_KERNEL(adaptive_filter_y);
- REGISTER_SPLIT_KERNEL(adaptive_adjust_samples);
-#undef REGISTER_SPLIT_KERNEL
-#undef KERNEL_FUNCTIONS
- }
-
- ~CPUDevice()
- {
-#ifdef WITH_EMBREE
- rtcReleaseDevice(embree_device);
-#endif
- task_pool.cancel();
- texture_info.free();
- }
-
- virtual bool show_samples() const override
- {
- return (info.cpu_threads == 1);
- }
-
- virtual BVHLayoutMask get_bvh_layout_mask() const override
- {
- BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2;
-#ifdef WITH_EMBREE
- bvh_layout_mask |= BVH_LAYOUT_EMBREE;
-#endif /* WITH_EMBREE */
- return bvh_layout_mask;
- }
-
- void load_texture_info()
- {
- if (need_texture_info) {
- texture_info.copy_to_device();
- need_texture_info = false;
- }
- }
-
- virtual void mem_alloc(device_memory &mem) override
- {
- if (mem.type == MEM_TEXTURE) {
- assert(!"mem_alloc not supported for textures.");
- }
- else if (mem.type == MEM_GLOBAL) {
- assert(!"mem_alloc not supported for global memory.");
- }
- else {
- if (mem.name) {
- VLOG(1) << "Buffer allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
- }
-
- if (mem.type == MEM_DEVICE_ONLY || !mem.host_pointer) {
- size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES;
- void *data = util_aligned_malloc(mem.memory_size(), alignment);
- mem.device_pointer = (device_ptr)data;
- }
- else {
- mem.device_pointer = (device_ptr)mem.host_pointer;
- }
-
- mem.device_size = mem.memory_size();
- stats.mem_alloc(mem.device_size);
- }
- }
-
- virtual void mem_copy_to(device_memory &mem) override
- {
- if (mem.type == MEM_GLOBAL) {
- global_free(mem);
- global_alloc(mem);
- }
- else if (mem.type == MEM_TEXTURE) {
- tex_free((device_texture &)mem);
- tex_alloc((device_texture &)mem);
- }
- else if (mem.type == MEM_PIXELS) {
- assert(!"mem_copy_to not supported for pixels.");
- }
- else {
- if (!mem.device_pointer) {
- mem_alloc(mem);
- }
-
- /* copy is no-op */
- }
- }
-
- virtual void mem_copy_from(
- device_memory & /*mem*/, int /*y*/, int /*w*/, int /*h*/, int /*elem*/) override
- {
- /* no-op */
- }
-
- virtual void mem_zero(device_memory &mem) override
- {
- if (!mem.device_pointer) {
- mem_alloc(mem);
- }
-
- if (mem.device_pointer) {
- memset((void *)mem.device_pointer, 0, mem.memory_size());
- }
- }
-
- virtual void mem_free(device_memory &mem) override
- {
- if (mem.type == MEM_GLOBAL) {
- global_free(mem);
- }
- else if (mem.type == MEM_TEXTURE) {
- tex_free((device_texture &)mem);
- }
- else if (mem.device_pointer) {
- if (mem.type == MEM_DEVICE_ONLY || !mem.host_pointer) {
- util_aligned_free((void *)mem.device_pointer);
- }
- mem.device_pointer = 0;
- stats.mem_free(mem.device_size);
- mem.device_size = 0;
- }
- }
-
- virtual device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override
- {
- return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
- }
-
- virtual void const_copy_to(const char *name, void *host, size_t size) override
- {
-#if WITH_EMBREE
- if (strcmp(name, "__data") == 0) {
- assert(size <= sizeof(KernelData));
-
- // Update scene handle (since it is different for each device on multi devices)
- KernelData *const data = (KernelData *)host;
- data->bvh.scene = embree_scene;
- }
-#endif
- kernel_const_copy(&kernel_globals, name, host, size);
- }
-
- void global_alloc(device_memory &mem)
- {
- VLOG(1) << "Global memory allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
-
- kernel_global_memory_copy(&kernel_globals, mem.name, mem.host_pointer, mem.data_size);
-
- mem.device_pointer = (device_ptr)mem.host_pointer;
- mem.device_size = mem.memory_size();
- stats.mem_alloc(mem.device_size);
- }
-
- void global_free(device_memory &mem)
- {
- if (mem.device_pointer) {
- mem.device_pointer = 0;
- stats.mem_free(mem.device_size);
- mem.device_size = 0;
- }
- }
-
- void tex_alloc(device_texture &mem)
- {
- VLOG(1) << "Texture allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
-
- mem.device_pointer = (device_ptr)mem.host_pointer;
- mem.device_size = mem.memory_size();
- stats.mem_alloc(mem.device_size);
-
- const uint slot = mem.slot;
- if (slot >= texture_info.size()) {
- /* Allocate some slots in advance, to reduce amount of re-allocations. */
- texture_info.resize(slot + 128);
- }
-
- texture_info[slot] = mem.info;
- texture_info[slot].data = (uint64_t)mem.host_pointer;
- need_texture_info = true;
- }
-
- void tex_free(device_texture &mem)
- {
- if (mem.device_pointer) {
- mem.device_pointer = 0;
- stats.mem_free(mem.device_size);
- mem.device_size = 0;
- need_texture_info = true;
- }
- }
-
- virtual void *osl_memory() override
- {
-#ifdef WITH_OSL
- return &osl_globals;
-#else
- return NULL;
-#endif
- }
-
- void build_bvh(BVH *bvh, Progress &progress, bool refit) override
- {
-#ifdef WITH_EMBREE
- if (bvh->params.bvh_layout == BVH_LAYOUT_EMBREE ||
- bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE) {
- BVHEmbree *const bvh_embree = static_cast<BVHEmbree *>(bvh);
- if (refit) {
- bvh_embree->refit(progress);
- }
- else {
- bvh_embree->build(progress, &stats, embree_device);
- }
-
- if (bvh->params.top_level) {
- embree_scene = bvh_embree->scene;
- }
- }
- else
-#endif
- Device::build_bvh(bvh, progress, refit);
- }
-
- void thread_run(DeviceTask &task)
- {
- if (task.type == DeviceTask::RENDER)
- thread_render(task);
- else if (task.type == DeviceTask::SHADER)
- thread_shader(task);
- else if (task.type == DeviceTask::FILM_CONVERT)
- thread_film_convert(task);
- else if (task.type == DeviceTask::DENOISE_BUFFER)
- thread_denoise(task);
- }
-
- bool denoising_non_local_means(device_ptr image_ptr,
- device_ptr guide_ptr,
- device_ptr variance_ptr,
- device_ptr out_ptr,
- DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_NON_LOCAL_MEANS);
-
- int4 rect = task->rect;
- int r = task->nlm_state.r;
- int f = task->nlm_state.f;
- float a = task->nlm_state.a;
- float k_2 = task->nlm_state.k_2;
-
- int w = align_up(rect.z - rect.x, 4);
- int h = rect.w - rect.y;
- int stride = task->buffer.stride;
- int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;
-
- float *temporary_mem = (float *)task->buffer.temporary_mem.device_pointer;
- float *blurDifference = temporary_mem;
- float *difference = temporary_mem + task->buffer.pass_stride;
- float *weightAccum = temporary_mem + 2 * task->buffer.pass_stride;
-
- memset(weightAccum, 0, sizeof(float) * w * h);
- memset((float *)out_ptr, 0, sizeof(float) * w * h);
-
- for (int i = 0; i < (2 * r + 1) * (2 * r + 1); i++) {
- int dy = i / (2 * r + 1) - r;
- int dx = i % (2 * r + 1) - r;
-
- int local_rect[4] = {
- max(0, -dx), max(0, -dy), rect.z - rect.x - max(0, dx), rect.w - rect.y - max(0, dy)};
- filter_nlm_calc_difference_kernel()(dx,
- dy,
- (float *)guide_ptr,
- (float *)variance_ptr,
- NULL,
- difference,
- local_rect,
- w,
- channel_offset,
- 0,
- a,
- k_2);
-
- filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f);
- filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f);
- filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f);
-
- filter_nlm_update_output_kernel()(dx,
- dy,
- blurDifference,
- (float *)image_ptr,
- difference,
- (float *)out_ptr,
- weightAccum,
- local_rect,
- channel_offset,
- stride,
- f);
- }
-
- int local_rect[4] = {0, 0, rect.z - rect.x, rect.w - rect.y};
- filter_nlm_normalize_kernel()((float *)out_ptr, weightAccum, local_rect, w);
-
- return true;
- }
-
- bool denoising_construct_transform(DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_CONSTRUCT_TRANSFORM);
-
- for (int y = 0; y < task->filter_area.w; y++) {
- for (int x = 0; x < task->filter_area.z; x++) {
- filter_construct_transform_kernel()((float *)task->buffer.mem.device_pointer,
- task->tile_info,
- x + task->filter_area.x,
- y + task->filter_area.y,
- y * task->filter_area.z + x,
- (float *)task->storage.transform.device_pointer,
- (int *)task->storage.rank.device_pointer,
- &task->rect.x,
- task->buffer.pass_stride,
- task->buffer.frame_stride,
- task->buffer.use_time,
- task->radius,
- task->pca_threshold);
- }
- }
- return true;
- }
-
- bool denoising_accumulate(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr scale_ptr,
- int frame,
- DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_RECONSTRUCT);
-
- float *temporary_mem = (float *)task->buffer.temporary_mem.device_pointer;
- float *difference = temporary_mem;
- float *blurDifference = temporary_mem + task->buffer.pass_stride;
-
- int r = task->radius;
- int frame_offset = frame * task->buffer.frame_stride;
- for (int i = 0; i < (2 * r + 1) * (2 * r + 1); i++) {
- int dy = i / (2 * r + 1) - r;
- int dx = i % (2 * r + 1) - r;
-
- int local_rect[4] = {max(0, -dx),
- max(0, -dy),
- task->reconstruction_state.source_w - max(0, dx),
- task->reconstruction_state.source_h - max(0, dy)};
- filter_nlm_calc_difference_kernel()(dx,
- dy,
- (float *)color_ptr,
- (float *)color_variance_ptr,
- (float *)scale_ptr,
- difference,
- local_rect,
- task->buffer.stride,
- task->buffer.pass_stride,
- frame_offset,
- 1.0f,
- task->nlm_k_2);
- filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
- filter_nlm_calc_weight_kernel()(
- blurDifference, difference, local_rect, task->buffer.stride, 4);
- filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
- filter_nlm_construct_gramian_kernel()(dx,
- dy,
- task->tile_info->frames[frame],
- blurDifference,
- (float *)task->buffer.mem.device_pointer,
- (float *)task->storage.transform.device_pointer,
- (int *)task->storage.rank.device_pointer,
- (float *)task->storage.XtWX.device_pointer,
- (float3 *)task->storage.XtWY.device_pointer,
- local_rect,
- &task->reconstruction_state.filter_window.x,
- task->buffer.stride,
- 4,
- task->buffer.pass_stride,
- frame_offset,
- task->buffer.use_time);
- }
-
- return true;
- }
-
- bool denoising_solve(device_ptr output_ptr, DenoisingTask *task)
- {
- for (int y = 0; y < task->filter_area.w; y++) {
- for (int x = 0; x < task->filter_area.z; x++) {
- filter_finalize_kernel()(x,
- y,
- y * task->filter_area.z + x,
- (float *)output_ptr,
- (int *)task->storage.rank.device_pointer,
- (float *)task->storage.XtWX.device_pointer,
- (float3 *)task->storage.XtWY.device_pointer,
- &task->reconstruction_state.buffer_params.x,
- task->render_buffer.samples);
- }
- }
- return true;
- }
-
- bool denoising_combine_halves(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- int r,
- int4 rect,
- DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_COMBINE_HALVES);
-
- for (int y = rect.y; y < rect.w; y++) {
- for (int x = rect.x; x < rect.z; x++) {
- filter_combine_halves_kernel()(x,
- y,
- (float *)mean_ptr,
- (float *)variance_ptr,
- (float *)a_ptr,
- (float *)b_ptr,
- &rect.x,
- r);
- }
- }
- return true;
- }
-
- bool denoising_divide_shadow(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr sample_variance_ptr,
- device_ptr sv_variance_ptr,
- device_ptr buffer_variance_ptr,
- DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DIVIDE_SHADOW);
-
- for (int y = task->rect.y; y < task->rect.w; y++) {
- for (int x = task->rect.x; x < task->rect.z; x++) {
- filter_divide_shadow_kernel()(task->render_buffer.samples,
- task->tile_info,
- x,
- y,
- (float *)a_ptr,
- (float *)b_ptr,
- (float *)sample_variance_ptr,
- (float *)sv_variance_ptr,
- (float *)buffer_variance_ptr,
- &task->rect.x,
- task->render_buffer.pass_stride,
- task->render_buffer.offset);
- }
- }
- return true;
- }
-
- bool denoising_get_feature(int mean_offset,
- int variance_offset,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- float scale,
- DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_GET_FEATURE);
-
- for (int y = task->rect.y; y < task->rect.w; y++) {
- for (int x = task->rect.x; x < task->rect.z; x++) {
- filter_get_feature_kernel()(task->render_buffer.samples,
- task->tile_info,
- mean_offset,
- variance_offset,
- x,
- y,
- (float *)mean_ptr,
- (float *)variance_ptr,
- scale,
- &task->rect.x,
- task->render_buffer.pass_stride,
- task->render_buffer.offset);
- }
- }
- return true;
- }
-
- bool denoising_write_feature(int out_offset,
- device_ptr from_ptr,
- device_ptr buffer_ptr,
- DenoisingTask *task)
- {
- for (int y = 0; y < task->filter_area.w; y++) {
- for (int x = 0; x < task->filter_area.z; x++) {
- filter_write_feature_kernel()(task->render_buffer.samples,
- x + task->filter_area.x,
- y + task->filter_area.y,
- &task->reconstruction_state.buffer_params.x,
- (float *)from_ptr,
- (float *)buffer_ptr,
- out_offset,
- &task->rect.x);
- }
- }
- return true;
- }
-
- bool denoising_detect_outliers(device_ptr image_ptr,
- device_ptr variance_ptr,
- device_ptr depth_ptr,
- device_ptr output_ptr,
- DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DETECT_OUTLIERS);
-
- for (int y = task->rect.y; y < task->rect.w; y++) {
- for (int x = task->rect.x; x < task->rect.z; x++) {
- filter_detect_outliers_kernel()(x,
- y,
- (float *)image_ptr,
- (float *)variance_ptr,
- (float *)depth_ptr,
- (float *)output_ptr,
- &task->rect.x,
- task->buffer.pass_stride);
- }
- }
- return true;
- }
-
- bool adaptive_sampling_filter(KernelGlobals *kg, RenderTile &tile, int sample)
- {
- WorkTile wtile;
- wtile.x = tile.x;
- wtile.y = tile.y;
- wtile.w = tile.w;
- wtile.h = tile.h;
- wtile.offset = tile.offset;
- wtile.stride = tile.stride;
- wtile.buffer = (float *)tile.buffer;
-
- /* For CPU we do adaptive stopping per sample so we can stop earlier, but
- * for combined CPU + GPU rendering we match the GPU and do it per tile
- * after a given number of sample steps. */
- if (!kernel_data.integrator.adaptive_stop_per_sample) {
- for (int y = wtile.y; y < wtile.y + wtile.h; ++y) {
- for (int x = wtile.x; x < wtile.x + wtile.w; ++x) {
- const int index = wtile.offset + x + y * wtile.stride;
- float *buffer = wtile.buffer + index * kernel_data.film.pass_stride;
- kernel_do_adaptive_stopping(kg, buffer, sample);
- }
- }
- }
-
- bool any = false;
- for (int y = wtile.y; y < wtile.y + wtile.h; ++y) {
- any |= kernel_do_adaptive_filter_x(kg, y, &wtile);
- }
- for (int x = wtile.x; x < wtile.x + wtile.w; ++x) {
- any |= kernel_do_adaptive_filter_y(kg, x, &wtile);
- }
- return (!any);
- }
-
- void adaptive_sampling_post(const RenderTile &tile, KernelGlobals *kg)
- {
- float *render_buffer = (float *)tile.buffer;
- for (int y = tile.y; y < tile.y + tile.h; y++) {
- for (int x = tile.x; x < tile.x + tile.w; x++) {
- int index = tile.offset + x + y * tile.stride;
- ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride;
- if (buffer[kernel_data.film.pass_sample_count] < 0.0f) {
- buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
- float sample_multiplier = tile.sample / buffer[kernel_data.film.pass_sample_count];
- if (sample_multiplier != 1.0f) {
- kernel_adaptive_post_adjust(kg, buffer, sample_multiplier);
- }
- }
- else {
- kernel_adaptive_post_adjust(kg, buffer, tile.sample / (tile.sample - 1.0f));
- }
- }
- }
- }
-
- void render(DeviceTask &task, RenderTile &tile, KernelGlobals *kg)
- {
- const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE;
-
- scoped_timer timer(&tile.buffers->render_time);
-
- Coverage coverage(kg, tile);
- if (use_coverage) {
- coverage.init_path_trace();
- }
-
- float *render_buffer = (float *)tile.buffer;
- int start_sample = tile.start_sample;
- int end_sample = tile.start_sample + tile.num_samples;
-
- /* Needed for Embree. */
- SIMD_SET_FLUSH_TO_ZERO;
-
- for (int sample = start_sample; sample < end_sample; sample++) {
- if (task.get_cancel() || TaskPool::canceled()) {
- if (task.need_finish_queue == false)
- break;
- }
-
- if (tile.stealing_state == RenderTile::CAN_BE_STOLEN && task.get_tile_stolen()) {
- tile.stealing_state = RenderTile::WAS_STOLEN;
- break;
- }
-
- if (tile.task == RenderTile::PATH_TRACE) {
- for (int y = tile.y; y < tile.y + tile.h; y++) {
- for (int x = tile.x; x < tile.x + tile.w; x++) {
- if (use_coverage) {
- coverage.init_pixel(x, y);
- }
- path_trace_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride);
- }
- }
- }
- else {
- for (int y = tile.y; y < tile.y + tile.h; y++) {
- for (int x = tile.x; x < tile.x + tile.w; x++) {
- bake_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride);
- }
- }
- }
- tile.sample = sample + 1;
-
- if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(sample)) {
- const bool stop = adaptive_sampling_filter(kg, tile, sample);
- if (stop) {
- const int num_progress_samples = end_sample - sample;
- tile.sample = end_sample;
- task.update_progress(&tile, tile.w * tile.h * num_progress_samples);
- break;
- }
- }
-
- task.update_progress(&tile, tile.w * tile.h);
- }
- if (use_coverage) {
- coverage.finalize();
- }
-
- if (task.adaptive_sampling.use && (tile.stealing_state != RenderTile::WAS_STOLEN)) {
- adaptive_sampling_post(tile, kg);
- }
- }
-
- void denoise_openimagedenoise_buffer(DeviceTask &task,
- float *buffer,
- const size_t offset,
- const size_t stride,
- const size_t x,
- const size_t y,
- const size_t w,
- const size_t h,
- const float scale)
- {
-#ifdef WITH_OPENIMAGEDENOISE
- assert(openimagedenoise_supported());
-
- /* Only one at a time, since OpenImageDenoise itself is multithreaded for full
- * buffers, and for tiled rendering because creating multiple devices and filters
- * is slow and memory hungry as well.
- *
- * TODO: optimize tiled rendering case, by batching together denoising of many
- * tiles somehow? */
- static thread_mutex mutex;
- thread_scoped_lock lock(mutex);
-
- /* Create device and filter, cached for reuse. */
- if (!oidn_device) {
- oidn_device = oidn::newDevice();
- oidn_device.commit();
- }
- if (!oidn_filter) {
- oidn_filter = oidn_device.newFilter("RT");
- oidn_filter.set("hdr", true);
- oidn_filter.set("srgb", false);
- }
-
- /* Set images with appropriate stride for our interleaved pass storage. */
- struct {
- const char *name;
- const int offset;
- const bool scale;
- const bool use;
- array<float> scaled_buffer;
- } passes[] = {{"color", task.pass_denoising_data + DENOISING_PASS_COLOR, false, true},
- {"albedo",
- task.pass_denoising_data + DENOISING_PASS_ALBEDO,
- true,
- task.denoising.input_passes >= DENOISER_INPUT_RGB_ALBEDO},
- {"normal",
- task.pass_denoising_data + DENOISING_PASS_NORMAL,
- true,
- task.denoising.input_passes >= DENOISER_INPUT_RGB_ALBEDO_NORMAL},
- {"output", 0, false, true},
- { NULL,
- 0 }};
-
- for (int i = 0; passes[i].name; i++) {
- if (!passes[i].use) {
- continue;
- }
-
- const int64_t pixel_offset = offset + x + y * stride;
- const int64_t buffer_offset = (pixel_offset * task.pass_stride + passes[i].offset);
- const int64_t pixel_stride = task.pass_stride;
- const int64_t row_stride = stride * pixel_stride;
-
- if (passes[i].scale && scale != 1.0f) {
- /* Normalize albedo and normal passes as they are scaled by the number of samples.
- * For the color passes OIDN will perform auto-exposure making it unnecessary. */
- array<float> &scaled_buffer = passes[i].scaled_buffer;
- scaled_buffer.resize(w * h * 3);
-
- for (int y = 0; y < h; y++) {
- const float *pass_row = buffer + buffer_offset + y * row_stride;
- float *scaled_row = scaled_buffer.data() + y * w * 3;
-
- for (int x = 0; x < w; x++) {
- scaled_row[x * 3 + 0] = pass_row[x * pixel_stride + 0] * scale;
- scaled_row[x * 3 + 1] = pass_row[x * pixel_stride + 1] * scale;
- scaled_row[x * 3 + 2] = pass_row[x * pixel_stride + 2] * scale;
- }
- }
-
- oidn_filter.setImage(
- passes[i].name, scaled_buffer.data(), oidn::Format::Float3, w, h, 0, 0, 0);
- }
- else {
- oidn_filter.setImage(passes[i].name,
- buffer + buffer_offset,
- oidn::Format::Float3,
- w,
- h,
- 0,
- pixel_stride * sizeof(float),
- row_stride * sizeof(float));
- }
- }
-
- /* Execute filter. */
- oidn_filter.commit();
- oidn_filter.execute();
-#else
- (void)task;
- (void)buffer;
- (void)offset;
- (void)stride;
- (void)x;
- (void)y;
- (void)w;
- (void)h;
- (void)scale;
-#endif
- }
-
- void denoise_openimagedenoise(DeviceTask &task, RenderTile &rtile)
- {
- if (task.type == DeviceTask::DENOISE_BUFFER) {
- /* Copy pixels from compute device to CPU (no-op for CPU device). */
- rtile.buffers->buffer.copy_from_device();
-
- denoise_openimagedenoise_buffer(task,
- (float *)rtile.buffer,
- rtile.offset,
- rtile.stride,
- rtile.x,
- rtile.y,
- rtile.w,
- rtile.h,
- 1.0f / rtile.sample);
-
- /* todo: it may be possible to avoid this copy, but we have to ensure that
- * when other code copies data from the device it doesn't overwrite the
- * denoiser buffers. */
- rtile.buffers->buffer.copy_to_device();
- }
- else {
- /* Per-tile denoising. */
- rtile.sample = rtile.start_sample + rtile.num_samples;
- const float scale = 1.0f / rtile.sample;
- const float invscale = rtile.sample;
- const size_t pass_stride = task.pass_stride;
-
- /* Map neighboring tiles into one buffer for denoising. */
- RenderTileNeighbors neighbors(rtile);
- task.map_neighbor_tiles(neighbors, this);
- RenderTile &center_tile = neighbors.tiles[RenderTileNeighbors::CENTER];
- rtile = center_tile;
-
- /* Calculate size of the tile to denoise (including overlap). The overlap
- * size was chosen empirically. OpenImageDenoise specifies an overlap size
- * of 128 but this is significantly bigger than typical tile size. */
- const int4 rect = rect_clip(rect_expand(center_tile.bounds(), 64), neighbors.bounds());
- const int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y);
-
- /* Adjacent tiles are in separate memory regions, copy into single buffer. */
- array<float> merged(rect_size.x * rect_size.y * task.pass_stride);
-
- for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
- RenderTile &ntile = neighbors.tiles[i];
- if (!ntile.buffer) {
- continue;
- }
-
- const int xmin = max(ntile.x, rect.x);
- const int ymin = max(ntile.y, rect.y);
- const int xmax = min(ntile.x + ntile.w, rect.z);
- const int ymax = min(ntile.y + ntile.h, rect.w);
-
- const size_t tile_offset = ntile.offset + xmin + ymin * ntile.stride;
- const float *tile_buffer = (float *)ntile.buffer + tile_offset * pass_stride;
-
- const size_t merged_stride = rect_size.x;
- const size_t merged_offset = (xmin - rect.x) + (ymin - rect.y) * merged_stride;
- float *merged_buffer = merged.data() + merged_offset * pass_stride;
-
- for (int y = ymin; y < ymax; y++) {
- for (int x = 0; x < pass_stride * (xmax - xmin); x++) {
- merged_buffer[x] = tile_buffer[x] * scale;
- }
- tile_buffer += ntile.stride * pass_stride;
- merged_buffer += merged_stride * pass_stride;
- }
- }
-
- /* Denoise */
- denoise_openimagedenoise_buffer(
- task, merged.data(), 0, rect_size.x, 0, 0, rect_size.x, rect_size.y, 1.0f);
-
- /* Copy back result from merged buffer. */
- RenderTile &ntile = neighbors.target;
- if (ntile.buffer) {
- const int xmin = max(ntile.x, rect.x);
- const int ymin = max(ntile.y, rect.y);
- const int xmax = min(ntile.x + ntile.w, rect.z);
- const int ymax = min(ntile.y + ntile.h, rect.w);
-
- const size_t tile_offset = ntile.offset + xmin + ymin * ntile.stride;
- float *tile_buffer = (float *)ntile.buffer + tile_offset * pass_stride;
-
- const size_t merged_stride = rect_size.x;
- const size_t merged_offset = (xmin - rect.x) + (ymin - rect.y) * merged_stride;
- const float *merged_buffer = merged.data() + merged_offset * pass_stride;
-
- for (int y = ymin; y < ymax; y++) {
- for (int x = 0; x < pass_stride * (xmax - xmin); x += pass_stride) {
- tile_buffer[x + 0] = merged_buffer[x + 0] * invscale;
- tile_buffer[x + 1] = merged_buffer[x + 1] * invscale;
- tile_buffer[x + 2] = merged_buffer[x + 2] * invscale;
- }
- tile_buffer += ntile.stride * pass_stride;
- merged_buffer += merged_stride * pass_stride;
- }
- }
-
- task.unmap_neighbor_tiles(neighbors, this);
- }
- }
-
- void denoise_nlm(DenoisingTask &denoising, RenderTile &tile)
- {
- ProfilingHelper profiling(denoising.profiler, PROFILING_DENOISING);
-
- tile.sample = tile.start_sample + tile.num_samples;
-
- denoising.functions.construct_transform = function_bind(
- &CPUDevice::denoising_construct_transform, this, &denoising);
- denoising.functions.accumulate = function_bind(
- &CPUDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
- denoising.functions.solve = function_bind(&CPUDevice::denoising_solve, this, _1, &denoising);
- denoising.functions.divide_shadow = function_bind(
- &CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.non_local_means = function_bind(
- &CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
- denoising.functions.combine_halves = function_bind(
- &CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
- denoising.functions.get_feature = function_bind(
- &CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.write_feature = function_bind(
- &CPUDevice::denoising_write_feature, this, _1, _2, _3, &denoising);
- denoising.functions.detect_outliers = function_bind(
- &CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
-
- denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h);
- denoising.render_buffer.samples = tile.sample;
- denoising.buffer.gpu_temporary_mem = false;
-
- denoising.run_denoising(tile);
- }
-
- void thread_render(DeviceTask &task)
- {
- if (TaskPool::canceled()) {
- if (task.need_finish_queue == false)
- return;
- }
-
- /* allocate buffer for kernel globals */
- device_only_memory<KernelGlobals> kgbuffer(this, "kernel_globals");
- kgbuffer.alloc_to_device(1);
-
- KernelGlobals *kg = new ((void *)kgbuffer.device_pointer)
- KernelGlobals(thread_kernel_globals_init());
-
- profiler.add_state(&kg->profiler);
-
- CPUSplitKernel *split_kernel = NULL;
- if (use_split_kernel) {
- split_kernel = new CPUSplitKernel(this);
- if (!split_kernel->load_kernels(requested_features)) {
- thread_kernel_globals_free((KernelGlobals *)kgbuffer.device_pointer);
- kgbuffer.free();
- delete split_kernel;
- return;
- }
- }
-
- /* NLM denoiser. */
- DenoisingTask *denoising = NULL;
-
- /* OpenImageDenoise: we can only denoise with one thread at a time, so to
- * avoid waiting with mutex locks in the denoiser, we let only a single
- * thread acquire denoising tiles. */
- uint tile_types = task.tile_types;
- bool hold_denoise_lock = false;
- if ((tile_types & RenderTile::DENOISE) && task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
- if (!oidn_task_lock.try_lock()) {
- tile_types &= ~RenderTile::DENOISE;
- hold_denoise_lock = true;
- }
- }
-
- RenderTile tile;
- while (task.acquire_tile(this, tile, tile_types)) {
- if (tile.task == RenderTile::PATH_TRACE) {
- if (use_split_kernel) {
- device_only_memory<uchar> void_buffer(this, "void_buffer");
- split_kernel->path_trace(task, tile, kgbuffer, void_buffer);
- }
- else {
- render(task, tile, kg);
- }
- }
- else if (tile.task == RenderTile::BAKE) {
- render(task, tile, kg);
- }
- else if (tile.task == RenderTile::DENOISE) {
- if (task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
- denoise_openimagedenoise(task, tile);
- }
- else if (task.denoising.type == DENOISER_NLM) {
- if (denoising == NULL) {
- denoising = new DenoisingTask(this, task);
- denoising->profiler = &kg->profiler;
- }
- denoise_nlm(*denoising, tile);
- }
- task.update_progress(&tile, tile.w * tile.h);
- }
-
- task.release_tile(tile);
-
- if (TaskPool::canceled()) {
- if (task.need_finish_queue == false)
- break;
- }
- }
-
- if (hold_denoise_lock) {
- oidn_task_lock.unlock();
- }
-
- profiler.remove_state(&kg->profiler);
-
- thread_kernel_globals_free((KernelGlobals *)kgbuffer.device_pointer);
- kg->~KernelGlobals();
- kgbuffer.free();
- delete split_kernel;
- delete denoising;
- }
-
- void thread_denoise(DeviceTask &task)
- {
- RenderTile tile;
- tile.x = task.x;
- tile.y = task.y;
- tile.w = task.w;
- tile.h = task.h;
- tile.buffer = task.buffer;
- tile.sample = task.sample + task.num_samples;
- tile.num_samples = task.num_samples;
- tile.start_sample = task.sample;
- tile.offset = task.offset;
- tile.stride = task.stride;
- tile.buffers = task.buffers;
-
- if (task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
- denoise_openimagedenoise(task, tile);
- }
- else {
- DenoisingTask denoising(this, task);
-
- ProfilingState denoising_profiler_state;
- profiler.add_state(&denoising_profiler_state);
- denoising.profiler = &denoising_profiler_state;
-
- denoise_nlm(denoising, tile);
-
- profiler.remove_state(&denoising_profiler_state);
- }
-
- task.update_progress(&tile, tile.w * tile.h);
- }
-
- void thread_film_convert(DeviceTask &task)
- {
- float sample_scale = 1.0f / (task.sample + 1);
-
- if (task.rgba_half) {
- for (int y = task.y; y < task.y + task.h; y++)
- for (int x = task.x; x < task.x + task.w; x++)
- convert_to_half_float_kernel()(&kernel_globals,
- (uchar4 *)task.rgba_half,
- (float *)task.buffer,
- sample_scale,
- x,
- y,
- task.offset,
- task.stride);
- }
- else {
- for (int y = task.y; y < task.y + task.h; y++)
- for (int x = task.x; x < task.x + task.w; x++)
- convert_to_byte_kernel()(&kernel_globals,
- (uchar4 *)task.rgba_byte,
- (float *)task.buffer,
- sample_scale,
- x,
- y,
- task.offset,
- task.stride);
- }
- }
-
- void thread_shader(DeviceTask &task)
- {
- KernelGlobals *kg = new KernelGlobals(thread_kernel_globals_init());
-
- for (int sample = 0; sample < task.num_samples; sample++) {
- for (int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
- shader_kernel()(kg,
- (uint4 *)task.shader_input,
- (float4 *)task.shader_output,
- task.shader_eval_type,
- task.shader_filter,
- x,
- task.offset,
- sample);
-
- if (task.get_cancel() || TaskPool::canceled())
- break;
-
- task.update_progress(NULL);
- }
-
- thread_kernel_globals_free(kg);
- delete kg;
- }
-
- virtual int get_split_task_count(DeviceTask &task) override
- {
- if (task.type == DeviceTask::SHADER)
- return task.get_subtask_count(info.cpu_threads, 256);
- else
- return task.get_subtask_count(info.cpu_threads);
- }
-
- virtual void task_add(DeviceTask &task) override
- {
- /* Load texture info. */
- load_texture_info();
-
- /* split task into smaller ones */
- list<DeviceTask> tasks;
-
- if (task.type == DeviceTask::DENOISE_BUFFER &&
- task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
- /* Denoise entire buffer at once with OIDN, it has own threading. */
- tasks.push_back(task);
- }
- else if (task.type == DeviceTask::SHADER) {
- task.split(tasks, info.cpu_threads, 256);
- }
- else {
- task.split(tasks, info.cpu_threads);
- }
-
- foreach (DeviceTask &task, tasks) {
- task_pool.push([=] {
- DeviceTask task_copy = task;
- thread_run(task_copy);
- });
- }
- }
-
- virtual void task_wait() override
- {
- task_pool.wait_work();
- }
-
- virtual void task_cancel() override
- {
- task_pool.cancel();
- }
-
- protected:
- inline KernelGlobals thread_kernel_globals_init()
- {
- KernelGlobals kg = kernel_globals;
- kg.transparent_shadow_intersections = NULL;
- const int decoupled_count = sizeof(kg.decoupled_volume_steps) /
- sizeof(*kg.decoupled_volume_steps);
- for (int i = 0; i < decoupled_count; ++i) {
- kg.decoupled_volume_steps[i] = NULL;
- }
- kg.decoupled_volume_steps_index = 0;
- kg.coverage_asset = kg.coverage_object = kg.coverage_material = NULL;
-#ifdef WITH_OSL
- OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
-#endif
- return kg;
- }
-
- inline void thread_kernel_globals_free(KernelGlobals *kg)
- {
- if (kg == NULL) {
- return;
- }
-
- if (kg->transparent_shadow_intersections != NULL) {
- free(kg->transparent_shadow_intersections);
- }
- const int decoupled_count = sizeof(kg->decoupled_volume_steps) /
- sizeof(*kg->decoupled_volume_steps);
- for (int i = 0; i < decoupled_count; ++i) {
- if (kg->decoupled_volume_steps[i] != NULL) {
- free(kg->decoupled_volume_steps[i]);
- }
- }
-#ifdef WITH_OSL
- OSLShader::thread_free(kg);
-#endif
- }
-
- virtual bool load_kernels(const DeviceRequestedFeatures &requested_features_) override
- {
- requested_features = requested_features_;
-
- return true;
- }
-};
-
-/* split kernel */
-
-class CPUSplitKernelFunction : public SplitKernelFunction {
- public:
- CPUDevice *device;
- void (*func)(KernelGlobals *kg, KernelData *data);
-
- CPUSplitKernelFunction(CPUDevice *device) : device(device), func(NULL)
- {
- }
- ~CPUSplitKernelFunction()
- {
- }
-
- virtual bool enqueue(const KernelDimensions &dim,
- device_memory &kernel_globals,
- device_memory &data)
- {
- if (!func) {
- return false;
- }
-
- KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer;
- kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
-
- for (int y = 0; y < dim.global_size[1]; y++) {
- for (int x = 0; x < dim.global_size[0]; x++) {
- kg->global_id = make_int2(x, y);
-
- func(kg, (KernelData *)data.device_pointer);
- }
- }
-
- return true;
- }
-};
-
-CPUSplitKernel::CPUSplitKernel(CPUDevice *device) : DeviceSplitKernel(device), device(device)
-{
-}
-
-bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim,
- RenderTile &rtile,
- int num_global_elements,
- device_memory &kernel_globals,
- device_memory &data,
- device_memory &split_data,
- device_memory &ray_state,
- device_memory &queue_index,
- device_memory &use_queues_flags,
- device_memory &work_pool_wgs)
-{
- KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer;
- kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
-
- for (int y = 0; y < dim.global_size[1]; y++) {
- for (int x = 0; x < dim.global_size[0]; x++) {
- kg->global_id = make_int2(x, y);
-
- device->data_init_kernel()((KernelGlobals *)kernel_globals.device_pointer,
- (KernelData *)data.device_pointer,
- (void *)split_data.device_pointer,
- num_global_elements,
- (char *)ray_state.device_pointer,
- rtile.start_sample,
- rtile.start_sample + rtile.num_samples,
- rtile.x,
- rtile.y,
- rtile.w,
- rtile.h,
- rtile.offset,
- rtile.stride,
- (int *)queue_index.device_pointer,
- dim.global_size[0] * dim.global_size[1],
- (char *)use_queues_flags.device_pointer,
- (uint *)work_pool_wgs.device_pointer,
- rtile.num_samples,
- (float *)rtile.buffer);
- }
- }
-
- return true;
-}
-
-SplitKernelFunction *CPUSplitKernel::get_split_kernel_function(const string &kernel_name,
- const DeviceRequestedFeatures &)
-{
- CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device);
-
- kernel->func = device->split_kernels[kernel_name]();
- if (!kernel->func) {
- delete kernel;
- return NULL;
- }
-
- return kernel;
-}
-
-int2 CPUSplitKernel::split_kernel_local_size()
-{
- return make_int2(1, 1);
-}
-
-int2 CPUSplitKernel::split_kernel_global_size(device_memory & /*kg*/,
- device_memory & /*data*/,
- DeviceTask & /*task*/)
-{
- return make_int2(1, 1);
-}
-
-uint64_t CPUSplitKernel::state_buffer_size(device_memory &kernel_globals,
- device_memory & /*data*/,
- size_t num_threads)
-{
- KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer;
-
- return split_data_buffer_size(kg, num_threads);
-}
-
-Device *device_cpu_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
-{
- return new CPUDevice(info, stats, profiler, background);
-}
-
-void device_cpu_info(vector<DeviceInfo> &devices)
-{
- DeviceInfo info;
-
- info.type = DEVICE_CPU;
- info.description = system_cpu_brand_string();
- info.id = "CPU";
- info.num = 0;
- info.has_volume_decoupled = true;
- info.has_adaptive_stop_per_sample = true;
- info.has_osl = true;
- info.has_half_images = true;
- info.has_nanovdb = true;
- info.has_profiling = true;
- info.denoisers = DENOISER_NLM;
- if (openimagedenoise_supported()) {
- info.denoisers |= DENOISER_OPENIMAGEDENOISE;
- }
-
- devices.insert(devices.begin(), info);
-}
-
-string device_cpu_capabilities()
-{
- string capabilities = "";
- capabilities += system_cpu_support_sse2() ? "SSE2 " : "";
- capabilities += system_cpu_support_sse3() ? "SSE3 " : "";
- capabilities += system_cpu_support_sse41() ? "SSE41 " : "";
- capabilities += system_cpu_support_avx() ? "AVX " : "";
- capabilities += system_cpu_support_avx2() ? "AVX2" : "";
- if (capabilities[capabilities.size() - 1] == ' ')
- capabilities.resize(capabilities.size() - 1);
- return capabilities;
-}
-
-CCL_NAMESPACE_END
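
The removed file's `KernelFunctions` template is the pattern to keep in mind when reading the new
CPU device code: one callable per kernel, bound at construction time to the widest instruction set
the running CPU supports (the replacement appears to live in the new
device/cpu/kernel_function.h). Below is a minimal, generic sketch of that dispatch idea; it is
not Cycles code, and it assumes a GCC/Clang compiler so that `__builtin_cpu_supports()` can stand
in for Cycles' own `system_cpu_support_*()` predicates.

#include <cassert>

/* Illustrative sketch only: select the most specialized kernel variant the
 * running CPU can execute, once, at construction time. */
template<typename F> class DispatchedKernel {
 public:
  DispatchedKernel(F fn_default, F fn_sse41, F fn_avx2)
  {
    /* Prefer the widest supported instruction set, fall back to the default build. */
    if (__builtin_cpu_supports("avx2")) {
      fn = fn_avx2;
    }
    else if (__builtin_cpu_supports("sse4.1")) {
      fn = fn_sse41;
    }
    else {
      fn = fn_default;
    }
  }

  /* Calling the wrapper returns the selected function pointer, mirroring
   * KernelFunctions::operator()(). */
  F operator()() const
  {
    assert(fn);
    return fn;
  }

 private:
  F fn = nullptr;
};

A kernel table built from such wrappers pays the CPU-feature check once per process rather than
once per kernel invocation.
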
diff --git a/intern/cycles/device/device_denoise.cpp b/intern/cycles/device/device_denoise.cpp
new file mode 100644
index 00000000000..aea7868f65d
--- /dev/null
+++ b/intern/cycles/device/device_denoise.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/device_denoise.h"
+
+CCL_NAMESPACE_BEGIN
+
+const char *denoiserTypeToHumanReadable(DenoiserType type)
+{
+ switch (type) {
+ case DENOISER_OPTIX:
+ return "OptiX";
+ case DENOISER_OPENIMAGEDENOISE:
+ return "OpenImageDenoise";
+
+ case DENOISER_NUM:
+ case DENOISER_NONE:
+ case DENOISER_ALL:
+ return "UNKNOWN";
+ }
+
+ return "UNKNOWN";
+}
+
+const NodeEnum *DenoiseParams::get_type_enum()
+{
+ static NodeEnum type_enum;
+
+ if (type_enum.empty()) {
+ type_enum.insert("optix", DENOISER_OPTIX);
+ type_enum.insert("openimageio", DENOISER_OPENIMAGEDENOISE);
+ }
+
+ return &type_enum;
+}
+
+const NodeEnum *DenoiseParams::get_prefilter_enum()
+{
+ static NodeEnum prefilter_enum;
+
+ if (prefilter_enum.empty()) {
+ prefilter_enum.insert("none", DENOISER_PREFILTER_NONE);
+ prefilter_enum.insert("fast", DENOISER_PREFILTER_FAST);
+ prefilter_enum.insert("accurate", DENOISER_PREFILTER_ACCURATE);
+ }
+
+ return &prefilter_enum;
+}
+
+NODE_DEFINE(DenoiseParams)
+{
+ NodeType *type = NodeType::add("denoise_params", create);
+
+ const NodeEnum *type_enum = get_type_enum();
+ const NodeEnum *prefilter_enum = get_prefilter_enum();
+
+ SOCKET_BOOLEAN(use, "Use", false);
+
+ SOCKET_ENUM(type, "Type", *type_enum, DENOISER_OPENIMAGEDENOISE);
+
+ SOCKET_INT(start_sample, "Start Sample", 0);
+
+ SOCKET_BOOLEAN(use_pass_albedo, "Use Pass Albedo", true);
+ SOCKET_BOOLEAN(use_pass_normal, "Use Pass Normal", false);
+
+ SOCKET_ENUM(prefilter, "Prefilter", *prefilter_enum, DENOISER_PREFILTER_FAST);
+
+ return type;
+}
+
+DenoiseParams::DenoiseParams() : Node(get_node_type())
+{
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_denoise.h b/intern/cycles/device/device_denoise.h
new file mode 100644
index 00000000000..dfdc7cc87b3
--- /dev/null
+++ b/intern/cycles/device/device_denoise.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "device/device_memory.h"
+#include "graph/node.h"
+#include "render/buffers.h"
+
+CCL_NAMESPACE_BEGIN
+
+enum DenoiserType {
+ DENOISER_OPTIX = 2,
+ DENOISER_OPENIMAGEDENOISE = 4,
+ DENOISER_NUM,
+
+ DENOISER_NONE = 0,
+ DENOISER_ALL = ~0,
+};
+
+/* Construct a human-readable string which denotes the denoiser type. */
+const char *denoiserTypeToHumanReadable(DenoiserType type);
+
+typedef int DenoiserTypeMask;
+
+enum DenoiserPrefilter {
+ /* Best quality of the result without extra processing time, but requires guiding passes to be
+ * noise-free. */
+ DENOISER_PREFILTER_NONE = 1,
+
+ /* Denoise color and guiding passes together.
+   * Improves quality when guiding passes are noisy while adding minimal extra processing time. */
+ DENOISER_PREFILTER_FAST = 2,
+
+ /* Prefilter noisy guiding passes before denoising color.
+ * Improves quality when guiding passes are noisy using extra processing time. */
+ DENOISER_PREFILTER_ACCURATE = 3,
+
+ DENOISER_PREFILTER_NUM,
+};
+
+/* NOTE: This is not a real scene node. The Node API is used for ease of (de)serialization.
+ * The default values here do not really matter as they are always initialized from the
+ * Integrator node. */
+class DenoiseParams : public Node {
+ public:
+ NODE_DECLARE
+
+ /* Apply denoiser to image. */
+ bool use = false;
+
+ /* Denoiser type. */
+ DenoiserType type = DENOISER_OPENIMAGEDENOISE;
+
+ /* Viewport start sample. */
+ int start_sample = 0;
+
+ /* Auxiliary passes. */
+ bool use_pass_albedo = true;
+ bool use_pass_normal = true;
+
+ DenoiserPrefilter prefilter = DENOISER_PREFILTER_FAST;
+
+ static const NodeEnum *get_type_enum();
+ static const NodeEnum *get_prefilter_enum();
+
+ DenoiseParams();
+
+ bool modified(const DenoiseParams &other) const
+ {
+ return !(use == other.use && type == other.type && start_sample == other.start_sample &&
+ use_pass_albedo == other.use_pass_albedo &&
+ use_pass_normal == other.use_pass_normal && prefilter == other.prefilter);
+ }
+};
+
+/* All the parameters needed to perform buffer denoising on a device.
+ * Not really a task in the canonical sense (it is not an asynchronously running task), but rather
+ * a wrapper for all the arguments and parameters needed to perform denoising, kept in a single
+ * place so that device methods do not need to be modified whenever these parameters change. */
+class DeviceDenoiseTask {
+ public:
+ DenoiseParams params;
+
+ int num_samples;
+
+ RenderBuffers *render_buffers;
+ BufferParams buffer_params;
+
+ /* Allow in-place modification of the input passes (for example, scaling them down). This lowers
+ * the memory footprint of the denoiser, but makes the input passes "invalid" from the path
+ * tracer's point of view. */
+ bool allow_inplace_modification;
+};
+
+CCL_NAMESPACE_END
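
Because DenoiseParams is only a Node for (de)serialization convenience, a typical consumer keeps the last synced parameters and uses modified() to decide whether existing denoiser state must be rebuilt. A short sketch of that pattern (the function name is illustrative, not part of the patch):

    /* Sketch only: report whether denoiser state needs to be recreated. */
    #include "device/device_denoise.h"

    CCL_NAMESPACE_BEGIN

    static bool denoiser_needs_update(const DenoiseParams &current, const DenoiseParams &requested)
    {
      /* modified() compares use, type, start_sample, the auxiliary pass toggles and prefilter. */
      return requested.use && current.modified(requested);
    }

    CCL_NAMESPACE_END
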
diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp
deleted file mode 100644
index 38c42d15cab..00000000000
--- a/intern/cycles/device/device_denoising.cpp
+++ /dev/null
@@ -1,353 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "device/device_denoising.h"
-
-#include "kernel/filter/filter_defines.h"
-
-CCL_NAMESPACE_BEGIN
-
-DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task)
- : tile_info_mem(device, "denoising tile info mem", MEM_READ_WRITE),
- profiler(NULL),
- storage(device),
- buffer(device),
- device(device)
-{
- radius = task.denoising.radius;
- nlm_k_2 = powf(2.0f, lerp(-5.0f, 3.0f, task.denoising.strength));
- if (task.denoising.relative_pca) {
- pca_threshold = -powf(10.0f, lerp(-8.0f, 0.0f, task.denoising.feature_strength));
- }
- else {
- pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising.feature_strength));
- }
-
- render_buffer.frame_stride = task.frame_stride;
- render_buffer.pass_stride = task.pass_stride;
- render_buffer.offset = task.pass_denoising_data;
-
- target_buffer.pass_stride = task.target_pass_stride;
- target_buffer.denoising_clean_offset = task.pass_denoising_clean;
- target_buffer.offset = 0;
-
- functions.map_neighbor_tiles = function_bind(task.map_neighbor_tiles, _1, device);
- functions.unmap_neighbor_tiles = function_bind(task.unmap_neighbor_tiles, _1, device);
-
- tile_info = (TileInfo *)tile_info_mem.alloc(sizeof(TileInfo) / sizeof(int));
- tile_info->from_render = task.denoising_from_render ? 1 : 0;
-
- tile_info->frames[0] = 0;
- tile_info->num_frames = min(task.denoising_frames.size() + 1, DENOISE_MAX_FRAMES);
- for (int i = 1; i < tile_info->num_frames; i++) {
- tile_info->frames[i] = task.denoising_frames[i - 1];
- }
-
- do_prefilter = task.denoising.store_passes && task.denoising.type == DENOISER_NLM;
- do_filter = task.denoising.use && task.denoising.type == DENOISER_NLM;
-}
-
-DenoisingTask::~DenoisingTask()
-{
- storage.XtWX.free();
- storage.XtWY.free();
- storage.transform.free();
- storage.rank.free();
- buffer.mem.free();
- buffer.temporary_mem.free();
- tile_info_mem.free();
-}
-
-void DenoisingTask::set_render_buffer(RenderTileNeighbors &neighbors)
-{
- for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
- RenderTile &rtile = neighbors.tiles[i];
- tile_info->offsets[i] = rtile.offset;
- tile_info->strides[i] = rtile.stride;
- tile_info->buffers[i] = rtile.buffer;
- }
- tile_info->x[0] = neighbors.tiles[3].x;
- tile_info->x[1] = neighbors.tiles[4].x;
- tile_info->x[2] = neighbors.tiles[5].x;
- tile_info->x[3] = neighbors.tiles[5].x + neighbors.tiles[5].w;
- tile_info->y[0] = neighbors.tiles[1].y;
- tile_info->y[1] = neighbors.tiles[4].y;
- tile_info->y[2] = neighbors.tiles[7].y;
- tile_info->y[3] = neighbors.tiles[7].y + neighbors.tiles[7].h;
-
- target_buffer.offset = neighbors.target.offset;
- target_buffer.stride = neighbors.target.stride;
- target_buffer.ptr = neighbors.target.buffer;
-
- if (do_prefilter && neighbors.target.buffers) {
- target_buffer.denoising_output_offset =
- neighbors.target.buffers->params.get_denoising_prefiltered_offset();
- }
- else {
- target_buffer.denoising_output_offset = 0;
- }
-
- tile_info_mem.copy_to_device();
-}
-
-void DenoisingTask::setup_denoising_buffer()
-{
- /* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring
- * tiles */
- rect = rect_from_shape(filter_area.x, filter_area.y, filter_area.z, filter_area.w);
- rect = rect_expand(rect, radius);
- rect = rect_clip(rect,
- make_int4(tile_info->x[0], tile_info->y[0], tile_info->x[3], tile_info->y[3]));
-
- buffer.use_intensity = do_prefilter || (tile_info->num_frames > 1);
- buffer.passes = buffer.use_intensity ? 15 : 14;
- buffer.width = rect.z - rect.x;
- buffer.stride = align_up(buffer.width, 4);
- buffer.h = rect.w - rect.y;
- int alignment_floats = divide_up(device->mem_sub_ptr_alignment(), sizeof(float));
- buffer.pass_stride = align_up(buffer.stride * buffer.h, alignment_floats);
- buffer.frame_stride = buffer.pass_stride * buffer.passes;
- /* Pad the total size by four floats since the SIMD kernels might go a bit over the end. */
- int mem_size = align_up(tile_info->num_frames * buffer.frame_stride + 4, alignment_floats);
- buffer.mem.alloc_to_device(mem_size, false);
- buffer.use_time = (tile_info->num_frames > 1);
-
- /* CPUs process shifts sequentially while GPUs process them in parallel. */
- int num_layers;
- if (buffer.gpu_temporary_mem) {
- /* Shadowing prefiltering uses a radius of 6, so allocate at least that much. */
- int max_radius = max(radius, 6);
- int num_shifts = (2 * max_radius + 1) * (2 * max_radius + 1);
- num_layers = 2 * num_shifts + 1;
- }
- else {
- num_layers = 3;
- }
- /* Allocate two layers per shift as well as one for the weight accumulation. */
- buffer.temporary_mem.alloc_to_device(num_layers * buffer.pass_stride);
-}
-
-void DenoisingTask::prefilter_shadowing()
-{
- device_ptr null_ptr = (device_ptr)0;
-
- device_sub_ptr unfiltered_a(buffer.mem, 0, buffer.pass_stride);
- device_sub_ptr unfiltered_b(buffer.mem, 1 * buffer.pass_stride, buffer.pass_stride);
- device_sub_ptr sample_var(buffer.mem, 2 * buffer.pass_stride, buffer.pass_stride);
- device_sub_ptr sample_var_var(buffer.mem, 3 * buffer.pass_stride, buffer.pass_stride);
- device_sub_ptr buffer_var(buffer.mem, 5 * buffer.pass_stride, buffer.pass_stride);
- device_sub_ptr filtered_var(buffer.mem, 6 * buffer.pass_stride, buffer.pass_stride);
-
- /* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the
- * sample variance and the buffer variance. */
- functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var);
-
- /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the
- * sample variance. */
- nlm_state.set_parameters(6, 3, 4.0f, 1.0f, false);
- functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var);
-
- /* Reuse memory, the previous data isn't needed anymore. */
- device_ptr filtered_a = *buffer_var, filtered_b = *sample_var;
- /* Use the smoothed variance to filter the two shadow half images using each other for weight
- * calculation. */
- nlm_state.set_parameters(5, 3, 1.0f, 0.25f, false);
- functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a);
- functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b);
-
- device_ptr residual_var = *sample_var_var;
- /* Estimate the residual variance between the two filtered halves. */
- functions.combine_halves(filtered_a, filtered_b, null_ptr, residual_var, 2, rect);
-
- device_ptr final_a = *unfiltered_a, final_b = *unfiltered_b;
- /* Use the residual variance for a second filter pass. */
- nlm_state.set_parameters(4, 2, 1.0f, 0.5f, false);
- functions.non_local_means(filtered_a, filtered_b, residual_var, final_a);
- functions.non_local_means(filtered_b, filtered_a, residual_var, final_b);
-
- /* Combine the two double-filtered halves to a final shadow feature. */
- device_sub_ptr shadow_pass(buffer.mem, 4 * buffer.pass_stride, buffer.pass_stride);
- functions.combine_halves(final_a, final_b, *shadow_pass, null_ptr, 0, rect);
-}
-
-void DenoisingTask::prefilter_features()
-{
- device_sub_ptr unfiltered(buffer.mem, 8 * buffer.pass_stride, buffer.pass_stride);
- device_sub_ptr variance(buffer.mem, 9 * buffer.pass_stride, buffer.pass_stride);
-
- int mean_from[] = {0, 1, 2, 12, 6, 7, 8};
- int variance_from[] = {3, 4, 5, 13, 9, 10, 11};
- int pass_to[] = {1, 2, 3, 0, 5, 6, 7};
- for (int pass = 0; pass < 7; pass++) {
- device_sub_ptr feature_pass(
- buffer.mem, pass_to[pass] * buffer.pass_stride, buffer.pass_stride);
- /* Get the unfiltered pass and its variance from the RenderBuffers. */
- functions.get_feature(mean_from[pass],
- variance_from[pass],
- *unfiltered,
- *variance,
- 1.0f / render_buffer.samples);
- /* Smooth the pass and store the result in the denoising buffers. */
- nlm_state.set_parameters(2, 2, 1.0f, 0.25f, false);
- functions.non_local_means(*unfiltered, *unfiltered, *variance, *feature_pass);
- }
-}
-
-void DenoisingTask::prefilter_color()
-{
- int mean_from[] = {20, 21, 22};
- int variance_from[] = {23, 24, 25};
- int mean_to[] = {8, 9, 10};
- int variance_to[] = {11, 12, 13};
- int num_color_passes = 3;
-
- device_only_memory<float> temporary_color(device, "denoising temporary color");
- temporary_color.alloc_to_device(6 * buffer.pass_stride, false);
-
- for (int pass = 0; pass < num_color_passes; pass++) {
- device_sub_ptr color_pass(temporary_color, pass * buffer.pass_stride, buffer.pass_stride);
- device_sub_ptr color_var_pass(
- temporary_color, (pass + 3) * buffer.pass_stride, buffer.pass_stride);
- functions.get_feature(mean_from[pass],
- variance_from[pass],
- *color_pass,
- *color_var_pass,
- 1.0f / render_buffer.samples);
- }
-
- device_sub_ptr depth_pass(buffer.mem, 0, buffer.pass_stride);
- device_sub_ptr color_var_pass(
- buffer.mem, variance_to[0] * buffer.pass_stride, 3 * buffer.pass_stride);
- device_sub_ptr output_pass(buffer.mem, mean_to[0] * buffer.pass_stride, 3 * buffer.pass_stride);
- functions.detect_outliers(
- temporary_color.device_pointer, *color_var_pass, *depth_pass, *output_pass);
-
- if (buffer.use_intensity) {
- device_sub_ptr intensity_pass(buffer.mem, 14 * buffer.pass_stride, buffer.pass_stride);
- nlm_state.set_parameters(radius, 4, 2.0f, nlm_k_2 * 4.0f, true);
- functions.non_local_means(*output_pass, *output_pass, *color_var_pass, *intensity_pass);
- }
-}
-
-void DenoisingTask::load_buffer()
-{
- device_ptr null_ptr = (device_ptr)0;
-
- int original_offset = render_buffer.offset;
-
- int num_passes = buffer.use_intensity ? 15 : 14;
- for (int i = 0; i < tile_info->num_frames; i++) {
- for (int pass = 0; pass < num_passes; pass++) {
- device_sub_ptr to_pass(
- buffer.mem, i * buffer.frame_stride + pass * buffer.pass_stride, buffer.pass_stride);
- bool is_variance = (pass >= 11) && (pass <= 13);
- functions.get_feature(
- pass, -1, *to_pass, null_ptr, is_variance ? (1.0f / render_buffer.samples) : 1.0f);
- }
- render_buffer.offset += render_buffer.frame_stride;
- }
-
- render_buffer.offset = original_offset;
-}
-
-void DenoisingTask::write_buffer()
-{
- reconstruction_state.buffer_params = make_int4(target_buffer.offset,
- target_buffer.stride,
- target_buffer.pass_stride,
- target_buffer.denoising_clean_offset);
- int num_passes = buffer.use_intensity ? 15 : 14;
- for (int pass = 0; pass < num_passes; pass++) {
- device_sub_ptr from_pass(buffer.mem, pass * buffer.pass_stride, buffer.pass_stride);
- int out_offset = pass + target_buffer.denoising_output_offset;
- functions.write_feature(out_offset, *from_pass, target_buffer.ptr);
- }
-}
-
-void DenoisingTask::construct_transform()
-{
- storage.w = filter_area.z;
- storage.h = filter_area.w;
-
- storage.transform.alloc_to_device(storage.w * storage.h * TRANSFORM_SIZE, false);
- storage.rank.alloc_to_device(storage.w * storage.h, false);
-
- functions.construct_transform();
-}
-
-void DenoisingTask::reconstruct()
-{
- storage.XtWX.alloc_to_device(storage.w * storage.h * XTWX_SIZE, false);
- storage.XtWY.alloc_to_device(storage.w * storage.h * XTWY_SIZE, false);
- storage.XtWX.zero_to_device();
- storage.XtWY.zero_to_device();
-
- reconstruction_state.filter_window = rect_from_shape(
- filter_area.x - rect.x, filter_area.y - rect.y, storage.w, storage.h);
- int tile_coordinate_offset = filter_area.y * target_buffer.stride + filter_area.x;
- reconstruction_state.buffer_params = make_int4(target_buffer.offset + tile_coordinate_offset,
- target_buffer.stride,
- target_buffer.pass_stride,
- target_buffer.denoising_clean_offset);
- reconstruction_state.source_w = rect.z - rect.x;
- reconstruction_state.source_h = rect.w - rect.y;
-
- device_sub_ptr color_ptr(buffer.mem, 8 * buffer.pass_stride, 3 * buffer.pass_stride);
- device_sub_ptr color_var_ptr(buffer.mem, 11 * buffer.pass_stride, 3 * buffer.pass_stride);
- for (int f = 0; f < tile_info->num_frames; f++) {
- device_ptr scale_ptr = 0;
- device_sub_ptr *scale_sub_ptr = NULL;
- if (tile_info->frames[f] != 0 && (tile_info->num_frames > 1)) {
- scale_sub_ptr = new device_sub_ptr(buffer.mem, 14 * buffer.pass_stride, buffer.pass_stride);
- scale_ptr = **scale_sub_ptr;
- }
-
- functions.accumulate(*color_ptr, *color_var_ptr, scale_ptr, f);
- delete scale_sub_ptr;
- }
- functions.solve(target_buffer.ptr);
-}
-
-void DenoisingTask::run_denoising(RenderTile &tile)
-{
- RenderTileNeighbors neighbors(tile);
- functions.map_neighbor_tiles(neighbors);
- set_render_buffer(neighbors);
-
- setup_denoising_buffer();
-
- if (tile_info->from_render) {
- prefilter_shadowing();
- prefilter_features();
- prefilter_color();
- }
- else {
- load_buffer();
- }
-
- if (do_filter) {
- construct_transform();
- reconstruct();
- }
-
- if (do_prefilter) {
- write_buffer();
- }
-
- functions.unmap_neighbor_tiles(neighbors);
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h
deleted file mode 100644
index bb8bdfdd225..00000000000
--- a/intern/cycles/device/device_denoising.h
+++ /dev/null
@@ -1,197 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __DEVICE_DENOISING_H__
-#define __DEVICE_DENOISING_H__
-
-#include "device/device.h"
-
-#include "render/buffers.h"
-
-#include "kernel/filter/filter_defines.h"
-
-#include "util/util_profiling.h"
-
-CCL_NAMESPACE_BEGIN
-
-class DenoisingTask {
- public:
- /* Parameters of the denoising algorithm. */
- int radius;
- float nlm_k_2;
- float pca_threshold;
-
- /* Parameters of the RenderBuffers. */
- struct RenderBuffers {
- int offset;
- int pass_stride;
- int frame_stride;
- int samples;
- } render_buffer;
-
- /* Pointer and parameters of the target buffer. */
- struct TargetBuffer {
- int offset;
- int stride;
- int pass_stride;
- int denoising_clean_offset;
- int denoising_output_offset;
- device_ptr ptr;
- } target_buffer;
-
- TileInfo *tile_info;
- device_vector<int> tile_info_mem;
-
- ProfilingState *profiler;
-
- int4 rect;
- int4 filter_area;
-
- bool do_prefilter;
- bool do_filter;
-
- struct DeviceFunctions {
- function<bool(
- device_ptr image_ptr, /* Contains the values that are smoothed. */
- device_ptr guide_ptr, /* Contains the values that are used to calculate weights. */
- device_ptr variance_ptr, /* Contains the variance of the guide image. */
- device_ptr out_ptr /* The filtered output is written into this image. */
- )>
- non_local_means;
- function<bool(
- device_ptr color_ptr, device_ptr color_variance_ptr, device_ptr scale_ptr, int frame)>
- accumulate;
- function<bool(device_ptr output_ptr)> solve;
- function<bool()> construct_transform;
-
- function<bool(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- int r,
- int4 rect)>
- combine_halves;
- function<bool(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr sample_variance_ptr,
- device_ptr sv_variance_ptr,
- device_ptr buffer_variance_ptr)>
- divide_shadow;
- function<bool(int mean_offset,
- int variance_offset,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- float scale)>
- get_feature;
- function<bool(device_ptr image_ptr,
- device_ptr variance_ptr,
- device_ptr depth_ptr,
- device_ptr output_ptr)>
- detect_outliers;
- function<bool(int out_offset, device_ptr frop_ptr, device_ptr buffer_ptr)> write_feature;
- function<void(RenderTileNeighbors &neighbors)> map_neighbor_tiles;
- function<void(RenderTileNeighbors &neighbors)> unmap_neighbor_tiles;
- } functions;
-
- /* Stores state of the current Reconstruction operation,
- * which is accessed by the device in order to perform the operation. */
- struct ReconstructionState {
- int4 filter_window;
- int4 buffer_params;
-
- int source_w;
- int source_h;
- } reconstruction_state;
-
- /* Stores state of the current NLM operation,
- * which is accessed by the device in order to perform the operation. */
- struct NLMState {
- int r; /* Search radius of the filter. */
- int f; /* Patch size of the filter. */
- float a; /* Variance compensation factor in the MSE estimation. */
- float k_2; /* Squared value of the k parameter of the filter. */
- bool is_color;
-
- void set_parameters(int r_, int f_, float a_, float k_2_, bool is_color_)
- {
- r = r_;
- f = f_;
- a = a_, k_2 = k_2_;
- is_color = is_color_;
- }
- } nlm_state;
-
- struct Storage {
- device_only_memory<float> transform;
- device_only_memory<int> rank;
- device_only_memory<float> XtWX;
- device_only_memory<float3> XtWY;
- int w;
- int h;
-
- Storage(Device *device)
- : transform(device, "denoising transform"),
- rank(device, "denoising rank"),
- XtWX(device, "denoising XtWX"),
- XtWY(device, "denoising XtWY")
- {
- }
- } storage;
-
- DenoisingTask(Device *device, const DeviceTask &task);
- ~DenoisingTask();
-
- void run_denoising(RenderTile &tile);
-
- struct DenoiseBuffers {
- int pass_stride;
- int passes;
- int stride;
- int h;
- int width;
- int frame_stride;
- device_only_memory<float> mem;
- device_only_memory<float> temporary_mem;
- bool use_time;
- bool use_intensity;
-
- bool gpu_temporary_mem;
-
- DenoiseBuffers(Device *device)
- : mem(device, "denoising pixel buffer"),
- temporary_mem(device, "denoising temporary mem", true)
- {
- }
- } buffer;
-
- protected:
- Device *device;
-
- void set_render_buffer(RenderTileNeighbors &neighbors);
- void setup_denoising_buffer();
- void prefilter_shadowing();
- void prefilter_features();
- void prefilter_color();
- void construct_transform();
- void reconstruct();
-
- void load_buffer();
- void write_buffer();
-};
-
-CCL_NAMESPACE_END
-
-#endif /* __DEVICE_DENOISING_H__ */
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl b/intern/cycles/device/device_graphics_interop.cpp
index fa210e747c0..a80a236759f 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl
+++ b/intern/cycles/device/device_graphics_interop.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright 2011-2017 Blender Foundation
+ * Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,11 +14,8 @@
* limitations under the License.
*/
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_path_init.h"
+#include "device/device_graphics_interop.h"
-#define KERNEL_NAME path_init
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
+CCL_NAMESPACE_BEGIN
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_graphics_interop.h b/intern/cycles/device/device_graphics_interop.h
new file mode 100644
index 00000000000..671b1c189d7
--- /dev/null
+++ b/intern/cycles/device/device_graphics_interop.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Information about the interoperability destination.
+ * Provided by the GPUDisplay. */
+class DeviceGraphicsInteropDestination {
+ public:
+ /* Dimensions of the buffer, in pixels. */
+ int buffer_width = 0;
+ int buffer_height = 0;
+
+ /* OpenGL pixel buffer object. */
+ int opengl_pbo_id = 0;
+
+ /* Clear the entire destination before doing partial write to it. */
+ bool need_clear = false;
+};
+
+/* Device-side graphics interoperability support.
+ *
+ * Takes care of holding all the handles needed by the device to implement interoperability with
+ * the graphics library. */
+class DeviceGraphicsInterop {
+ public:
+ DeviceGraphicsInterop() = default;
+ virtual ~DeviceGraphicsInterop() = default;
+
+ /* Update this device-side graphics interoperability object with the given destination resource
+ * information. */
+ virtual void set_destination(const DeviceGraphicsInteropDestination &destination) = 0;
+
+ virtual device_ptr map() = 0;
+ virtual void unmap() = 0;
+};
+
+CCL_NAMESPACE_END
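
DeviceGraphicsInterop is a pure interface: a backend receives the destination pixel buffer object through set_destination() and exposes the mapped device pointer between map() and unmap(). The CUDA implementation added in device/cuda/graphics_interop.h follows this contract; below is only a skeletal, hypothetical subclass showing the shape of such a backend, not a functional one:

    /* Sketch only: the minimal shape of a graphics interop backend. */
    #include "device/device_graphics_interop.h"

    CCL_NAMESPACE_BEGIN

    class StubGraphicsInterop : public DeviceGraphicsInterop {
     public:
      void set_destination(const DeviceGraphicsInteropDestination &destination) override
      {
        /* A real backend re-registers the PBO with the graphics API whenever
         * opengl_pbo_id or the buffer size changes. */
        destination_ = destination;
      }

      device_ptr map() override
      {
        /* A real backend maps the registered resource and returns its device address. */
        return 0;
      }

      void unmap() override
      {
      }

     private:
      DeviceGraphicsInteropDestination destination_;
    };

    CCL_NAMESPACE_END
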
diff --git a/intern/cycles/device/device_intern.h b/intern/cycles/device/device_intern.h
deleted file mode 100644
index ecc79c5d7ee..00000000000
--- a/intern/cycles/device/device_intern.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __DEVICE_INTERN_H__
-#define __DEVICE_INTERN_H__
-
-#include "util/util_string.h"
-#include "util/util_vector.h"
-
-CCL_NAMESPACE_BEGIN
-
-class Device;
-class DeviceInfo;
-class Profiler;
-class Stats;
-
-Device *device_cpu_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
-bool device_opencl_init();
-Device *device_opencl_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
-bool device_opencl_compile_kernel(const vector<string> &parameters);
-bool device_cuda_init();
-Device *device_cuda_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
-bool device_optix_init();
-Device *device_optix_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
-Device *device_dummy_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
-
-Device *device_network_create(DeviceInfo &info,
- Stats &stats,
- Profiler &profiler,
- const char *address);
-Device *device_multi_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
-
-void device_cpu_info(vector<DeviceInfo> &devices);
-void device_opencl_info(vector<DeviceInfo> &devices);
-void device_cuda_info(vector<DeviceInfo> &devices);
-void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices);
-void device_network_info(vector<DeviceInfo> &devices);
-
-string device_cpu_capabilities();
-string device_opencl_capabilities();
-string device_cuda_capabilities();
-
-CCL_NAMESPACE_END
-
-#endif /* __DEVICE_INTERN_H__ */
diff --git a/intern/cycles/device/device_kernel.cpp b/intern/cycles/device/device_kernel.cpp
new file mode 100644
index 00000000000..ceaddee4756
--- /dev/null
+++ b/intern/cycles/device/device_kernel.cpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/device_kernel.h"
+
+#include "util/util_logging.h"
+
+CCL_NAMESPACE_BEGIN
+
+const char *device_kernel_as_string(DeviceKernel kernel)
+{
+ switch (kernel) {
+ /* Integrator. */
+ case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA:
+ return "integrator_init_from_camera";
+ case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE:
+ return "integrator_init_from_bake";
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
+ return "integrator_intersect_closest";
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
+ return "integrator_intersect_shadow";
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
+ return "integrator_intersect_subsurface";
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK:
+ return "integrator_intersect_volume_stack";
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
+ return "integrator_shade_background";
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
+ return "integrator_shade_light";
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
+ return "integrator_shade_shadow";
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
+ return "integrator_shade_surface";
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
+ return "integrator_shade_surface_raytrace";
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME:
+ return "integrator_shade_volume";
+ case DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL:
+ return "integrator_megakernel";
+ case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
+ return "integrator_queued_paths_array";
+ case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
+ return "integrator_queued_shadow_paths_array";
+ case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
+ return "integrator_active_paths_array";
+ case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
+ return "integrator_terminated_paths_array";
+ case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
+ return "integrator_sorted_paths_array";
+ case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
+ return "integrator_compact_paths_array";
+ case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES:
+ return "integrator_compact_states";
+ case DEVICE_KERNEL_INTEGRATOR_RESET:
+ return "integrator_reset";
+ case DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS:
+ return "integrator_shadow_catcher_count_possible_splits";
+
+ /* Shader evaluation. */
+ case DEVICE_KERNEL_SHADER_EVAL_DISPLACE:
+ return "shader_eval_displace";
+ case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND:
+ return "shader_eval_background";
+
+ /* Film. */
+
+#define FILM_CONVERT_KERNEL_AS_STRING(variant, variant_lowercase) \
+ case DEVICE_KERNEL_FILM_CONVERT_##variant: \
+ return "film_convert_" #variant_lowercase; \
+ case DEVICE_KERNEL_FILM_CONVERT_##variant##_HALF_RGBA: \
+ return "film_convert_" #variant_lowercase "_half_rgba";
+
+ FILM_CONVERT_KERNEL_AS_STRING(DEPTH, depth)
+ FILM_CONVERT_KERNEL_AS_STRING(MIST, mist)
+ FILM_CONVERT_KERNEL_AS_STRING(SAMPLE_COUNT, sample_count)
+ FILM_CONVERT_KERNEL_AS_STRING(FLOAT, float)
+ FILM_CONVERT_KERNEL_AS_STRING(LIGHT_PATH, light_path)
+ FILM_CONVERT_KERNEL_AS_STRING(FLOAT3, float3)
+ FILM_CONVERT_KERNEL_AS_STRING(MOTION, motion)
+ FILM_CONVERT_KERNEL_AS_STRING(CRYPTOMATTE, cryptomatte)
+ FILM_CONVERT_KERNEL_AS_STRING(SHADOW_CATCHER, shadow_catcher)
+ FILM_CONVERT_KERNEL_AS_STRING(SHADOW_CATCHER_MATTE_WITH_SHADOW,
+ shadow_catcher_matte_with_shadow)
+ FILM_CONVERT_KERNEL_AS_STRING(COMBINED, combined)
+ FILM_CONVERT_KERNEL_AS_STRING(FLOAT4, float4)
+
+#undef FILM_CONVERT_KERNEL_AS_STRING
+
+ /* Adaptive sampling. */
+ case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK:
+ return "adaptive_sampling_convergence_check";
+ case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X:
+ return "adaptive_sampling_filter_x";
+ case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y:
+ return "adaptive_sampling_filter_y";
+
+ /* Denoising. */
+ case DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS:
+ return "filter_guiding_preprocess";
+ case DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO:
+ return "filter_guiding_set_fake_albedo";
+ case DEVICE_KERNEL_FILTER_COLOR_PREPROCESS:
+ return "filter_color_preprocess";
+ case DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS:
+ return "filter_color_postprocess";
+
+ /* Cryptomatte. */
+ case DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS:
+ return "cryptomatte_postprocess";
+
+ /* Generic */
+ case DEVICE_KERNEL_PREFIX_SUM:
+ return "prefix_sum";
+
+ case DEVICE_KERNEL_NUM:
+ break;
+ };
+ LOG(FATAL) << "Unhandled kernel " << static_cast<int>(kernel) << ", should never happen.";
+ return "UNKNOWN";
+}
+
+std::ostream &operator<<(std::ostream &os, DeviceKernel kernel)
+{
+ os << device_kernel_as_string(kernel);
+ return os;
+}
+
+string device_kernel_mask_as_string(DeviceKernelMask mask)
+{
+ string str;
+
+ for (uint64_t i = 0; i < sizeof(DeviceKernelMask) * 8; i++) {
+ if (mask & (uint64_t(1) << i)) {
+ if (!str.empty()) {
+ str += " ";
+ }
+ str += device_kernel_as_string((DeviceKernel)i);
+ }
+ }
+
+ return str;
+}
+
+CCL_NAMESPACE_END
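
DeviceKernelMask is a plain bitset over DeviceKernel values, so a mask is built by OR-ing shifted bits, and device_kernel_mask_as_string() turns it back into a space-separated list of kernel names. A small usage sketch (the function name is illustrative; the logging call mirrors how Cycles uses VLOG elsewhere):

    /* Sketch only: build a kernel mask and print it with the helpers above. */
    #include "device/device_kernel.h"

    #include "util/util_logging.h"

    CCL_NAMESPACE_BEGIN

    static void log_requested_kernels()
    {
      DeviceKernelMask mask = 0;
      mask |= uint64_t(1) << DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA;
      mask |= uint64_t(1) << DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE;

      /* Prints "integrator_init_from_camera integrator_shade_surface". */
      VLOG(1) << "Requested kernels: " << device_kernel_mask_as_string(mask);
    }

    CCL_NAMESPACE_END
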
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl b/intern/cycles/device/device_kernel.h
index 9e1e57beba6..83d959ca87b 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
+++ b/intern/cycles/device/device_kernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2011-2015 Blender Foundation
+ * Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,13 +14,20 @@
* limitations under the License.
*/
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
+#pragma once
-#define KERNEL_NAME holdout_emission_blurring_pathtermination_ao
-#define LOCALS_TYPE BackgroundAOLocals
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-#undef LOCALS_TYPE
+#include "kernel/kernel_types.h"
+#include "util/util_string.h"
+
+#include <ostream> // NOLINT
+
+CCL_NAMESPACE_BEGIN
+
+const char *device_kernel_as_string(DeviceKernel kernel);
+std::ostream &operator<<(std::ostream &os, DeviceKernel kernel);
+
+typedef uint64_t DeviceKernelMask;
+string device_kernel_mask_as_string(DeviceKernelMask mask);
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_memory.cpp b/intern/cycles/device/device_memory.cpp
index 80a05fc32fe..c4d45829b83 100644
--- a/intern/cycles/device/device_memory.cpp
+++ b/intern/cycles/device/device_memory.cpp
@@ -23,7 +23,7 @@ CCL_NAMESPACE_BEGIN
device_memory::device_memory(Device *device, const char *name, MemoryType type)
: data_type(device_type_traits<uchar>::data_type),
- data_elements(device_type_traits<uchar>::num_elements),
+ data_elements(device_type_traits<uchar>::num_elements_cpu),
data_size(0),
device_size(0),
data_width(0),
@@ -149,6 +149,11 @@ void device_memory::device_zero()
}
}
+bool device_memory::device_is_cpu()
+{
+ return (device->info.type == DEVICE_CPU);
+}
+
void device_memory::swap_device(Device *new_device,
size_t new_device_size,
device_ptr new_device_ptr)
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index 80f4d7b0468..c51594b8580 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -38,7 +38,6 @@ enum MemoryType {
MEM_DEVICE_ONLY,
MEM_GLOBAL,
MEM_TEXTURE,
- MEM_PIXELS
};
/* Supported Data Types */
@@ -54,7 +53,7 @@ enum DataType {
TYPE_UINT64,
};
-static inline size_t datatype_size(DataType datatype)
+static constexpr size_t datatype_size(DataType datatype)
{
switch (datatype) {
case TYPE_UNKNOWN:
@@ -82,112 +81,155 @@ static inline size_t datatype_size(DataType datatype)
template<typename T> struct device_type_traits {
static const DataType data_type = TYPE_UNKNOWN;
- static const int num_elements = sizeof(T);
+ static const int num_elements_cpu = sizeof(T);
+ static const int num_elements_gpu = sizeof(T);
};
template<> struct device_type_traits<uchar> {
static const DataType data_type = TYPE_UCHAR;
- static const int num_elements = 1;
+ static const int num_elements_cpu = 1;
+ static const int num_elements_gpu = 1;
+ static_assert(sizeof(uchar) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uchar2> {
static const DataType data_type = TYPE_UCHAR;
- static const int num_elements = 2;
+ static const int num_elements_cpu = 2;
+ static const int num_elements_gpu = 2;
+ static_assert(sizeof(uchar2) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uchar3> {
static const DataType data_type = TYPE_UCHAR;
- static const int num_elements = 3;
+ static const int num_elements_cpu = 3;
+ static const int num_elements_gpu = 3;
+ static_assert(sizeof(uchar3) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uchar4> {
static const DataType data_type = TYPE_UCHAR;
- static const int num_elements = 4;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 4;
+ static_assert(sizeof(uchar4) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uint> {
static const DataType data_type = TYPE_UINT;
- static const int num_elements = 1;
+ static const int num_elements_cpu = 1;
+ static const int num_elements_gpu = 1;
+ static_assert(sizeof(uint) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uint2> {
static const DataType data_type = TYPE_UINT;
- static const int num_elements = 2;
+ static const int num_elements_cpu = 2;
+ static const int num_elements_gpu = 2;
+ static_assert(sizeof(uint2) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uint3> {
static const DataType data_type = TYPE_UINT;
- static const int num_elements = 3;
+ static const int num_elements_cpu = 3;
+ static const int num_elements_gpu = 3;
+ static_assert(sizeof(uint3) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uint4> {
static const DataType data_type = TYPE_UINT;
- static const int num_elements = 4;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 4;
+ static_assert(sizeof(uint4) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<int> {
static const DataType data_type = TYPE_INT;
- static const int num_elements = 1;
+ static const int num_elements_cpu = 1;
+ static const int num_elements_gpu = 1;
+ static_assert(sizeof(int) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<int2> {
static const DataType data_type = TYPE_INT;
- static const int num_elements = 2;
+ static const int num_elements_cpu = 2;
+ static const int num_elements_gpu = 2;
+ static_assert(sizeof(int2) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<int3> {
static const DataType data_type = TYPE_INT;
- static const int num_elements = 3;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 3;
+ static_assert(sizeof(int3) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<int4> {
static const DataType data_type = TYPE_INT;
- static const int num_elements = 4;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 4;
+ static_assert(sizeof(int4) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<float> {
static const DataType data_type = TYPE_FLOAT;
- static const int num_elements = 1;
+ static const int num_elements_cpu = 1;
+ static const int num_elements_gpu = 1;
+ static_assert(sizeof(float) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<float2> {
static const DataType data_type = TYPE_FLOAT;
- static const int num_elements = 2;
+ static const int num_elements_cpu = 2;
+ static const int num_elements_gpu = 2;
+ static_assert(sizeof(float2) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<float3> {
static const DataType data_type = TYPE_FLOAT;
- static const int num_elements = 4;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 3;
+ static_assert(sizeof(float3) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<float4> {
static const DataType data_type = TYPE_FLOAT;
- static const int num_elements = 4;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 4;
+ static_assert(sizeof(float4) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<half> {
static const DataType data_type = TYPE_HALF;
- static const int num_elements = 1;
+ static const int num_elements_cpu = 1;
+ static const int num_elements_gpu = 1;
+ static_assert(sizeof(half) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<ushort4> {
static const DataType data_type = TYPE_UINT16;
- static const int num_elements = 4;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 4;
+ static_assert(sizeof(ushort4) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uint16_t> {
static const DataType data_type = TYPE_UINT16;
- static const int num_elements = 1;
+ static const int num_elements_cpu = 1;
+ static const int num_elements_gpu = 1;
+ static_assert(sizeof(uint16_t) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<half4> {
static const DataType data_type = TYPE_HALF;
- static const int num_elements = 4;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 4;
+ static_assert(sizeof(half4) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uint64_t> {
static const DataType data_type = TYPE_UINT64;
- static const int num_elements = 1;
+ static const int num_elements_cpu = 1;
+ static const int num_elements_gpu = 1;
+ static_assert(sizeof(uint64_t) == num_elements_cpu * datatype_size(data_type));
};
/* Device Memory
@@ -257,6 +299,8 @@ class device_memory {
void device_copy_from(int y, int w, int h, int elem);
void device_zero();
+ bool device_is_cpu();
+
device_ptr original_device_ptr;
size_t original_device_size;
Device *original_device;
@@ -275,7 +319,9 @@ template<typename T> class device_only_memory : public device_memory {
: device_memory(device, name, allow_host_memory_fallback ? MEM_READ_WRITE : MEM_DEVICE_ONLY)
{
data_type = device_type_traits<T>::data_type;
- data_elements = max(device_type_traits<T>::num_elements, 1);
+ data_elements = max(device_is_cpu() ? device_type_traits<T>::num_elements_cpu :
+ device_type_traits<T>::num_elements_gpu,
+ 1);
}
device_only_memory(device_only_memory &&other) noexcept : device_memory(std::move(other))
@@ -331,11 +377,15 @@ template<typename T> class device_only_memory : public device_memory {
template<typename T> class device_vector : public device_memory {
public:
+ /* Can only use this for types that have the same size on CPU and GPU. */
+ static_assert(device_type_traits<T>::num_elements_cpu ==
+ device_type_traits<T>::num_elements_gpu);
+
device_vector(Device *device, const char *name, MemoryType type)
: device_memory(device, name, type)
{
data_type = device_type_traits<T>::data_type;
- data_elements = device_type_traits<T>::num_elements;
+ data_elements = device_type_traits<T>::num_elements_cpu;
modified = true;
need_realloc_ = true;
@@ -477,6 +527,11 @@ template<typename T> class device_vector : public device_memory {
return (T *)host_pointer;
}
+ const T *data() const
+ {
+ return (T *)host_pointer;
+ }
+
T &operator[](size_t i)
{
assert(i < data_size);
@@ -507,7 +562,7 @@ template<typename T> class device_vector : public device_memory {
void copy_from_device()
{
- device_copy_from(0, data_width, data_height, sizeof(T));
+ device_copy_from(0, data_width, (data_height == 0) ? 1 : data_height, sizeof(T));
}
void copy_from_device(int y, int w, int h)
@@ -535,33 +590,6 @@ template<typename T> class device_vector : public device_memory {
}
};
-/* Pixel Memory
- *
- * Device memory to efficiently draw as pixels to the screen in interactive
- * rendering. Only copying pixels from the device is supported, not copying to. */
-
-template<typename T> class device_pixels : public device_vector<T> {
- public:
- device_pixels(Device *device, const char *name) : device_vector<T>(device, name, MEM_PIXELS)
- {
- }
-
- void alloc_to_device(size_t width, size_t height, size_t depth = 0)
- {
- device_vector<T>::alloc(width, height, depth);
-
- if (!device_memory::device_pointer) {
- device_memory::device_alloc();
- }
- }
-
- T *copy_from_device(int y, int w, int h)
- {
- device_memory::device_copy_from(y, w, h, sizeof(T));
- return device_vector<T>::data();
- }
-};
-
/* Device Sub Memory
*
* Pointer into existing memory. It is not allocated separately, but created
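
The num_elements_cpu / num_elements_gpu split exists because some packed vector types differ between backends: the CPU-side float3 and int3 are padded to four elements for SIMD alignment, while GPU kernels can use tightly packed three-element layouts. device_vector statically rejects types whose element count differs between the two, and device_only_memory picks the count matching the actual device. A small illustration of the CPU-side sizes this relies on (assuming the usual Cycles definitions in util/util_types.h):

    /* Sketch only: CPU float3 is padded to four floats, matching num_elements_cpu = 4. */
    #include "util/util_types.h"

    CCL_NAMESPACE_BEGIN

    static_assert(sizeof(float3) == 4 * sizeof(float), "CPU float3 is padded to 16 bytes");
    static_assert(sizeof(float4) == 4 * sizeof(float), "float4 is four floats on all backends");

    CCL_NAMESPACE_END
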
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
deleted file mode 100644
index 85ffa5fcd52..00000000000
--- a/intern/cycles/device/device_multi.cpp
+++ /dev/null
@@ -1,826 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <sstream>
-#include <stdlib.h>
-
-#include "bvh/bvh_multi.h"
-
-#include "device/device.h"
-#include "device/device_intern.h"
-#include "device/device_network.h"
-
-#include "render/buffers.h"
-#include "render/geometry.h"
-
-#include "util/util_foreach.h"
-#include "util/util_list.h"
-#include "util/util_logging.h"
-#include "util/util_map.h"
-#include "util/util_time.h"
-
-CCL_NAMESPACE_BEGIN
-
-class MultiDevice : public Device {
- public:
- struct SubDevice {
- Stats stats;
- Device *device;
- map<device_ptr, device_ptr> ptr_map;
- int peer_island_index = -1;
- };
-
- list<SubDevice> devices, denoising_devices;
- device_ptr unique_key;
- vector<vector<SubDevice *>> peer_islands;
- bool use_denoising;
- bool matching_rendering_and_denoising_devices;
-
- MultiDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_)
- : Device(info, stats, profiler, background_),
- unique_key(1),
- use_denoising(!info.denoising_devices.empty())
- {
- foreach (DeviceInfo &subinfo, info.multi_devices) {
- /* Always add CPU devices at the back since GPU devices can change
- * host memory pointers, which CPU uses as device pointer. */
- SubDevice *sub;
- if (subinfo.type == DEVICE_CPU) {
- devices.emplace_back();
- sub = &devices.back();
- }
- else {
- devices.emplace_front();
- sub = &devices.front();
- }
-
- /* The pointer to 'sub->stats' will stay valid even after new devices
- * are added, since 'devices' is a linked list. */
- sub->device = Device::create(subinfo, sub->stats, profiler, background);
- }
-
- foreach (DeviceInfo &subinfo, info.denoising_devices) {
- denoising_devices.emplace_front();
- SubDevice *sub = &denoising_devices.front();
-
- sub->device = Device::create(subinfo, sub->stats, profiler, background);
- }
-
- /* Build a list of peer islands for the available render devices */
- foreach (SubDevice &sub, devices) {
- /* First ensure that every device is in at least once peer island */
- if (sub.peer_island_index < 0) {
- peer_islands.emplace_back();
- sub.peer_island_index = (int)peer_islands.size() - 1;
- peer_islands[sub.peer_island_index].push_back(&sub);
- }
-
- if (!info.has_peer_memory) {
- continue;
- }
-
- /* Second check peer access between devices and fill up the islands accordingly */
- foreach (SubDevice &peer_sub, devices) {
- if (peer_sub.peer_island_index < 0 &&
- peer_sub.device->info.type == sub.device->info.type &&
- peer_sub.device->check_peer_access(sub.device)) {
- peer_sub.peer_island_index = sub.peer_island_index;
- peer_islands[sub.peer_island_index].push_back(&peer_sub);
- }
- }
- }
-
- /* Try to re-use memory when denoising and render devices use the same physical devices
- * (e.g. OptiX denoising and CUDA rendering device pointing to the same GPU).
- * Ordering has to match as well, so that 'DeviceTask::split' behaves consistent. */
- matching_rendering_and_denoising_devices = denoising_devices.empty() ||
- (devices.size() == denoising_devices.size());
- if (matching_rendering_and_denoising_devices) {
- for (list<SubDevice>::iterator device_it = devices.begin(),
- denoising_device_it = denoising_devices.begin();
- device_it != devices.end() && denoising_device_it != denoising_devices.end();
- ++device_it, ++denoising_device_it) {
- const DeviceInfo &info = device_it->device->info;
- const DeviceInfo &denoising_info = denoising_device_it->device->info;
- if ((info.type != DEVICE_CUDA && info.type != DEVICE_OPTIX) ||
- (denoising_info.type != DEVICE_CUDA && denoising_info.type != DEVICE_OPTIX) ||
- info.num != denoising_info.num) {
- matching_rendering_and_denoising_devices = false;
- break;
- }
- }
- }
-
-#ifdef WITH_NETWORK
- /* try to add network devices */
- ServerDiscovery discovery(true);
- time_sleep(1.0);
-
- vector<string> servers = discovery.get_server_list();
-
- foreach (string &server, servers) {
- Device *device = device_network_create(info, stats, profiler, server.c_str());
- if (device)
- devices.push_back(SubDevice(device));
- }
-#endif
- }
-
- ~MultiDevice()
- {
- foreach (SubDevice &sub, devices)
- delete sub.device;
- foreach (SubDevice &sub, denoising_devices)
- delete sub.device;
- }
-
- const string &error_message() override
- {
- error_msg.clear();
-
- foreach (SubDevice &sub, devices)
- error_msg += sub.device->error_message();
- foreach (SubDevice &sub, denoising_devices)
- error_msg += sub.device->error_message();
-
- return error_msg;
- }
-
- virtual bool show_samples() const override
- {
- if (devices.size() > 1) {
- return false;
- }
- return devices.front().device->show_samples();
- }
-
- virtual BVHLayoutMask get_bvh_layout_mask() const override
- {
- BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_ALL;
- BVHLayoutMask bvh_layout_mask_all = BVH_LAYOUT_NONE;
- foreach (const SubDevice &sub_device, devices) {
- BVHLayoutMask device_bvh_layout_mask = sub_device.device->get_bvh_layout_mask();
- bvh_layout_mask &= device_bvh_layout_mask;
- bvh_layout_mask_all |= device_bvh_layout_mask;
- }
-
- /* With multiple OptiX devices, every device needs its own acceleration structure */
- if (bvh_layout_mask == BVH_LAYOUT_OPTIX) {
- return BVH_LAYOUT_MULTI_OPTIX;
- }
-
- /* When devices do not share a common BVH layout, fall back to creating one for each */
- const BVHLayoutMask BVH_LAYOUT_OPTIX_EMBREE = (BVH_LAYOUT_OPTIX | BVH_LAYOUT_EMBREE);
- if ((bvh_layout_mask_all & BVH_LAYOUT_OPTIX_EMBREE) == BVH_LAYOUT_OPTIX_EMBREE) {
- return BVH_LAYOUT_MULTI_OPTIX_EMBREE;
- }
-
- return bvh_layout_mask;
- }
-
- bool load_kernels(const DeviceRequestedFeatures &requested_features) override
- {
- foreach (SubDevice &sub, devices)
- if (!sub.device->load_kernels(requested_features))
- return false;
-
- use_denoising = requested_features.use_denoising;
- if (requested_features.use_denoising) {
- /* Only need denoising feature, everything else is unused. */
- DeviceRequestedFeatures denoising_features;
- denoising_features.use_denoising = true;
- foreach (SubDevice &sub, denoising_devices)
- if (!sub.device->load_kernels(denoising_features))
- return false;
- }
-
- return true;
- }
-
- bool wait_for_availability(const DeviceRequestedFeatures &requested_features) override
- {
- foreach (SubDevice &sub, devices)
- if (!sub.device->wait_for_availability(requested_features))
- return false;
-
- if (requested_features.use_denoising) {
- foreach (SubDevice &sub, denoising_devices)
- if (!sub.device->wait_for_availability(requested_features))
- return false;
- }
-
- return true;
- }
-
- DeviceKernelStatus get_active_kernel_switch_state() override
- {
- DeviceKernelStatus result = DEVICE_KERNEL_USING_FEATURE_KERNEL;
-
- foreach (SubDevice &sub, devices) {
- DeviceKernelStatus subresult = sub.device->get_active_kernel_switch_state();
- switch (subresult) {
- case DEVICE_KERNEL_FEATURE_KERNEL_INVALID:
- case DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE:
- return subresult;
-
- case DEVICE_KERNEL_USING_FEATURE_KERNEL:
- case DEVICE_KERNEL_UNKNOWN:
- break;
- }
- }
-
- return result;
- }
-
- void build_bvh(BVH *bvh, Progress &progress, bool refit) override
- {
- /* Try to build and share a single acceleration structure, if possible */
- if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2 || bvh->params.bvh_layout == BVH_LAYOUT_EMBREE) {
- devices.back().device->build_bvh(bvh, progress, refit);
- return;
- }
-
- assert(bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX ||
- bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE);
-
- BVHMulti *const bvh_multi = static_cast<BVHMulti *>(bvh);
- bvh_multi->sub_bvhs.resize(devices.size());
-
- vector<BVHMulti *> geom_bvhs;
- geom_bvhs.reserve(bvh->geometry.size());
- foreach (Geometry *geom, bvh->geometry) {
- geom_bvhs.push_back(static_cast<BVHMulti *>(geom->bvh));
- }
-
- /* Broadcast acceleration structure build to all render devices */
- size_t i = 0;
- foreach (SubDevice &sub, devices) {
- /* Change geometry BVH pointers to the sub BVH */
- for (size_t k = 0; k < bvh->geometry.size(); ++k) {
- bvh->geometry[k]->bvh = geom_bvhs[k]->sub_bvhs[i];
- }
-
- if (!bvh_multi->sub_bvhs[i]) {
- BVHParams params = bvh->params;
- if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX)
- params.bvh_layout = BVH_LAYOUT_OPTIX;
- else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE)
- params.bvh_layout = sub.device->info.type == DEVICE_OPTIX ? BVH_LAYOUT_OPTIX :
- BVH_LAYOUT_EMBREE;
-
- /* Skip building a bottom level acceleration structure for non-instanced geometry on Embree
- * (since they are put into the top level directly, see bvh_embree.cpp) */
- if (!params.top_level && params.bvh_layout == BVH_LAYOUT_EMBREE &&
- !bvh->geometry[0]->is_instanced()) {
- i++;
- continue;
- }
-
- bvh_multi->sub_bvhs[i] = BVH::create(params, bvh->geometry, bvh->objects, sub.device);
- }
-
- sub.device->build_bvh(bvh_multi->sub_bvhs[i], progress, refit);
- i++;
- }
-
- /* Change geometry BVH pointers back to the multi BVH. */
- for (size_t k = 0; k < bvh->geometry.size(); ++k) {
- bvh->geometry[k]->bvh = geom_bvhs[k];
- }
- }
-
- virtual void *osl_memory() override
- {
- if (devices.size() > 1) {
- return NULL;
- }
- return devices.front().device->osl_memory();
- }
-
- bool is_resident(device_ptr key, Device *sub_device) override
- {
- foreach (SubDevice &sub, devices) {
- if (sub.device == sub_device) {
- return find_matching_mem_device(key, sub)->device == sub_device;
- }
- }
- return false;
- }
-
- SubDevice *find_matching_mem_device(device_ptr key, SubDevice &sub)
- {
- assert(key != 0 && (sub.peer_island_index >= 0 || sub.ptr_map.find(key) != sub.ptr_map.end()));
-
- /* Get the memory owner of this key (first try current device, then peer devices) */
- SubDevice *owner_sub = &sub;
- if (owner_sub->ptr_map.find(key) == owner_sub->ptr_map.end()) {
- foreach (SubDevice *island_sub, peer_islands[sub.peer_island_index]) {
- if (island_sub != owner_sub &&
- island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) {
- owner_sub = island_sub;
- }
- }
- }
- return owner_sub;
- }
-
- SubDevice *find_suitable_mem_device(device_ptr key, const vector<SubDevice *> &island)
- {
- assert(!island.empty());
-
- /* Get the memory owner of this key or the device with the lowest memory usage when new */
- SubDevice *owner_sub = island.front();
- foreach (SubDevice *island_sub, island) {
- if (key ? (island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) :
- (island_sub->device->stats.mem_used < owner_sub->device->stats.mem_used)) {
- owner_sub = island_sub;
- }
- }
- return owner_sub;
- }
-
- inline device_ptr find_matching_mem(device_ptr key, SubDevice &sub)
- {
- return find_matching_mem_device(key, sub)->ptr_map[key];
- }
-
- void mem_alloc(device_memory &mem) override
- {
- device_ptr key = unique_key++;
-
- if (mem.type == MEM_PIXELS) {
- /* Always allocate pixels memory on all devices
- * This is necessary to ensure PBOs are registered everywhere, which FILM_CONVERT uses */
- foreach (SubDevice &sub, devices) {
- mem.device = sub.device;
- mem.device_pointer = 0;
- mem.device_size = 0;
-
- sub.device->mem_alloc(mem);
- sub.ptr_map[key] = mem.device_pointer;
- }
- }
- else {
- assert(mem.type == MEM_READ_ONLY || mem.type == MEM_READ_WRITE ||
- mem.type == MEM_DEVICE_ONLY);
- /* The remaining memory types can be distributed across devices */
- foreach (const vector<SubDevice *> &island, peer_islands) {
- SubDevice *owner_sub = find_suitable_mem_device(key, island);
- mem.device = owner_sub->device;
- mem.device_pointer = 0;
- mem.device_size = 0;
-
- owner_sub->device->mem_alloc(mem);
- owner_sub->ptr_map[key] = mem.device_pointer;
- }
- }
-
- mem.device = this;
- mem.device_pointer = key;
- stats.mem_alloc(mem.device_size);
- }
-
- void mem_copy_to(device_memory &mem) override
- {
- device_ptr existing_key = mem.device_pointer;
- device_ptr key = (existing_key) ? existing_key : unique_key++;
- size_t existing_size = mem.device_size;
-
- /* The tile buffers are allocated on each device (see below), so copy to all of them */
- if (strcmp(mem.name, "RenderBuffers") == 0 && use_denoising) {
- foreach (SubDevice &sub, devices) {
- mem.device = sub.device;
- mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
- mem.device_size = existing_size;
-
- sub.device->mem_copy_to(mem);
- sub.ptr_map[key] = mem.device_pointer;
- }
- }
- else {
- foreach (const vector<SubDevice *> &island, peer_islands) {
- SubDevice *owner_sub = find_suitable_mem_device(existing_key, island);
- mem.device = owner_sub->device;
- mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0;
- mem.device_size = existing_size;
-
- owner_sub->device->mem_copy_to(mem);
- owner_sub->ptr_map[key] = mem.device_pointer;
-
- if (mem.type == MEM_GLOBAL || mem.type == MEM_TEXTURE) {
- /* Need to create texture objects and update pointer in kernel globals on all devices */
- foreach (SubDevice *island_sub, island) {
- if (island_sub != owner_sub) {
- island_sub->device->mem_copy_to(mem);
- }
- }
- }
- }
- }
-
- mem.device = this;
- mem.device_pointer = key;
- stats.mem_alloc(mem.device_size - existing_size);
- }
-
- void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override
- {
- device_ptr key = mem.device_pointer;
- int i = 0, sub_h = h / devices.size();
-
- foreach (SubDevice &sub, devices) {
- int sy = y + i * sub_h;
- int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h;
-
- SubDevice *owner_sub = find_matching_mem_device(key, sub);
- mem.device = owner_sub->device;
- mem.device_pointer = owner_sub->ptr_map[key];
-
- owner_sub->device->mem_copy_from(mem, sy, w, sh, elem);
- i++;
- }
-
- mem.device = this;
- mem.device_pointer = key;
- }
-
- void mem_zero(device_memory &mem) override
- {
- device_ptr existing_key = mem.device_pointer;
- device_ptr key = (existing_key) ? existing_key : unique_key++;
- size_t existing_size = mem.device_size;
-
- /* This is a hack to only allocate the tile buffers on denoising devices.
- * Similarly, the tile buffers also need to be allocated separately on all devices so that
- * any overlap rendered for denoising does not interfere between devices. */
- if (strcmp(mem.name, "RenderBuffers") == 0 && use_denoising) {
- vector<device_ptr> device_pointers;
- device_pointers.reserve(devices.size());
-
- foreach (SubDevice &sub, devices) {
- mem.device = sub.device;
- mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
- mem.device_size = existing_size;
-
- sub.device->mem_zero(mem);
- sub.ptr_map[key] = mem.device_pointer;
-
- device_pointers.push_back(mem.device_pointer);
- }
- foreach (SubDevice &sub, denoising_devices) {
- if (matching_rendering_and_denoising_devices) {
- sub.ptr_map[key] = device_pointers.front();
- device_pointers.erase(device_pointers.begin());
- }
- else {
- mem.device = sub.device;
- mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
- mem.device_size = existing_size;
-
- sub.device->mem_zero(mem);
- sub.ptr_map[key] = mem.device_pointer;
- }
- }
- }
- else {
- foreach (const vector<SubDevice *> &island, peer_islands) {
- SubDevice *owner_sub = find_suitable_mem_device(existing_key, island);
- mem.device = owner_sub->device;
- mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0;
- mem.device_size = existing_size;
-
- owner_sub->device->mem_zero(mem);
- owner_sub->ptr_map[key] = mem.device_pointer;
- }
- }
-
- mem.device = this;
- mem.device_pointer = key;
- stats.mem_alloc(mem.device_size - existing_size);
- }
-
- void mem_free(device_memory &mem) override
- {
- device_ptr key = mem.device_pointer;
- size_t existing_size = mem.device_size;
-
- /* Free memory that was allocated for all devices (see above) on each device */
- if (mem.type == MEM_PIXELS || (strcmp(mem.name, "RenderBuffers") == 0 && use_denoising)) {
- foreach (SubDevice &sub, devices) {
- mem.device = sub.device;
- mem.device_pointer = sub.ptr_map[key];
- mem.device_size = existing_size;
-
- sub.device->mem_free(mem);
- sub.ptr_map.erase(sub.ptr_map.find(key));
- }
- foreach (SubDevice &sub, denoising_devices) {
- if (matching_rendering_and_denoising_devices) {
- sub.ptr_map.erase(key);
- }
- else {
- mem.device = sub.device;
- mem.device_pointer = sub.ptr_map[key];
- mem.device_size = existing_size;
-
- sub.device->mem_free(mem);
- sub.ptr_map.erase(sub.ptr_map.find(key));
- }
- }
- }
- else {
- foreach (const vector<SubDevice *> &island, peer_islands) {
- SubDevice *owner_sub = find_matching_mem_device(key, *island.front());
- mem.device = owner_sub->device;
- mem.device_pointer = owner_sub->ptr_map[key];
- mem.device_size = existing_size;
-
- owner_sub->device->mem_free(mem);
- owner_sub->ptr_map.erase(owner_sub->ptr_map.find(key));
-
- if (mem.type == MEM_TEXTURE) {
- /* Free texture objects on all devices */
- foreach (SubDevice *island_sub, island) {
- if (island_sub != owner_sub) {
- island_sub->device->mem_free(mem);
- }
- }
- }
- }
- }
-
- mem.device = this;
- mem.device_pointer = 0;
- mem.device_size = 0;
- stats.mem_free(existing_size);
- }
-
- void const_copy_to(const char *name, void *host, size_t size) override
- {
- foreach (SubDevice &sub, devices)
- sub.device->const_copy_to(name, host, size);
- }
-
- void draw_pixels(device_memory &rgba,
- int y,
- int w,
- int h,
- int width,
- int height,
- int dx,
- int dy,
- int dw,
- int dh,
- bool transparent,
- const DeviceDrawParams &draw_params) override
- {
- assert(rgba.type == MEM_PIXELS);
-
- device_ptr key = rgba.device_pointer;
- int i = 0, sub_h = h / devices.size();
- int sub_height = height / devices.size();
-
- foreach (SubDevice &sub, devices) {
- int sy = y + i * sub_h;
- int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h;
- int sheight = (i == (int)devices.size() - 1) ? height - sub_height * i : sub_height;
- int sdy = dy + i * sub_height;
- /* adjust math for w/width */
-
- rgba.device_pointer = sub.ptr_map[key];
- sub.device->draw_pixels(
- rgba, sy, w, sh, width, sheight, dx, sdy, dw, dh, transparent, draw_params);
- i++;
- }
-
- rgba.device_pointer = key;
- }
-
- void map_tile(Device *sub_device, RenderTile &tile) override
- {
- if (!tile.buffer) {
- return;
- }
-
- foreach (SubDevice &sub, devices) {
- if (sub.device == sub_device) {
- tile.buffer = find_matching_mem(tile.buffer, sub);
- return;
- }
- }
-
- foreach (SubDevice &sub, denoising_devices) {
- if (sub.device == sub_device) {
- tile.buffer = sub.ptr_map[tile.buffer];
- return;
- }
- }
- }
-
- int device_number(Device *sub_device) override
- {
- int i = 0;
-
- foreach (SubDevice &sub, devices) {
- if (sub.device == sub_device)
- return i;
- i++;
- }
-
- foreach (SubDevice &sub, denoising_devices) {
- if (sub.device == sub_device)
- return i;
- i++;
- }
-
- return -1;
- }
-
- void map_neighbor_tiles(Device *sub_device, RenderTileNeighbors &neighbors) override
- {
- for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
- RenderTile &tile = neighbors.tiles[i];
-
- if (!tile.buffers) {
- continue;
- }
-
- device_vector<float> &mem = tile.buffers->buffer;
- tile.buffer = mem.device_pointer;
-
- if (mem.device == this && matching_rendering_and_denoising_devices) {
- /* Skip unnecessary copies in viewport mode (buffer covers the
- * whole image), but still need to fix up the tile device pointer. */
- map_tile(sub_device, tile);
- continue;
- }
-
- /* If the tile was rendered on another device, copy its memory to
- * the current device now, for the duration of the denoising task.
- * Note that this temporarily modifies the RenderBuffers and calls
- * the device, so this function is not thread safe. */
- if (mem.device != sub_device) {
- /* Only copy from device to host once. This is faster, but
- * also required for the case where a CPU thread is denoising
- * a tile rendered on the GPU. In that case we have to avoid
- * overwriting the buffer being de-noised by the CPU thread. */
- if (!tile.buffers->map_neighbor_copied) {
- tile.buffers->map_neighbor_copied = true;
- mem.copy_from_device();
- }
-
- if (mem.device == this) {
- /* Can re-use memory if tile is already allocated on the sub device. */
- map_tile(sub_device, tile);
- mem.swap_device(sub_device, mem.device_size, tile.buffer);
- }
- else {
- mem.swap_device(sub_device, 0, 0);
- }
-
- mem.copy_to_device();
-
- tile.buffer = mem.device_pointer;
- tile.device_size = mem.device_size;
-
- mem.restore_device();
- }
- }
- }
-
- void unmap_neighbor_tiles(Device *sub_device, RenderTileNeighbors &neighbors) override
- {
- RenderTile &target_tile = neighbors.target;
- device_vector<float> &mem = target_tile.buffers->buffer;
-
- if (mem.device == this && matching_rendering_and_denoising_devices) {
- return;
- }
-
- /* Copy denoised result back to the host. */
- mem.swap_device(sub_device, target_tile.device_size, target_tile.buffer);
- mem.copy_from_device();
- mem.restore_device();
-
- /* Copy denoised result to the original device. */
- mem.copy_to_device();
-
- for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
- RenderTile &tile = neighbors.tiles[i];
- if (!tile.buffers) {
- continue;
- }
-
- device_vector<float> &mem = tile.buffers->buffer;
-
- if (mem.device != sub_device && mem.device != this) {
- /* Free up memory again if it was allocated for the copy above. */
- mem.swap_device(sub_device, tile.device_size, tile.buffer);
- sub_device->mem_free(mem);
- mem.restore_device();
- }
- }
- }
-
- int get_split_task_count(DeviceTask &task) override
- {
- int total_tasks = 0;
- list<DeviceTask> tasks;
- task.split(tasks, devices.size());
- foreach (SubDevice &sub, devices) {
- if (!tasks.empty()) {
- DeviceTask subtask = tasks.front();
- tasks.pop_front();
-
- total_tasks += sub.device->get_split_task_count(subtask);
- }
- }
- return total_tasks;
- }
-
- void task_add(DeviceTask &task) override
- {
- list<SubDevice> task_devices = devices;
- if (!denoising_devices.empty()) {
- if (task.type == DeviceTask::DENOISE_BUFFER) {
- /* Denoising tasks should be redirected to the denoising devices entirely. */
- task_devices = denoising_devices;
- }
- else if (task.type == DeviceTask::RENDER && (task.tile_types & RenderTile::DENOISE)) {
- const uint tile_types = task.tile_types;
- /* For normal rendering tasks, only redirect the denoising part to the denoising devices.
- * There is no need to split the task here, since they all run through 'acquire_tile'. */
- task.tile_types = RenderTile::DENOISE;
- foreach (SubDevice &sub, denoising_devices) {
- sub.device->task_add(task);
- }
- /* Rendering itself should still be executed on the rendering devices. */
- task.tile_types = tile_types ^ RenderTile::DENOISE;
- }
- }
-
- list<DeviceTask> tasks;
- task.split(tasks, task_devices.size());
-
- foreach (SubDevice &sub, task_devices) {
- if (!tasks.empty()) {
- DeviceTask subtask = tasks.front();
- tasks.pop_front();
-
- if (task.buffer)
- subtask.buffer = find_matching_mem(task.buffer, sub);
- if (task.rgba_byte)
- subtask.rgba_byte = sub.ptr_map[task.rgba_byte];
- if (task.rgba_half)
- subtask.rgba_half = sub.ptr_map[task.rgba_half];
- if (task.shader_input)
- subtask.shader_input = find_matching_mem(task.shader_input, sub);
- if (task.shader_output)
- subtask.shader_output = find_matching_mem(task.shader_output, sub);
-
- sub.device->task_add(subtask);
-
- if (task.buffers && task.buffers->buffer.device == this) {
- /* Synchronize access to RenderBuffers, since 'map_neighbor_tiles' is not thread-safe. */
- sub.device->task_wait();
- }
- }
- }
- }
-
- void task_wait() override
- {
- foreach (SubDevice &sub, devices)
- sub.device->task_wait();
- foreach (SubDevice &sub, denoising_devices)
- sub.device->task_wait();
- }
-
- void task_cancel() override
- {
- foreach (SubDevice &sub, devices)
- sub.device->task_cancel();
- foreach (SubDevice &sub, denoising_devices)
- sub.device->task_cancel();
- }
-};
-
-Device *device_multi_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
-{
- return new MultiDevice(info, stats, profiler, background);
-}
-
-CCL_NAMESPACE_END
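For reference, the ownership rule that find_suitable_mem_device() implements above can be summarized in a small standalone sketch. The SubDevice struct and its fields here are simplified stand-ins for illustration only, not the actual Cycles types:

#include <cstdint>
#include <map>
#include <vector>

/* Simplified stand-in for the Cycles SubDevice used above. */
using device_ptr = uint64_t;

struct SubDevice {
  std::map<device_ptr, device_ptr> ptr_map; /* multi-device key -> real device pointer */
  size_t mem_used = 0;                      /* memory usage of the wrapped device */
};

/* Same selection rule as find_suitable_mem_device(): prefer the island member that
 * already owns the key; for a new allocation (key == 0) pick the least-loaded one. */
SubDevice *find_suitable_mem_device(device_ptr key, const std::vector<SubDevice *> &island)
{
  SubDevice *owner = island.front();
  for (SubDevice *sub : island) {
    if (key ? (sub->ptr_map.count(key) != 0) : (sub->mem_used < owner->mem_used)) {
      owner = sub;
    }
  }
  return owner;
}

Reusing an existing owner avoids duplicating distributed buffers, while sending new allocations to the least-loaded peer keeps memory use roughly balanced across an island.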
diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp
deleted file mode 100644
index 8904b517e92..00000000000
--- a/intern/cycles/device/device_network.cpp
+++ /dev/null
@@ -1,812 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "device/device_network.h"
-#include "device/device.h"
-#include "device/device_intern.h"
-
-#include "util/util_foreach.h"
-#include "util/util_logging.h"
-
-#if defined(WITH_NETWORK)
-
-CCL_NAMESPACE_BEGIN
-
-typedef map<device_ptr, device_ptr> PtrMap;
-typedef vector<uint8_t> DataVector;
-typedef map<device_ptr, DataVector> DataMap;
-
-/* tile list */
-typedef vector<RenderTile> TileList;
-
-/* search a list of tiles and find the one that matches the passed render tile */
-static TileList::iterator tile_list_find(TileList &tile_list, RenderTile &tile)
-{
- for (TileList::iterator it = tile_list.begin(); it != tile_list.end(); ++it)
- if (tile.x == it->x && tile.y == it->y && tile.start_sample == it->start_sample)
- return it;
- return tile_list.end();
-}
-
-class NetworkDevice : public Device {
- public:
- boost::asio::io_service io_service;
- tcp::socket socket;
- device_ptr mem_counter;
- DeviceTask the_task; /* todo: handle multiple tasks */
-
- thread_mutex rpc_lock;
-
- virtual bool show_samples() const
- {
- return false;
- }
-
- NetworkDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, const char *address)
- : Device(info, stats, profiler, true), socket(io_service)
- {
- error_func = NetworkError();
- stringstream portstr;
- portstr << SERVER_PORT;
-
- tcp::resolver resolver(io_service);
- tcp::resolver::query query(address, portstr.str());
- tcp::resolver::iterator endpoint_iterator = resolver.resolve(query);
- tcp::resolver::iterator end;
-
- boost::system::error_code error = boost::asio::error::host_not_found;
- while (error && endpoint_iterator != end) {
- socket.close();
- socket.connect(*endpoint_iterator++, error);
- }
-
- if (error)
- error_func.network_error(error.message());
-
- mem_counter = 0;
- }
-
- ~NetworkDevice()
- {
- RPCSend snd(socket, &error_func, "stop");
- snd.write();
- }
-
- virtual BVHLayoutMask get_bvh_layout_mask() const
- {
- return BVH_LAYOUT_BVH2;
- }
-
- void mem_alloc(device_memory &mem)
- {
- if (mem.name) {
- VLOG(1) << "Buffer allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
- }
-
- thread_scoped_lock lock(rpc_lock);
-
- mem.device_pointer = ++mem_counter;
-
- RPCSend snd(socket, &error_func, "mem_alloc");
- snd.add(mem);
- snd.write();
- }
-
- void mem_copy_to(device_memory &mem)
- {
- thread_scoped_lock lock(rpc_lock);
-
- RPCSend snd(socket, &error_func, "mem_copy_to");
-
- snd.add(mem);
- snd.write();
- snd.write_buffer(mem.host_pointer, mem.memory_size());
- }
-
- void mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
- {
- thread_scoped_lock lock(rpc_lock);
-
- size_t data_size = mem.memory_size();
-
- RPCSend snd(socket, &error_func, "mem_copy_from");
-
- snd.add(mem);
- snd.add(y);
- snd.add(w);
- snd.add(h);
- snd.add(elem);
- snd.write();
-
- RPCReceive rcv(socket, &error_func);
- rcv.read_buffer(mem.host_pointer, data_size);
- }
-
- void mem_zero(device_memory &mem)
- {
- thread_scoped_lock lock(rpc_lock);
-
- RPCSend snd(socket, &error_func, "mem_zero");
-
- snd.add(mem);
- snd.write();
- }
-
- void mem_free(device_memory &mem)
- {
- if (mem.device_pointer) {
- thread_scoped_lock lock(rpc_lock);
-
- RPCSend snd(socket, &error_func, "mem_free");
-
- snd.add(mem);
- snd.write();
-
- mem.device_pointer = 0;
- }
- }
-
- void const_copy_to(const char *name, void *host, size_t size)
- {
- thread_scoped_lock lock(rpc_lock);
-
- RPCSend snd(socket, &error_func, "const_copy_to");
-
- string name_string(name);
-
- snd.add(name_string);
- snd.add(size);
- snd.write();
- snd.write_buffer(host, size);
- }
-
- bool load_kernels(const DeviceRequestedFeatures &requested_features)
- {
- if (error_func.have_error())
- return false;
-
- thread_scoped_lock lock(rpc_lock);
-
- RPCSend snd(socket, &error_func, "load_kernels");
- snd.add(requested_features.experimental);
- snd.add(requested_features.max_closure);
- snd.add(requested_features.max_nodes_group);
- snd.add(requested_features.nodes_features);
- snd.write();
-
- bool result;
- RPCReceive rcv(socket, &error_func);
- rcv.read(result);
-
- return result;
- }
-
- void task_add(DeviceTask &task)
- {
- thread_scoped_lock lock(rpc_lock);
-
- the_task = task;
-
- RPCSend snd(socket, &error_func, "task_add");
- snd.add(task);
- snd.write();
- }
-
- void task_wait()
- {
- thread_scoped_lock lock(rpc_lock);
-
- RPCSend snd(socket, &error_func, "task_wait");
- snd.write();
-
- lock.unlock();
-
- TileList the_tiles;
-
- /* todo: run this threaded for connecting to multiple clients */
- for (;;) {
- if (error_func.have_error())
- break;
-
- RenderTile tile;
-
- lock.lock();
- RPCReceive rcv(socket, &error_func);
-
- if (rcv.name == "acquire_tile") {
- lock.unlock();
-
- /* todo: watch out for recursive calls! */
- if (the_task.acquire_tile(this, tile)) { /* write return as bool */
- the_tiles.push_back(tile);
-
- lock.lock();
- RPCSend snd(socket, &error_func, "acquire_tile");
- snd.add(tile);
- snd.write();
- lock.unlock();
- }
- else {
- lock.lock();
- RPCSend snd(socket, &error_func, "acquire_tile_none");
- snd.write();
- lock.unlock();
- }
- }
- else if (rcv.name == "release_tile") {
- rcv.read(tile);
- lock.unlock();
-
- TileList::iterator it = tile_list_find(the_tiles, tile);
- if (it != the_tiles.end()) {
- tile.buffers = it->buffers;
- the_tiles.erase(it);
- }
-
- assert(tile.buffers != NULL);
-
- the_task.release_tile(tile);
-
- lock.lock();
- RPCSend snd(socket, &error_func, "release_tile");
- snd.write();
- lock.unlock();
- }
- else if (rcv.name == "task_wait_done") {
- lock.unlock();
- break;
- }
- else
- lock.unlock();
- }
- }
-
- void task_cancel()
- {
- thread_scoped_lock lock(rpc_lock);
- RPCSend snd(socket, &error_func, "task_cancel");
- snd.write();
- }
-
- int get_split_task_count(DeviceTask &)
- {
- return 1;
- }
-
- private:
- NetworkError error_func;
-};
-
-Device *device_network_create(DeviceInfo &info,
- Stats &stats,
- Profiler &profiler,
- const char *address)
-{
- return new NetworkDevice(info, stats, profiler, address);
-}
-
-void device_network_info(vector<DeviceInfo> &devices)
-{
- DeviceInfo info;
-
- info.type = DEVICE_NETWORK;
- info.description = "Network Device";
- info.id = "NETWORK";
- info.num = 0;
-
- /* todo: get this info from device */
- info.has_volume_decoupled = false;
- info.has_adaptive_stop_per_sample = false;
- info.has_osl = false;
- info.denoisers = DENOISER_NONE;
-
- devices.push_back(info);
-}
-
-class DeviceServer {
- public:
- thread_mutex rpc_lock;
-
- void network_error(const string &message)
- {
- error_func.network_error(message);
- }
-
- bool have_error()
- {
- return error_func.have_error();
- }
-
- DeviceServer(Device *device_, tcp::socket &socket_)
- : device(device_), socket(socket_), stop(false), blocked_waiting(false)
- {
- error_func = NetworkError();
- }
-
- void listen()
- {
- /* receive remote function calls */
- for (;;) {
- listen_step();
-
- if (stop)
- break;
- }
- }
-
- protected:
- void listen_step()
- {
- thread_scoped_lock lock(rpc_lock);
- RPCReceive rcv(socket, &error_func);
-
- if (rcv.name == "stop")
- stop = true;
- else
- process(rcv, lock);
- }
-
- /* create a memory buffer for a device buffer and insert it into mem_data */
- DataVector &data_vector_insert(device_ptr client_pointer, size_t data_size)
- {
- /* create a new DataVector and insert it into mem_data */
- pair<DataMap::iterator, bool> data_ins = mem_data.insert(
- DataMap::value_type(client_pointer, DataVector()));
-
- /* make sure it was a unique insertion */
- assert(data_ins.second);
-
- /* get a reference to the inserted vector */
- DataVector &data_v = data_ins.first->second;
-
- /* size the vector */
- data_v.resize(data_size);
-
- return data_v;
- }
-
- DataVector &data_vector_find(device_ptr client_pointer)
- {
- DataMap::iterator i = mem_data.find(client_pointer);
- assert(i != mem_data.end());
- return i->second;
- }
-
- /* setup mapping and reverse mapping of client_pointer<->real_pointer */
- void pointer_mapping_insert(device_ptr client_pointer, device_ptr real_pointer)
- {
- pair<PtrMap::iterator, bool> mapins;
-
- /* insert mapping from client pointer to our real device pointer */
- mapins = ptr_map.insert(PtrMap::value_type(client_pointer, real_pointer));
- assert(mapins.second);
-
- /* insert reverse mapping from our real device pointer to the client pointer */
- mapins = ptr_imap.insert(PtrMap::value_type(real_pointer, client_pointer));
- assert(mapins.second);
- }
-
- device_ptr device_ptr_from_client_pointer(device_ptr client_pointer)
- {
- PtrMap::iterator i = ptr_map.find(client_pointer);
- assert(i != ptr_map.end());
- return i->second;
- }
-
- device_ptr device_ptr_from_client_pointer_erase(device_ptr client_pointer)
- {
- PtrMap::iterator i = ptr_map.find(client_pointer);
- assert(i != ptr_map.end());
-
- device_ptr result = i->second;
-
- /* erase the mapping */
- ptr_map.erase(i);
-
- /* erase the reverse mapping */
- PtrMap::iterator irev = ptr_imap.find(result);
- assert(irev != ptr_imap.end());
- ptr_imap.erase(irev);
-
- /* erase the data vector */
- DataMap::iterator idata = mem_data.find(client_pointer);
- assert(idata != mem_data.end());
- mem_data.erase(idata);
-
- return result;
- }
-
- /* note that the lock must be already acquired upon entry.
- * This is necessary because the caller often peeks at
- * the header and delegates control to here when it doesn't
- * specifically handle the current RPC.
- * The lock must be unlocked before returning */
- void process(RPCReceive &rcv, thread_scoped_lock &lock)
- {
- if (rcv.name == "mem_alloc") {
- string name;
- network_device_memory mem(device);
- rcv.read(mem, name);
- lock.unlock();
-
- /* Allocate host side data buffer. */
- size_t data_size = mem.memory_size();
- device_ptr client_pointer = mem.device_pointer;
-
- DataVector &data_v = data_vector_insert(client_pointer, data_size);
- mem.host_pointer = (data_size) ? (void *)&(data_v[0]) : 0;
-
- /* Perform the allocation on the actual device. */
- device->mem_alloc(mem);
-
- /* Store a mapping to/from client_pointer and real device pointer. */
- pointer_mapping_insert(client_pointer, mem.device_pointer);
- }
- else if (rcv.name == "mem_copy_to") {
- string name;
- network_device_memory mem(device);
- rcv.read(mem, name);
- lock.unlock();
-
- size_t data_size = mem.memory_size();
- device_ptr client_pointer = mem.device_pointer;
-
- if (client_pointer) {
- /* Lookup existing host side data buffer. */
- DataVector &data_v = data_vector_find(client_pointer);
- mem.host_pointer = (void *)&data_v[0];
-
- /* Translate the client pointer to a real device pointer. */
- mem.device_pointer = device_ptr_from_client_pointer(client_pointer);
- }
- else {
- /* Allocate host side data buffer. */
- DataVector &data_v = data_vector_insert(client_pointer, data_size);
- mem.host_pointer = (data_size) ? (void *)&(data_v[0]) : 0;
- }
-
- /* Copy data from network into memory buffer. */
- rcv.read_buffer((uint8_t *)mem.host_pointer, data_size);
-
- /* Copy the data from the memory buffer to the device buffer. */
- device->mem_copy_to(mem);
-
- if (!client_pointer) {
- /* Store a mapping to/from client_pointer and real device pointer. */
- pointer_mapping_insert(client_pointer, mem.device_pointer);
- }
- }
- else if (rcv.name == "mem_copy_from") {
- string name;
- network_device_memory mem(device);
- int y, w, h, elem;
-
- rcv.read(mem, name);
- rcv.read(y);
- rcv.read(w);
- rcv.read(h);
- rcv.read(elem);
-
- device_ptr client_pointer = mem.device_pointer;
- mem.device_pointer = device_ptr_from_client_pointer(client_pointer);
-
- DataVector &data_v = data_vector_find(client_pointer);
-
- mem.host_pointer = (void *)&(data_v[0]);
-
- device->mem_copy_from(mem, y, w, h, elem);
-
- size_t data_size = mem.memory_size();
-
- RPCSend snd(socket, &error_func, "mem_copy_from");
- snd.write();
- snd.write_buffer((uint8_t *)mem.host_pointer, data_size);
- lock.unlock();
- }
- else if (rcv.name == "mem_zero") {
- string name;
- network_device_memory mem(device);
- rcv.read(mem, name);
- lock.unlock();
-
- size_t data_size = mem.memory_size();
- device_ptr client_pointer = mem.device_pointer;
-
- if (client_pointer) {
- /* Lookup existing host side data buffer. */
- DataVector &data_v = data_vector_find(client_pointer);
- mem.host_pointer = (void *)&data_v[0];
-
- /* Translate the client pointer to a real device pointer. */
- mem.device_pointer = device_ptr_from_client_pointer(client_pointer);
- }
- else {
- /* Allocate host side data buffer. */
- DataVector &data_v = data_vector_insert(client_pointer, data_size);
- mem.host_pointer = (data_size) ? (void *)&(data_v[0]) : 0;
- }
-
- /* Zero memory. */
- device->mem_zero(mem);
-
- if (!client_pointer) {
- /* Store a mapping to/from client_pointer and real device pointer. */
- pointer_mapping_insert(client_pointer, mem.device_pointer);
- }
- }
- else if (rcv.name == "mem_free") {
- string name;
- network_device_memory mem(device);
-
- rcv.read(mem, name);
- lock.unlock();
-
- device_ptr client_pointer = mem.device_pointer;
-
- mem.device_pointer = device_ptr_from_client_pointer_erase(client_pointer);
-
- device->mem_free(mem);
- }
- else if (rcv.name == "const_copy_to") {
- string name_string;
- size_t size;
-
- rcv.read(name_string);
- rcv.read(size);
-
- vector<char> host_vector(size);
- rcv.read_buffer(&host_vector[0], size);
- lock.unlock();
-
- device->const_copy_to(name_string.c_str(), &host_vector[0], size);
- }
- else if (rcv.name == "load_kernels") {
- DeviceRequestedFeatures requested_features;
- rcv.read(requested_features.experimental);
- rcv.read(requested_features.max_closure);
- rcv.read(requested_features.max_nodes_group);
- rcv.read(requested_features.nodes_features);
-
- bool result;
- result = device->load_kernels(requested_features);
- RPCSend snd(socket, &error_func, "load_kernels");
- snd.add(result);
- snd.write();
- lock.unlock();
- }
- else if (rcv.name == "task_add") {
- DeviceTask task;
-
- rcv.read(task);
- lock.unlock();
-
- if (task.buffer)
- task.buffer = device_ptr_from_client_pointer(task.buffer);
-
- if (task.rgba_half)
- task.rgba_half = device_ptr_from_client_pointer(task.rgba_half);
-
- if (task.rgba_byte)
- task.rgba_byte = device_ptr_from_client_pointer(task.rgba_byte);
-
- if (task.shader_input)
- task.shader_input = device_ptr_from_client_pointer(task.shader_input);
-
- if (task.shader_output)
- task.shader_output = device_ptr_from_client_pointer(task.shader_output);
-
- task.acquire_tile = function_bind(&DeviceServer::task_acquire_tile, this, _1, _2);
- task.release_tile = function_bind(&DeviceServer::task_release_tile, this, _1);
- task.update_progress_sample = function_bind(&DeviceServer::task_update_progress_sample,
- this);
- task.update_tile_sample = function_bind(&DeviceServer::task_update_tile_sample, this, _1);
- task.get_cancel = function_bind(&DeviceServer::task_get_cancel, this);
-
- device->task_add(task);
- }
- else if (rcv.name == "task_wait") {
- lock.unlock();
-
- blocked_waiting = true;
- device->task_wait();
- blocked_waiting = false;
-
- lock.lock();
- RPCSend snd(socket, &error_func, "task_wait_done");
- snd.write();
- lock.unlock();
- }
- else if (rcv.name == "task_cancel") {
- lock.unlock();
- device->task_cancel();
- }
- else if (rcv.name == "acquire_tile") {
- AcquireEntry entry;
- entry.name = rcv.name;
- rcv.read(entry.tile);
- acquire_queue.push_back(entry);
- lock.unlock();
- }
- else if (rcv.name == "acquire_tile_none") {
- AcquireEntry entry;
- entry.name = rcv.name;
- acquire_queue.push_back(entry);
- lock.unlock();
- }
- else if (rcv.name == "release_tile") {
- AcquireEntry entry;
- entry.name = rcv.name;
- acquire_queue.push_back(entry);
- lock.unlock();
- }
- else {
- cout << "Error: unexpected RPC receive call \"" + rcv.name + "\"\n";
- lock.unlock();
- }
- }
-
- bool task_acquire_tile(Device *, RenderTile &tile)
- {
- thread_scoped_lock acquire_lock(acquire_mutex);
-
- bool result = false;
-
- RPCSend snd(socket, &error_func, "acquire_tile");
- snd.write();
-
- do {
- if (blocked_waiting)
- listen_step();
-
- /* todo: avoid busy wait loop */
- thread_scoped_lock lock(rpc_lock);
-
- if (!acquire_queue.empty()) {
- AcquireEntry entry = acquire_queue.front();
- acquire_queue.pop_front();
-
- if (entry.name == "acquire_tile") {
- tile = entry.tile;
-
- if (tile.buffer)
- tile.buffer = ptr_map[tile.buffer];
-
- result = true;
- break;
- }
- else if (entry.name == "acquire_tile_none") {
- break;
- }
- else {
- cout << "Error: unexpected acquire RPC receive call \"" + entry.name + "\"\n";
- }
- }
- } while (acquire_queue.empty() && !stop && !have_error());
-
- return result;
- }
-
- void task_update_progress_sample()
- {
- ; /* skip */
- }
-
- void task_update_tile_sample(RenderTile &)
- {
- ; /* skip */
- }
-
- void task_release_tile(RenderTile &tile)
- {
- thread_scoped_lock acquire_lock(acquire_mutex);
-
- if (tile.buffer)
- tile.buffer = ptr_imap[tile.buffer];
-
- {
- thread_scoped_lock lock(rpc_lock);
- RPCSend snd(socket, &error_func, "release_tile");
- snd.add(tile);
- snd.write();
- lock.unlock();
- }
-
- do {
- if (blocked_waiting)
- listen_step();
-
- /* todo: avoid busy wait loop */
- thread_scoped_lock lock(rpc_lock);
-
- if (!acquire_queue.empty()) {
- AcquireEntry entry = acquire_queue.front();
- acquire_queue.pop_front();
-
- if (entry.name == "release_tile") {
- lock.unlock();
- break;
- }
- else {
- cout << "Error: unexpected release RPC receive call \"" + entry.name + "\"\n";
- }
- }
- } while (acquire_queue.empty() && !stop);
- }
-
- bool task_get_cancel()
- {
- return false;
- }
-
- /* properties */
- Device *device;
- tcp::socket &socket;
-
- /* mapping of remote to local pointer */
- PtrMap ptr_map;
- PtrMap ptr_imap;
- DataMap mem_data;
-
- struct AcquireEntry {
- string name;
- RenderTile tile;
- };
-
- thread_mutex acquire_mutex;
- list<AcquireEntry> acquire_queue;
-
- bool stop;
- bool blocked_waiting;
-
- private:
- NetworkError error_func;
-
- /* todo: free memory and device (osl) on network error */
-};
-
-void Device::server_run()
-{
- try {
- /* starts thread that responds to discovery requests */
- ServerDiscovery discovery;
-
- for (;;) {
- /* accept connection */
- boost::asio::io_service io_service;
- tcp::acceptor acceptor(io_service, tcp::endpoint(tcp::v4(), SERVER_PORT));
-
- tcp::socket socket(io_service);
- acceptor.accept(socket);
-
- string remote_address = socket.remote_endpoint().address().to_string();
- printf("Connected to remote client at: %s\n", remote_address.c_str());
-
- DeviceServer server(this, socket);
- server.listen();
-
- printf("Disconnected.\n");
- }
- }
- catch (exception &e) {
- fprintf(stderr, "Network server exception: %s\n", e.what());
- }
-}
-
-CCL_NAMESPACE_END
-
-#endif
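The server side above keeps a bidirectional mapping between the pointers the client knows about and the real device pointers allocated locally. Below is a minimal standalone sketch of that bookkeeping; the PointerTable helper is hypothetical and simplified from the ptr_map/ptr_imap members used in DeviceServer:

#include <cassert>
#include <cstdint>
#include <map>

using device_ptr = uint64_t;

/* Hypothetical standalone version of the client<->server pointer bookkeeping. */
struct PointerTable {
  std::map<device_ptr, device_ptr> ptr_map;  /* client pointer -> real device pointer */
  std::map<device_ptr, device_ptr> ptr_imap; /* real device pointer -> client pointer */

  void insert(device_ptr client_pointer, device_ptr real_pointer)
  {
    bool inserted = ptr_map.insert({client_pointer, real_pointer}).second;
    assert(inserted);
    inserted = ptr_imap.insert({real_pointer, client_pointer}).second;
    assert(inserted);
    (void)inserted;
  }

  /* Erase both directions and return the real pointer, mirroring what
   * device_ptr_from_client_pointer_erase() does above. */
  device_ptr erase(device_ptr client_pointer)
  {
    auto it = ptr_map.find(client_pointer);
    assert(it != ptr_map.end());
    const device_ptr real_pointer = it->second;
    ptr_map.erase(it);
    ptr_imap.erase(real_pointer);
    return real_pointer;
  }
};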
diff --git a/intern/cycles/device/device_network.h b/intern/cycles/device/device_network.h
deleted file mode 100644
index b3a0f6daa57..00000000000
--- a/intern/cycles/device/device_network.h
+++ /dev/null
@@ -1,490 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __DEVICE_NETWORK_H__
-#define __DEVICE_NETWORK_H__
-
-#ifdef WITH_NETWORK
-
-# include <boost/archive/binary_iarchive.hpp>
-# include <boost/archive/binary_oarchive.hpp>
-# include <boost/archive/text_iarchive.hpp>
-# include <boost/archive/text_oarchive.hpp>
-# include <boost/array.hpp>
-# include <boost/asio.hpp>
-# include <boost/bind.hpp>
-# include <boost/serialization/vector.hpp>
-# include <boost/thread.hpp>
-
-# include <deque>
-# include <iostream>
-# include <sstream>
-
-# include "render/buffers.h"
-
-# include "util/util_foreach.h"
-# include "util/util_list.h"
-# include "util/util_map.h"
-# include "util/util_param.h"
-# include "util/util_string.h"
-
-CCL_NAMESPACE_BEGIN
-
-using std::cerr;
-using std::cout;
-using std::exception;
-using std::hex;
-using std::setw;
-
-using boost::asio::ip::tcp;
-
-static const int SERVER_PORT = 5120;
-static const int DISCOVER_PORT = 5121;
-static const string DISCOVER_REQUEST_MSG = "REQUEST_RENDER_SERVER_IP";
-static const string DISCOVER_REPLY_MSG = "REPLY_RENDER_SERVER_IP";
-
-# if 0
-typedef boost::archive::text_oarchive o_archive;
-typedef boost::archive::text_iarchive i_archive;
-# else
-typedef boost::archive::binary_oarchive o_archive;
-typedef boost::archive::binary_iarchive i_archive;
-# endif
-
-/* Serialization of device memory */
-
-class network_device_memory : public device_memory {
- public:
- network_device_memory(Device *device) : device_memory(device, "", MEM_READ_ONLY)
- {
- }
-
- ~network_device_memory()
- {
- device_pointer = 0;
- };
-
- vector<char> local_data;
-};
-
-/* Common network error function / object for both DeviceNetwork and DeviceServer. */
-class NetworkError {
- public:
- NetworkError()
- {
- error = "";
- error_count = 0;
- }
-
- ~NetworkError()
- {
- }
-
- void network_error(const string &message)
- {
- error = message;
- error_count += 1;
- }
-
- bool have_error()
- {
- return error_count > 0;
- }
-
- private:
- string error;
- int error_count;
-};
-
-/* Remote procedure call Send */
-
-class RPCSend {
- public:
- RPCSend(tcp::socket &socket_, NetworkError *e, const string &name_ = "")
- : name(name_), socket(socket_), archive(archive_stream), sent(false)
- {
- archive &name_;
- error_func = e;
- fprintf(stderr, "rpc send %s\n", name.c_str());
- }
-
- ~RPCSend()
- {
- }
-
- void add(const device_memory &mem)
- {
- archive &mem.data_type &mem.data_elements &mem.data_size;
- archive &mem.data_width &mem.data_height &mem.data_depth &mem.device_pointer;
- archive &mem.type &string(mem.name);
- archive &mem.interpolation &mem.extension;
- archive &mem.device_pointer;
- }
-
- template<typename T> void add(const T &data)
- {
- archive &data;
- }
-
- void add(const DeviceTask &task)
- {
- int type = (int)task.type;
- archive &type &task.x &task.y &task.w &task.h;
- archive &task.rgba_byte &task.rgba_half &task.buffer &task.sample &task.num_samples;
- archive &task.offset &task.stride;
- archive &task.shader_input &task.shader_output &task.shader_eval_type;
- archive &task.shader_x &task.shader_w;
- archive &task.need_finish_queue;
- }
-
- void add(const RenderTile &tile)
- {
- archive &tile.x &tile.y &tile.w &tile.h;
- archive &tile.start_sample &tile.num_samples &tile.sample;
- archive &tile.resolution &tile.offset &tile.stride;
- archive &tile.buffer;
- }
-
- void write()
- {
- boost::system::error_code error;
-
- /* get string from stream */
- string archive_str = archive_stream.str();
-
- /* first send fixed size header with size of following data */
- ostringstream header_stream;
- header_stream << setw(8) << hex << archive_str.size();
- string header_str = header_stream.str();
-
- boost::asio::write(
- socket, boost::asio::buffer(header_str), boost::asio::transfer_all(), error);
-
- if (error.value())
- error_func->network_error(error.message());
-
- /* then send actual data */
- boost::asio::write(
- socket, boost::asio::buffer(archive_str), boost::asio::transfer_all(), error);
-
- if (error.value())
- error_func->network_error(error.message());
-
- sent = true;
- }
-
- void write_buffer(void *buffer, size_t size)
- {
- boost::system::error_code error;
-
- boost::asio::write(
- socket, boost::asio::buffer(buffer, size), boost::asio::transfer_all(), error);
-
- if (error.value())
- error_func->network_error(error.message());
- }
-
- protected:
- string name;
- tcp::socket &socket;
- ostringstream archive_stream;
- o_archive archive;
- bool sent;
- NetworkError *error_func;
-};
-
-/* Remote procedure call Receive */
-
-class RPCReceive {
- public:
- RPCReceive(tcp::socket &socket_, NetworkError *e)
- : socket(socket_), archive_stream(NULL), archive(NULL)
- {
- error_func = e;
- /* read head with fixed size */
- vector<char> header(8);
- boost::system::error_code error;
- size_t len = boost::asio::read(socket, boost::asio::buffer(header), error);
-
- if (error.value()) {
- error_func->network_error(error.message());
- }
-
- /* verify if we got something */
- if (len == header.size()) {
- /* decode header */
- string header_str(&header[0], header.size());
- istringstream header_stream(header_str);
-
- size_t data_size;
-
- if ((header_stream >> hex >> data_size)) {
-
- vector<char> data(data_size);
- size_t len = boost::asio::read(socket, boost::asio::buffer(data), error);
-
- if (error.value())
- error_func->network_error(error.message());
-
- if (len == data_size) {
- archive_str = (data.size()) ? string(&data[0], data.size()) : string("");
-
- archive_stream = new istringstream(archive_str);
- archive = new i_archive(*archive_stream);
-
- *archive &name;
- fprintf(stderr, "rpc receive %s\n", name.c_str());
- }
- else {
- error_func->network_error("Network receive error: data size doesn't match header");
- }
- }
- else {
- error_func->network_error("Network receive error: can't decode data size from header");
- }
- }
- else {
- error_func->network_error("Network receive error: invalid header size");
- }
- }
-
- ~RPCReceive()
- {
- delete archive;
- delete archive_stream;
- }
-
- void read(network_device_memory &mem, string &name)
- {
- *archive &mem.data_type &mem.data_elements &mem.data_size;
- *archive &mem.data_width &mem.data_height &mem.data_depth &mem.device_pointer;
- *archive &mem.type &name;
- *archive &mem.interpolation &mem.extension;
- *archive &mem.device_pointer;
-
- mem.name = name.c_str();
- mem.host_pointer = 0;
-
- /* Can't transfer OpenGL texture over network. */
- if (mem.type == MEM_PIXELS) {
- mem.type = MEM_READ_WRITE;
- }
- }
-
- template<typename T> void read(T &data)
- {
- *archive &data;
- }
-
- void read_buffer(void *buffer, size_t size)
- {
- boost::system::error_code error;
- size_t len = boost::asio::read(socket, boost::asio::buffer(buffer, size), error);
-
- if (error.value()) {
- error_func->network_error(error.message());
- }
-
- if (len != size)
- cout << "Network receive error: buffer size doesn't match expected size\n";
- }
-
- void read(DeviceTask &task)
- {
- int type;
-
- *archive &type &task.x &task.y &task.w &task.h;
- *archive &task.rgba_byte &task.rgba_half &task.buffer &task.sample &task.num_samples;
- *archive &task.offset &task.stride;
- *archive &task.shader_input &task.shader_output &task.shader_eval_type;
- *archive &task.shader_x &task.shader_w;
- *archive &task.need_finish_queue;
-
- task.type = (DeviceTask::Type)type;
- }
-
- void read(RenderTile &tile)
- {
- *archive &tile.x &tile.y &tile.w &tile.h;
- *archive &tile.start_sample &tile.num_samples &tile.sample;
- *archive &tile.resolution &tile.offset &tile.stride;
- *archive &tile.buffer;
-
- tile.buffers = NULL;
- }
-
- string name;
-
- protected:
- tcp::socket &socket;
- string archive_str;
- istringstream *archive_stream;
- i_archive *archive;
- NetworkError *error_func;
-};
-
-/* Server auto discovery */
-
-class ServerDiscovery {
- public:
- explicit ServerDiscovery(bool discover = false)
- : listen_socket(io_service), collect_servers(false)
- {
- /* setup listen socket */
- listen_endpoint.address(boost::asio::ip::address_v4::any());
- listen_endpoint.port(DISCOVER_PORT);
-
- listen_socket.open(listen_endpoint.protocol());
-
- boost::asio::socket_base::reuse_address option(true);
- listen_socket.set_option(option);
-
- listen_socket.bind(listen_endpoint);
-
- /* setup receive callback */
- async_receive();
-
- /* start server discovery */
- if (discover) {
- collect_servers = true;
- servers.clear();
-
- broadcast_message(DISCOVER_REQUEST_MSG);
- }
-
- /* start thread */
- work = new boost::asio::io_service::work(io_service);
- thread = new boost::thread(boost::bind(&boost::asio::io_service::run, &io_service));
- }
-
- ~ServerDiscovery()
- {
- io_service.stop();
- thread->join();
- delete thread;
- delete work;
- }
-
- vector<string> get_server_list()
- {
- vector<string> result;
-
- mutex.lock();
- result = vector<string>(servers.begin(), servers.end());
- mutex.unlock();
-
- return result;
- }
-
- private:
- void handle_receive_from(const boost::system::error_code &error, size_t size)
- {
- if (error) {
- cout << "Server discovery receive error: " << error.message() << "\n";
- return;
- }
-
- if (size > 0) {
- string msg = string(receive_buffer, size);
-
- /* handle incoming message */
- if (collect_servers) {
- if (msg == DISCOVER_REPLY_MSG) {
- string address = receive_endpoint.address().to_string();
-
- mutex.lock();
-
- /* add address if it's not already in the list */
- bool found = std::find(servers.begin(), servers.end(), address) != servers.end();
-
- if (!found)
- servers.push_back(address);
-
- mutex.unlock();
- }
- }
- else {
- /* reply to request */
- if (msg == DISCOVER_REQUEST_MSG)
- broadcast_message(DISCOVER_REPLY_MSG);
- }
- }
-
- async_receive();
- }
-
- void async_receive()
- {
- listen_socket.async_receive_from(boost::asio::buffer(receive_buffer),
- receive_endpoint,
- boost::bind(&ServerDiscovery::handle_receive_from,
- this,
- boost::asio::placeholders::error,
- boost::asio::placeholders::bytes_transferred));
- }
-
- void broadcast_message(const string &msg)
- {
- /* setup broadcast socket */
- boost::asio::ip::udp::socket socket(io_service);
-
- socket.open(boost::asio::ip::udp::v4());
-
- boost::asio::socket_base::broadcast option(true);
- socket.set_option(option);
-
- boost::asio::ip::udp::endpoint broadcast_endpoint(
- boost::asio::ip::address::from_string("255.255.255.255"), DISCOVER_PORT);
-
- /* broadcast message */
- socket.send_to(boost::asio::buffer(msg), broadcast_endpoint);
- }
-
- /* network service and socket */
- boost::asio::io_service io_service;
- boost::asio::ip::udp::endpoint listen_endpoint;
- boost::asio::ip::udp::socket listen_socket;
-
- /* threading */
- boost::thread *thread;
- boost::asio::io_service::work *work;
- boost::mutex mutex;
-
- /* buffer and endpoint for receiving messages */
- char receive_buffer[256];
- boost::asio::ip::udp::endpoint receive_endpoint;
-
- // Fields: os, version, devices, status, host name, group name, ip.
- struct ServerInfo {
- string cycles_version;
- string os;
- int device_count;
- string status;
- string host_name;
- string group_name;
- string host_addr;
- };
-
- /* collection of server addresses in list */
- bool collect_servers;
- vector<string> servers;
-};
-
-CCL_NAMESPACE_END
-
-#endif
-
-#endif /* __DEVICE_NETWORK_H__ */
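The wire format used by RPCSend::write() and RPCReceive above is a fixed 8-character hexadecimal length header followed by the serialized payload. A minimal sketch of the same framing over in-memory strings, assuming plain std::string buffers instead of boost::asio sockets and serialization:

#include <iomanip>
#include <iostream>
#include <sstream>
#include <string>

/* Prepend an 8-character hex length header, as RPCSend::write() does. */
std::string frame(const std::string &payload)
{
  std::ostringstream header;
  header << std::setw(8) << std::hex << payload.size();
  return header.str() + payload;
}

/* Parse the header and extract the payload, as RPCReceive does. */
bool unframe(const std::string &wire, std::string &payload)
{
  if (wire.size() < 8) {
    return false;
  }
  size_t data_size = 0;
  std::istringstream header(wire.substr(0, 8));
  if (!(header >> std::hex >> data_size) || wire.size() < 8 + data_size) {
    return false;
  }
  payload = wire.substr(8, data_size);
  return true;
}

int main()
{
  std::string out;
  const std::string wire = frame("mem_alloc");
  if (unframe(wire, out)) {
    std::cout << out << "\n"; /* prints "mem_alloc" */
  }
  return 0;
}

Like the original, this assumes the payload size fits into eight hex digits; larger messages would need a wider header.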
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
deleted file mode 100644
index 9abb7cfb7fe..00000000000
--- a/intern/cycles/device/device_opencl.cpp
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_OPENCL
-
-# include "device/opencl/device_opencl.h"
-# include "device/device.h"
-# include "device/device_intern.h"
-
-# include "util/util_foreach.h"
-# include "util/util_logging.h"
-# include "util/util_set.h"
-# include "util/util_string.h"
-
-CCL_NAMESPACE_BEGIN
-
-Device *device_opencl_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
-{
- return opencl_create_split_device(info, stats, profiler, background);
-}
-
-bool device_opencl_init()
-{
- static bool initialized = false;
- static bool result = false;
-
- if (initialized)
- return result;
-
- initialized = true;
-
- if (OpenCLInfo::device_type() != 0) {
- int clew_result = clewInit();
- if (clew_result == CLEW_SUCCESS) {
- VLOG(1) << "CLEW initialization succeeded.";
- result = true;
- }
- else {
- VLOG(1) << "CLEW initialization failed: "
- << ((clew_result == CLEW_ERROR_ATEXIT_FAILED) ? "Error setting up atexit() handler" :
- "Error opening the library");
- }
- }
- else {
- VLOG(1) << "Skip initializing CLEW, platform is force disabled.";
- result = false;
- }
-
- return result;
-}
-
-static cl_int device_opencl_get_num_platforms_safe(cl_uint *num_platforms)
-{
-# ifdef _WIN32
- __try {
- return clGetPlatformIDs(0, NULL, num_platforms);
- }
- __except (EXCEPTION_EXECUTE_HANDLER) {
- /* Ignore crashes inside the OpenCL driver and hope we can
- * survive even with corrupted OpenCL installs. */
- fprintf(stderr, "Cycles OpenCL: driver crashed, continuing without OpenCL.\n");
- }
-
- *num_platforms = 0;
- return CL_DEVICE_NOT_FOUND;
-# else
- return clGetPlatformIDs(0, NULL, num_platforms);
-# endif
-}
-
-void device_opencl_info(vector<DeviceInfo> &devices)
-{
- cl_uint num_platforms = 0;
- device_opencl_get_num_platforms_safe(&num_platforms);
- if (num_platforms == 0) {
- return;
- }
-
- vector<OpenCLPlatformDevice> usable_devices;
- OpenCLInfo::get_usable_devices(&usable_devices);
- /* Devices are numbered consecutively across platforms. */
- int num_devices = 0;
- set<string> unique_ids;
- foreach (OpenCLPlatformDevice &platform_device, usable_devices) {
- /* Compute unique ID for persistent user preferences. */
- const string &platform_name = platform_device.platform_name;
- const string &device_name = platform_device.device_name;
- string hardware_id = platform_device.hardware_id;
- if (hardware_id == "") {
- hardware_id = string_printf("ID_%d", num_devices);
- }
- string id = string("OPENCL_") + platform_name + "_" + device_name + "_" + hardware_id;
-
- /* Hardware ID might not be unique, add device number in that case. */
- if (unique_ids.find(id) != unique_ids.end()) {
- id += string_printf("_ID_%d", num_devices);
- }
- unique_ids.insert(id);
-
- /* Create DeviceInfo. */
- DeviceInfo info;
- info.type = DEVICE_OPENCL;
- info.description = string_remove_trademark(string(device_name));
- info.num = num_devices;
- /* We don't know if it's used for display, but assume it is. */
- info.display_device = true;
- info.use_split_kernel = true;
- info.has_volume_decoupled = false;
- info.has_adaptive_stop_per_sample = false;
- info.denoisers = DENOISER_NLM;
- info.id = id;
-
- /* Check OpenCL extensions */
- info.has_half_images = platform_device.device_extensions.find("cl_khr_fp16") != string::npos;
-
- /* Disabled for now due to apparent AMD driver bug. */
- info.has_nanovdb = platform_name != "AMD Accelerated Parallel Processing";
-
- devices.push_back(info);
- num_devices++;
- }
-}
-
-string device_opencl_capabilities()
-{
- if (OpenCLInfo::device_type() == 0) {
- return "All OpenCL devices are forced to be OFF";
- }
- string result = "";
- string error_msg = ""; /* Only used by opencl_assert(), but in the future
- * it could also be nicely reported to the console.
- */
- cl_uint num_platforms = 0;
- opencl_assert(device_opencl_get_num_platforms_safe(&num_platforms));
- if (num_platforms == 0) {
- return "No OpenCL platforms found\n";
- }
- result += string_printf("Number of platforms: %u\n", num_platforms);
-
- vector<cl_platform_id> platform_ids;
- platform_ids.resize(num_platforms);
- opencl_assert(clGetPlatformIDs(num_platforms, &platform_ids[0], NULL));
-
-# define APPEND_INFO(func, id, name, what, type) \
- do { \
- type data; \
- memset(&data, 0, sizeof(data)); \
- opencl_assert(func(id, what, sizeof(data), &data, NULL)); \
- result += string_printf("%s: %s\n", name, to_string(data).c_str()); \
- } while (false)
-# define APPEND_STRING_INFO_IMPL(func, id, name, what, is_optional) \
- do { \
- string value; \
- size_t length = 0; \
- if (func(id, what, 0, NULL, &length) == CL_SUCCESS) { \
- vector<char> buffer(length + 1); \
- if (func(id, what, buffer.size(), buffer.data(), NULL) == CL_SUCCESS) { \
- value = string(buffer.data()); \
- } \
- } \
- if (is_optional && !(length != 0 && value[0] != '\0')) { \
- break; \
- } \
- result += string_printf("%s: %s\n", name, value.c_str()); \
- } while (false)
-# define APPEND_PLATFORM_STRING_INFO(id, name, what) \
- APPEND_STRING_INFO_IMPL(clGetPlatformInfo, id, "\tPlatform " name, what, false)
-# define APPEND_STRING_EXTENSION_INFO(func, id, name, what) \
- APPEND_STRING_INFO_IMPL(clGetPlatformInfo, id, "\tPlatform " name, what, true)
-# define APPEND_PLATFORM_INFO(id, name, what, type) \
- APPEND_INFO(clGetPlatformInfo, id, "\tPlatform " name, what, type)
-# define APPEND_DEVICE_INFO(id, name, what, type) \
- APPEND_INFO(clGetDeviceInfo, id, "\t\t\tDevice " name, what, type)
-# define APPEND_DEVICE_STRING_INFO(id, name, what) \
- APPEND_STRING_INFO_IMPL(clGetDeviceInfo, id, "\t\t\tDevice " name, what, false)
-# define APPEND_DEVICE_STRING_EXTENSION_INFO(id, name, what) \
- APPEND_STRING_INFO_IMPL(clGetDeviceInfo, id, "\t\t\tDevice " name, what, true)
-
- vector<cl_device_id> device_ids;
- for (cl_uint platform = 0; platform < num_platforms; ++platform) {
- cl_platform_id platform_id = platform_ids[platform];
-
- result += string_printf("Platform #%u\n", platform);
-
- APPEND_PLATFORM_STRING_INFO(platform_id, "Name", CL_PLATFORM_NAME);
- APPEND_PLATFORM_STRING_INFO(platform_id, "Vendor", CL_PLATFORM_VENDOR);
- APPEND_PLATFORM_STRING_INFO(platform_id, "Version", CL_PLATFORM_VERSION);
- APPEND_PLATFORM_STRING_INFO(platform_id, "Profile", CL_PLATFORM_PROFILE);
- APPEND_PLATFORM_STRING_INFO(platform_id, "Extensions", CL_PLATFORM_EXTENSIONS);
-
- cl_uint num_devices = 0;
- opencl_assert(
- clGetDeviceIDs(platform_ids[platform], CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices));
- result += string_printf("\tNumber of devices: %u\n", num_devices);
-
- device_ids.resize(num_devices);
- opencl_assert(clGetDeviceIDs(
- platform_ids[platform], CL_DEVICE_TYPE_ALL, num_devices, &device_ids[0], NULL));
- for (cl_uint device = 0; device < num_devices; ++device) {
- cl_device_id device_id = device_ids[device];
-
- result += string_printf("\t\tDevice: #%u\n", device);
-
- APPEND_DEVICE_STRING_INFO(device_id, "Name", CL_DEVICE_NAME);
- APPEND_DEVICE_STRING_EXTENSION_INFO(device_id, "Board Name", CL_DEVICE_BOARD_NAME_AMD);
- APPEND_DEVICE_STRING_INFO(device_id, "Vendor", CL_DEVICE_VENDOR);
- APPEND_DEVICE_STRING_INFO(device_id, "OpenCL C Version", CL_DEVICE_OPENCL_C_VERSION);
- APPEND_DEVICE_STRING_INFO(device_id, "Profile", CL_DEVICE_PROFILE);
- APPEND_DEVICE_STRING_INFO(device_id, "Version", CL_DEVICE_VERSION);
- APPEND_DEVICE_STRING_INFO(device_id, "Extensions", CL_DEVICE_EXTENSIONS);
- APPEND_DEVICE_INFO(
- device_id, "Max clock frequency (MHz)", CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint);
- APPEND_DEVICE_INFO(device_id, "Max compute units", CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint);
- APPEND_DEVICE_INFO(device_id, "Max work group size", CL_DEVICE_MAX_WORK_GROUP_SIZE, size_t);
- }
- }
-
-# undef APPEND_INFO
-# undef APPEND_STRING_INFO_IMPL
-# undef APPEND_PLATFORM_STRING_INFO
-# undef APPEND_STRING_EXTENSION_INFO
-# undef APPEND_PLATFORM_INFO
-# undef APPEND_DEVICE_INFO
-# undef APPEND_DEVICE_STRING_INFO
-# undef APPEND_DEVICE_STRING_EXTENSION_INFO
-
- return result;
-}
-
-CCL_NAMESPACE_END
-
-#endif /* WITH_OPENCL */
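The device enumeration above derives a persistent ID from the platform name, device name and hardware ID, appending a numeric suffix only when that combination would collide. A standalone sketch of that scheme follows; the make_unique_id helper is hypothetical and not part of the Cycles API:

#include <cstdio>
#include <set>
#include <string>

/* Build a user-preference ID the same way device_opencl_info() does above. */
static std::string make_unique_id(const std::string &platform_name,
                                  const std::string &device_name,
                                  std::string hardware_id,
                                  int device_num,
                                  std::set<std::string> &unique_ids)
{
  char buf[32];
  /* Fall back to the enumeration index when no hardware ID is reported. */
  if (hardware_id.empty()) {
    snprintf(buf, sizeof(buf), "ID_%d", device_num);
    hardware_id = buf;
  }
  std::string id = "OPENCL_" + platform_name + "_" + device_name + "_" + hardware_id;
  /* Hardware IDs are not guaranteed unique, so disambiguate on collision. */
  if (unique_ids.count(id)) {
    snprintf(buf, sizeof(buf), "_ID_%d", device_num);
    id += buf;
  }
  unique_ids.insert(id);
  return id;
}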
diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp
deleted file mode 100644
index 6f9a7943722..00000000000
--- a/intern/cycles/device/device_optix.cpp
+++ /dev/null
@@ -1,1936 +0,0 @@
-/*
- * Copyright 2019, NVIDIA Corporation.
- * Copyright 2019, Blender Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_OPTIX
-
-# include "bvh/bvh.h"
-# include "bvh/bvh_optix.h"
-# include "device/cuda/device_cuda.h"
-# include "device/device_denoising.h"
-# include "device/device_intern.h"
-# include "render/buffers.h"
-# include "render/hair.h"
-# include "render/mesh.h"
-# include "render/object.h"
-# include "render/scene.h"
-# include "util/util_debug.h"
-# include "util/util_logging.h"
-# include "util/util_md5.h"
-# include "util/util_path.h"
-# include "util/util_progress.h"
-# include "util/util_time.h"
-
-# ifdef WITH_CUDA_DYNLOAD
-# include <cuew.h>
-// Do not use CUDA SDK headers when using CUEW
-# define OPTIX_DONT_INCLUDE_CUDA
-# endif
-# include <optix_function_table_definition.h>
-# include <optix_stubs.h>
-
-// TODO(pmours): Disable this once drivers have native support
-# define OPTIX_DENOISER_NO_PIXEL_STRIDE 1
-
-CCL_NAMESPACE_BEGIN
-
-/* Make sure this stays in sync with kernel_globals.h */
-struct ShaderParams {
- uint4 *input;
- float4 *output;
- int type;
- int filter;
- int sx;
- int offset;
- int sample;
-};
-struct KernelParams {
- WorkTile tile;
- KernelData data;
- ShaderParams shader;
-# define KERNEL_TEX(type, name) const type *name;
-# include "kernel/kernel_textures.h"
-# undef KERNEL_TEX
-};
-
-# define check_result_cuda(stmt) \
- { \
- CUresult res = stmt; \
- if (res != CUDA_SUCCESS) { \
- const char *name; \
- cuGetErrorName(res, &name); \
- set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
- return; \
- } \
- } \
- (void)0
-# define check_result_cuda_ret(stmt) \
- { \
- CUresult res = stmt; \
- if (res != CUDA_SUCCESS) { \
- const char *name; \
- cuGetErrorName(res, &name); \
- set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
- return false; \
- } \
- } \
- (void)0
-
-# define check_result_optix(stmt) \
- { \
- enum OptixResult res = stmt; \
- if (res != OPTIX_SUCCESS) { \
- const char *name = optixGetErrorName(res); \
- set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
- return; \
- } \
- } \
- (void)0
-# define check_result_optix_ret(stmt) \
- { \
- enum OptixResult res = stmt; \
- if (res != OPTIX_SUCCESS) { \
- const char *name = optixGetErrorName(res); \
- set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
- return false; \
- } \
- } \
- (void)0
-
-# define launch_filter_kernel(func_name, w, h, args) \
- { \
- CUfunction func; \
- check_result_cuda_ret(cuModuleGetFunction(&func, cuFilterModule, func_name)); \
- check_result_cuda_ret(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1)); \
- int threads; \
- check_result_cuda_ret( \
- cuFuncGetAttribute(&threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
- threads = (int)sqrt((float)threads); \
- int xblocks = ((w) + threads - 1) / threads; \
- int yblocks = ((h) + threads - 1) / threads; \
- check_result_cuda_ret( \
- cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0)); \
- } \
- (void)0
-
-class OptiXDevice : public CUDADevice {
-
- // List of OptiX program groups
- enum {
- PG_RGEN,
- PG_MISS,
- PG_HITD, // Default hit group
- PG_HITS, // __SHADOW_RECORD_ALL__ hit group
- PG_HITL, // __BVH_LOCAL__ hit group (only used for triangles)
-# if OPTIX_ABI_VERSION >= 36
- PG_HITD_MOTION,
- PG_HITS_MOTION,
-# endif
- PG_BAKE, // kernel_bake_evaluate
- PG_DISP, // kernel_displace_evaluate
- PG_BACK, // kernel_background_evaluate
- PG_CALL,
- NUM_PROGRAM_GROUPS = PG_CALL + 3
- };
-
- // List of OptiX pipelines
- enum { PIP_PATH_TRACE, PIP_SHADER_EVAL, NUM_PIPELINES };
-
- // A single shader binding table entry
- struct SbtRecord {
- char header[OPTIX_SBT_RECORD_HEADER_SIZE];
- };
-
- // Information stored about CUDA memory allocations
- struct CUDAMem {
- bool free_map_host = false;
- CUarray array = NULL;
- CUtexObject texobject = 0;
- bool use_mapped_host = false;
- };
-
- // Helper class to manage current CUDA context
- struct CUDAContextScope {
- CUDAContextScope(CUcontext ctx)
- {
- cuCtxPushCurrent(ctx);
- }
- ~CUDAContextScope()
- {
- cuCtxPopCurrent(NULL);
- }
- };
-
- // Use a pool with multiple threads to support launches with multiple CUDA streams
- TaskPool task_pool;
-
- vector<CUstream> cuda_stream;
- OptixDeviceContext context = NULL;
-
- OptixModule optix_module = NULL; // All necessary OptiX kernels are in one module
- OptixModule builtin_modules[2] = {};
- OptixPipeline pipelines[NUM_PIPELINES] = {};
-
- bool motion_blur = false;
- device_vector<SbtRecord> sbt_data;
- device_only_memory<KernelParams> launch_params;
- OptixTraversableHandle tlas_handle = 0;
-
- OptixDenoiser denoiser = NULL;
- device_only_memory<unsigned char> denoiser_state;
- int denoiser_input_passes = 0;
-
- vector<device_only_memory<char>> delayed_free_bvh_memory;
- thread_mutex delayed_free_bvh_mutex;
-
- public:
- OptiXDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
- : CUDADevice(info_, stats_, profiler_, background_),
- sbt_data(this, "__sbt", MEM_READ_ONLY),
- launch_params(this, "__params", false),
- denoiser_state(this, "__denoiser_state", true)
- {
- // Store number of CUDA streams in device info
- info.cpu_threads = DebugFlags().optix.cuda_streams;
-
- // Make the CUDA context current
- if (!cuContext) {
- return; // Do not initialize if CUDA context creation failed already
- }
- const CUDAContextScope scope(cuContext);
-
- // Create OptiX context for this device
- OptixDeviceContextOptions options = {};
-# ifdef WITH_CYCLES_LOGGING
- options.logCallbackLevel = 4; // Fatal = 1, Error = 2, Warning = 3, Print = 4
- options.logCallbackFunction =
- [](unsigned int level, const char *, const char *message, void *) {
- switch (level) {
- case 1:
- LOG_IF(FATAL, VLOG_IS_ON(1)) << message;
- break;
- case 2:
- LOG_IF(ERROR, VLOG_IS_ON(1)) << message;
- break;
- case 3:
- LOG_IF(WARNING, VLOG_IS_ON(1)) << message;
- break;
- case 4:
- LOG_IF(INFO, VLOG_IS_ON(1)) << message;
- break;
- }
- };
-# endif
- check_result_optix(optixDeviceContextCreate(cuContext, &options, &context));
-# ifdef WITH_CYCLES_LOGGING
- check_result_optix(optixDeviceContextSetLogCallback(
- context, options.logCallbackFunction, options.logCallbackData, options.logCallbackLevel));
-# endif
-
- // Create launch streams
- cuda_stream.resize(info.cpu_threads);
- for (int i = 0; i < info.cpu_threads; ++i)
- check_result_cuda(cuStreamCreate(&cuda_stream[i], CU_STREAM_NON_BLOCKING));
-
- // Fix weird compiler bug that assigns wrong size
- launch_params.data_elements = sizeof(KernelParams);
- // Allocate launch parameter buffer memory on device
- launch_params.alloc_to_device(info.cpu_threads);
- }
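-
- // Illustrative note: launch_params holds one KernelParams block per CUDA stream, so a launch
- // running on stream 'thread_index' addresses its block at
- //   launch_params.device_pointer + thread_index * launch_params.data_elements
- // (see launch_render() and launch_shader_eval() below).
-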
- ~OptiXDevice()
- {
- // Stop processing any more tasks
- task_pool.cancel();
-
- // Make CUDA context current
- const CUDAContextScope scope(cuContext);
-
- free_bvh_memory_delayed();
-
- sbt_data.free();
- texture_info.free();
- launch_params.free();
- denoiser_state.free();
-
- // Unload modules
- if (optix_module != NULL)
- optixModuleDestroy(optix_module);
- for (unsigned int i = 0; i < 2; ++i)
- if (builtin_modules[i] != NULL)
- optixModuleDestroy(builtin_modules[i]);
- for (unsigned int i = 0; i < NUM_PIPELINES; ++i)
- if (pipelines[i] != NULL)
- optixPipelineDestroy(pipelines[i]);
-
- // Destroy launch streams
- for (CUstream stream : cuda_stream)
- cuStreamDestroy(stream);
-
- if (denoiser != NULL)
- optixDenoiserDestroy(denoiser);
-
- optixDeviceContextDestroy(context);
- }
-
- private:
- bool show_samples() const override
- {
- // Only show samples if not rendering multiple tiles in parallel
- return info.cpu_threads == 1;
- }
-
- BVHLayoutMask get_bvh_layout_mask() const override
- {
- // CUDA kernels are used when doing baking, so a BVH that those kernels can traverse must be built too
- if (optix_module == NULL)
- return CUDADevice::get_bvh_layout_mask();
-
- // OptiX has its own internal acceleration structure format
- return BVH_LAYOUT_OPTIX;
- }
-
- string compile_kernel_get_common_cflags(const DeviceRequestedFeatures &requested_features,
- bool filter,
- bool /*split*/) override
- {
- // Split kernel is not supported in OptiX
- string common_cflags = CUDADevice::compile_kernel_get_common_cflags(
- requested_features, filter, false);
-
- // Add OptiX SDK include directory to include paths
- const char *optix_sdk_path = getenv("OPTIX_ROOT_DIR");
- if (optix_sdk_path) {
- common_cflags += string_printf(" -I\"%s/include\"", optix_sdk_path);
- }
-
- // Specialization for shader raytracing
- if (requested_features.use_shader_raytrace) {
- common_cflags += " --keep-device-functions";
- }
- else {
- common_cflags += " -D __NO_SHADER_RAYTRACE__";
- }
-
- return common_cflags;
- }
-
- bool load_kernels(const DeviceRequestedFeatures &requested_features) override
- {
- if (have_error()) {
- // Abort early if context creation failed already
- return false;
- }
-
- // Load CUDA modules because we need some of the utility kernels
- if (!CUDADevice::load_kernels(requested_features)) {
- return false;
- }
-
- // Baking is currently performed using CUDA, so no need to load OptiX kernels
- if (requested_features.use_baking) {
- return true;
- }
-
- const CUDAContextScope scope(cuContext);
-
- // Unload existing OptiX module and pipelines first
- if (optix_module != NULL) {
- optixModuleDestroy(optix_module);
- optix_module = NULL;
- }
- for (unsigned int i = 0; i < 2; ++i) {
- if (builtin_modules[i] != NULL) {
- optixModuleDestroy(builtin_modules[i]);
- builtin_modules[i] = NULL;
- }
- }
- for (unsigned int i = 0; i < NUM_PIPELINES; ++i) {
- if (pipelines[i] != NULL) {
- optixPipelineDestroy(pipelines[i]);
- pipelines[i] = NULL;
- }
- }
-
- OptixModuleCompileOptions module_options = {};
- module_options.maxRegisterCount = 0; // Do not set an explicit register limit
- module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3;
- module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
-
-# if OPTIX_ABI_VERSION >= 41
- module_options.boundValues = nullptr;
- module_options.numBoundValues = 0;
-# endif
-
- OptixPipelineCompileOptions pipeline_options = {};
- // Default to no motion blur and two-level graph, since it is the fastest option
- pipeline_options.usesMotionBlur = false;
- pipeline_options.traversableGraphFlags =
- OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING;
- pipeline_options.numPayloadValues = 6;
- pipeline_options.numAttributeValues = 2; // u, v
- pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE;
- pipeline_options.pipelineLaunchParamsVariableName = "__params"; // See kernel_globals.h
-
-# if OPTIX_ABI_VERSION >= 36
- pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE;
- if (requested_features.use_hair) {
- if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) {
- pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE;
- }
- else {
- pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM;
- }
- }
-# endif
-
- // Keep track of whether motion blur is enabled, so that motion can be enabled/disabled in BVH builds
- // This is necessary since objects may be reported to have motion if the Vector pass is
- // active, but may still need to be rendered without motion blur when motion blur itself is off
- motion_blur = requested_features.use_object_motion;
-
- if (motion_blur) {
- pipeline_options.usesMotionBlur = true;
- // Motion blur can insert motion transforms into the traversal graph
- // The graph is then no longer two-level, so the flags must allow any configuration
- pipeline_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY;
- }
-
- { // Load and compile PTX module with OptiX kernels
- string ptx_data, ptx_filename = path_get(requested_features.use_shader_raytrace ?
- "lib/kernel_optix_shader_raytrace.ptx" :
- "lib/kernel_optix.ptx");
- if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
- if (!getenv("OPTIX_ROOT_DIR")) {
- set_error(
- "Missing OPTIX_ROOT_DIR environment variable (which must be set with the path to "
- "the Optix SDK to be able to compile Optix kernels on demand).");
- return false;
- }
- ptx_filename = compile_kernel(requested_features, "kernel_optix", "optix", true);
- }
- if (ptx_filename.empty() || !path_read_text(ptx_filename, ptx_data)) {
- set_error("Failed to load OptiX kernel from '" + ptx_filename + "'");
- return false;
- }
-
- check_result_optix_ret(optixModuleCreateFromPTX(context,
- &module_options,
- &pipeline_options,
- ptx_data.data(),
- ptx_data.size(),
- nullptr,
- 0,
- &optix_module));
- }
-
- // Create program groups
- OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {};
- OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {};
- OptixProgramGroupOptions group_options = {}; // There are no options currently
- group_descs[PG_RGEN].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
- group_descs[PG_RGEN].raygen.module = optix_module;
- // Ignore branched integrator for now (see "requested_features.use_integrator_branched")
- group_descs[PG_RGEN].raygen.entryFunctionName = "__raygen__kernel_optix_path_trace";
- group_descs[PG_MISS].kind = OPTIX_PROGRAM_GROUP_KIND_MISS;
- group_descs[PG_MISS].miss.module = optix_module;
- group_descs[PG_MISS].miss.entryFunctionName = "__miss__kernel_optix_miss";
- group_descs[PG_HITD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
- group_descs[PG_HITD].hitgroup.moduleCH = optix_module;
- group_descs[PG_HITD].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit";
- group_descs[PG_HITD].hitgroup.moduleAH = optix_module;
- group_descs[PG_HITD].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_visibility_test";
- group_descs[PG_HITS].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
- group_descs[PG_HITS].hitgroup.moduleAH = optix_module;
- group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit";
-
- if (requested_features.use_hair) {
- group_descs[PG_HITD].hitgroup.moduleIS = optix_module;
- group_descs[PG_HITS].hitgroup.moduleIS = optix_module;
-
- // Add curve intersection programs
- if (requested_features.use_hair_thick) {
- // Slower programs for thick hair since that also slows down ribbons.
- // Ideally this should not be needed.
- group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_all";
- group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_all";
- }
- else {
- group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
- group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
- }
-
-# if OPTIX_ABI_VERSION >= 36
- if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) {
- OptixBuiltinISOptions builtin_options = {};
- builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
- builtin_options.usesMotionBlur = false;
-
- check_result_optix_ret(optixBuiltinISModuleGet(
- context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[0]));
-
- group_descs[PG_HITD].hitgroup.moduleIS = builtin_modules[0];
- group_descs[PG_HITD].hitgroup.entryFunctionNameIS = nullptr;
- group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0];
- group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr;
-
- if (motion_blur) {
- builtin_options.usesMotionBlur = true;
-
- check_result_optix_ret(optixBuiltinISModuleGet(
- context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[1]));
-
- group_descs[PG_HITD_MOTION] = group_descs[PG_HITD];
- group_descs[PG_HITD_MOTION].hitgroup.moduleIS = builtin_modules[1];
- group_descs[PG_HITS_MOTION] = group_descs[PG_HITS];
- group_descs[PG_HITS_MOTION].hitgroup.moduleIS = builtin_modules[1];
- }
- }
-# endif
- }
-
- if (requested_features.use_subsurface || requested_features.use_shader_raytrace) {
- // Add hit group for local intersections
- group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
- group_descs[PG_HITL].hitgroup.moduleAH = optix_module;
- group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit";
- }
-
- if (requested_features.use_baking) {
- group_descs[PG_BAKE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
- group_descs[PG_BAKE].raygen.module = optix_module;
- group_descs[PG_BAKE].raygen.entryFunctionName = "__raygen__kernel_optix_bake";
- }
-
- if (requested_features.use_true_displacement) {
- group_descs[PG_DISP].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
- group_descs[PG_DISP].raygen.module = optix_module;
- group_descs[PG_DISP].raygen.entryFunctionName = "__raygen__kernel_optix_displace";
- }
-
- if (requested_features.use_background_light) {
- group_descs[PG_BACK].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
- group_descs[PG_BACK].raygen.module = optix_module;
- group_descs[PG_BACK].raygen.entryFunctionName = "__raygen__kernel_optix_background";
- }
-
- // Shader raytracing replaces some functions with direct callables
- if (requested_features.use_shader_raytrace) {
- group_descs[PG_CALL + 0].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
- group_descs[PG_CALL + 0].callables.moduleDC = optix_module;
- group_descs[PG_CALL + 0].callables.entryFunctionNameDC = "__direct_callable__svm_eval_nodes";
- group_descs[PG_CALL + 1].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
- group_descs[PG_CALL + 1].callables.moduleDC = optix_module;
- group_descs[PG_CALL + 1].callables.entryFunctionNameDC =
- "__direct_callable__kernel_volume_shadow";
- group_descs[PG_CALL + 2].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
- group_descs[PG_CALL + 2].callables.moduleDC = optix_module;
- group_descs[PG_CALL + 2].callables.entryFunctionNameDC =
- "__direct_callable__subsurface_scatter_multi_setup";
- }
-
- check_result_optix_ret(optixProgramGroupCreate(
- context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups));
-
- // Get program stack sizes
- OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {};
- // Set up SBT, which in this case is used only to select between different programs
- sbt_data.alloc(NUM_PROGRAM_GROUPS);
- memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS);
- for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
- check_result_optix_ret(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
- check_result_optix_ret(optixProgramGroupGetStackSize(groups[i], &stack_size[i]));
- }
- sbt_data.copy_to_device(); // Upload SBT to device
-
- // Calculate maximum trace continuation stack size
- unsigned int trace_css = stack_size[PG_HITD].cssCH;
- // This is based on the maximum of closest-hit and any-hit/intersection programs
- trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH);
- trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH);
- trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH);
-# if OPTIX_ABI_VERSION >= 36
- trace_css = std::max(trace_css,
- stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH);
- trace_css = std::max(trace_css,
- stack_size[PG_HITS_MOTION].cssIS + stack_size[PG_HITS_MOTION].cssAH);
-# endif
-
- OptixPipelineLinkOptions link_options = {};
- link_options.maxTraceDepth = 1;
- link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
-# if OPTIX_ABI_VERSION < 24
- link_options.overrideUsesMotionBlur = motion_blur;
-# endif
-
- { // Create path tracing pipeline
- vector<OptixProgramGroup> pipeline_groups;
- pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
- pipeline_groups.push_back(groups[PG_RGEN]);
- pipeline_groups.push_back(groups[PG_MISS]);
- pipeline_groups.push_back(groups[PG_HITD]);
- pipeline_groups.push_back(groups[PG_HITS]);
- pipeline_groups.push_back(groups[PG_HITL]);
-# if OPTIX_ABI_VERSION >= 36
- if (motion_blur) {
- pipeline_groups.push_back(groups[PG_HITD_MOTION]);
- pipeline_groups.push_back(groups[PG_HITS_MOTION]);
- }
-# endif
- if (requested_features.use_shader_raytrace) {
- pipeline_groups.push_back(groups[PG_CALL + 0]);
- pipeline_groups.push_back(groups[PG_CALL + 1]);
- pipeline_groups.push_back(groups[PG_CALL + 2]);
- }
-
- check_result_optix_ret(optixPipelineCreate(context,
- &pipeline_options,
- &link_options,
- pipeline_groups.data(),
- pipeline_groups.size(),
- nullptr,
- 0,
- &pipelines[PIP_PATH_TRACE]));
-
- // Combine ray generation and trace continuation stack size
- const unsigned int css = stack_size[PG_RGEN].cssRG + link_options.maxTraceDepth * trace_css;
- // Max direct callable depth is one of the following, so combine accordingly
- // - __raygen__ -> svm_eval_nodes
- // - __raygen__ -> kernel_volume_shadow -> svm_eval_nodes
- // - __raygen__ -> subsurface_scatter_multi_setup -> svm_eval_nodes
- const unsigned int dss = stack_size[PG_CALL + 0].dssDC +
- std::max(stack_size[PG_CALL + 1].dssDC,
- stack_size[PG_CALL + 2].dssDC);
-
- // Set stack size depending on pipeline options
- check_result_optix_ret(
- optixPipelineSetStackSize(pipelines[PIP_PATH_TRACE],
- 0,
- requested_features.use_shader_raytrace ? dss : 0,
- css,
- motion_blur ? 3 : 2));
- }
-
- // Only need to create shader evaluation pipeline if one of these features is used:
- const bool use_shader_eval_pipeline = requested_features.use_baking ||
- requested_features.use_background_light ||
- requested_features.use_true_displacement;
-
- if (use_shader_eval_pipeline) { // Create shader evaluation pipeline
- vector<OptixProgramGroup> pipeline_groups;
- pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
- pipeline_groups.push_back(groups[PG_BAKE]);
- pipeline_groups.push_back(groups[PG_DISP]);
- pipeline_groups.push_back(groups[PG_BACK]);
- pipeline_groups.push_back(groups[PG_MISS]);
- pipeline_groups.push_back(groups[PG_HITD]);
- pipeline_groups.push_back(groups[PG_HITS]);
- pipeline_groups.push_back(groups[PG_HITL]);
-# if OPTIX_ABI_VERSION >= 36
- if (motion_blur) {
- pipeline_groups.push_back(groups[PG_HITD_MOTION]);
- pipeline_groups.push_back(groups[PG_HITS_MOTION]);
- }
-# endif
- if (requested_features.use_shader_raytrace) {
- pipeline_groups.push_back(groups[PG_CALL + 0]);
- pipeline_groups.push_back(groups[PG_CALL + 1]);
- pipeline_groups.push_back(groups[PG_CALL + 2]);
- }
-
- check_result_optix_ret(optixPipelineCreate(context,
- &pipeline_options,
- &link_options,
- pipeline_groups.data(),
- pipeline_groups.size(),
- nullptr,
- 0,
- &pipelines[PIP_SHADER_EVAL]));
-
- // Calculate continuation stack size based on the maximum of all ray generation stack sizes
- const unsigned int css = std::max(stack_size[PG_BAKE].cssRG,
- std::max(stack_size[PG_DISP].cssRG,
- stack_size[PG_BACK].cssRG)) +
- link_options.maxTraceDepth * trace_css;
- const unsigned int dss = stack_size[PG_CALL + 0].dssDC +
- std::max(stack_size[PG_CALL + 1].dssDC,
- stack_size[PG_CALL + 2].dssDC);
-
- check_result_optix_ret(
- optixPipelineSetStackSize(pipelines[PIP_SHADER_EVAL],
- 0,
- requested_features.use_shader_raytrace ? dss : 0,
- css,
- motion_blur ? 3 : 2));
- }
-
- // Clean up program group objects
- for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
- optixProgramGroupDestroy(groups[i]);
- }
-
- return true;
- }
-
- void thread_run(DeviceTask &task, int thread_index) // Main task entry point
- {
- if (have_error())
- return; // Abort early if there was an error previously
-
- if (task.type == DeviceTask::RENDER) {
- if (thread_index != 0) {
- // Only execute denoising in a single thread (see also 'task_add')
- task.tile_types &= ~RenderTile::DENOISE;
- }
-
- RenderTile tile;
- while (task.acquire_tile(this, tile, task.tile_types)) {
- if (tile.task == RenderTile::PATH_TRACE)
- launch_render(task, tile, thread_index);
- else if (tile.task == RenderTile::BAKE) {
- // Perform baking using CUDA, since it is not currently implemented in OptiX
- device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
- CUDADevice::render(task, tile, work_tiles);
- }
- else if (tile.task == RenderTile::DENOISE)
- launch_denoise(task, tile);
- task.release_tile(tile);
- if (task.get_cancel() && !task.need_finish_queue)
- break; // User requested cancellation
- else if (have_error())
- break; // Abort rendering when encountering an error
- }
- }
- else if (task.type == DeviceTask::SHADER) {
- // CUDA kernels are used when doing baking
- if (optix_module == NULL)
- CUDADevice::shader(task);
- else
- launch_shader_eval(task, thread_index);
- }
- else if (task.type == DeviceTask::DENOISE_BUFFER) {
- // Set up a single tile that covers the whole task and denoise it
- RenderTile tile;
- tile.x = task.x;
- tile.y = task.y;
- tile.w = task.w;
- tile.h = task.h;
- tile.buffer = task.buffer;
- tile.num_samples = task.num_samples;
- tile.start_sample = task.sample;
- tile.offset = task.offset;
- tile.stride = task.stride;
- tile.buffers = task.buffers;
-
- launch_denoise(task, tile);
- }
- }
-
- void launch_render(DeviceTask &task, RenderTile &rtile, int thread_index)
- {
- assert(thread_index < launch_params.data_size);
-
- // Keep track of total render time of this tile
- const scoped_timer timer(&rtile.buffers->render_time);
-
- WorkTile wtile;
- wtile.x = rtile.x;
- wtile.y = rtile.y;
- wtile.w = rtile.w;
- wtile.h = rtile.h;
- wtile.offset = rtile.offset;
- wtile.stride = rtile.stride;
- wtile.buffer = (float *)rtile.buffer;
-
- const int end_sample = rtile.start_sample + rtile.num_samples;
- // Keep this number reasonable to avoid running into TDRs (GPU driver timeouts)
- int step_samples = (info.display_device ? 8 : 32);
-
- // Offset into launch params buffer so that streams use separate data
- device_ptr launch_params_ptr = launch_params.device_pointer +
- thread_index * launch_params.data_elements;
-
- const CUDAContextScope scope(cuContext);
-
- for (int sample = rtile.start_sample; sample < end_sample;) {
- // Copy work tile information to device
- wtile.start_sample = sample;
- wtile.num_samples = step_samples;
- if (task.adaptive_sampling.use) {
- wtile.num_samples = task.adaptive_sampling.align_samples(sample, step_samples);
- }
- wtile.num_samples = min(wtile.num_samples, end_sample - sample);
- device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile);
- check_result_cuda(
- cuMemcpyHtoDAsync(d_wtile_ptr, &wtile, sizeof(wtile), cuda_stream[thread_index]));
-
- OptixShaderBindingTable sbt_params = {};
- sbt_params.raygenRecord = sbt_data.device_pointer + PG_RGEN * sizeof(SbtRecord);
- sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord);
- sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
- sbt_params.missRecordCount = 1;
- sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord);
- sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
-# if OPTIX_ABI_VERSION >= 36
- sbt_params.hitgroupRecordCount = 5; // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL
-# else
- sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITS, PG_HITL
-# endif
- sbt_params.callablesRecordBase = sbt_data.device_pointer + PG_CALL * sizeof(SbtRecord);
- sbt_params.callablesRecordCount = 3;
- sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord);
-
- // Launch the ray generation program
- check_result_optix(optixLaunch(pipelines[PIP_PATH_TRACE],
- cuda_stream[thread_index],
- launch_params_ptr,
- launch_params.data_elements,
- &sbt_params,
- // Launch with samples close to each other for better locality
- wtile.w * wtile.num_samples,
- wtile.h,
- 1));
-
- // Run the adaptive sampling kernels at selected samples aligned to step samples.
- uint filter_sample = wtile.start_sample + wtile.num_samples - 1;
- if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
- adaptive_sampling_filter(filter_sample, &wtile, d_wtile_ptr, cuda_stream[thread_index]);
- }
-
- // Wait for launch to finish
- check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
-
- // Update current sample, so it is displayed correctly
- sample += wtile.num_samples;
- rtile.sample = sample;
- // Update task progress after the kernel completed rendering
- task.update_progress(&rtile, wtile.w * wtile.h * wtile.num_samples);
-
- if (task.get_cancel() && !task.need_finish_queue)
- return; // Cancel rendering
- }
-
- // Finalize adaptive sampling
- if (task.adaptive_sampling.use) {
- device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile);
- adaptive_sampling_post(rtile, &wtile, d_wtile_ptr, cuda_stream[thread_index]);
- check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
- task.update_progress(&rtile, rtile.w * rtile.h * wtile.num_samples);
- }
- }
-
- bool launch_denoise(DeviceTask &task, RenderTile &rtile)
- {
- // Update current sample (for display and NLM denoising task)
- rtile.sample = rtile.start_sample + rtile.num_samples;
-
- // Make CUDA context current now, since it is used for both denoising tasks
- const CUDAContextScope scope(cuContext);
-
- // Choose between OptiX and NLM denoising
- if (task.denoising.type == DENOISER_OPTIX) {
- // Map neighboring tiles onto this device. Indices are laid out as follows,
- // where index 4 is the center tile and index 9 is the target for the result:
- // 0 1 2
- // 3 4 5
- // 6 7 8 9
- RenderTileNeighbors neighbors(rtile);
- task.map_neighbor_tiles(neighbors, this);
- RenderTile &center_tile = neighbors.tiles[RenderTileNeighbors::CENTER];
- RenderTile &target_tile = neighbors.target;
- rtile = center_tile; // Tile may have been modified by mapping code
-
- // Calculate size of the tile to denoise (including overlap)
- int4 rect = center_tile.bounds();
- // Overlap between tiles has to be at least 64 pixels
- // TODO(pmours): Query this value from OptiX
- rect = rect_expand(rect, 64);
- int4 clip_rect = neighbors.bounds();
- rect = rect_clip(rect, clip_rect);
- int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y);
- int2 overlap_offset = make_int2(rtile.x - rect.x, rtile.y - rect.y);
-
- // Calculate byte offsets and strides
- int pixel_stride = task.pass_stride * (int)sizeof(float);
- int pixel_offset = (rtile.offset + rtile.x + rtile.y * rtile.stride) * pixel_stride;
- const int pass_offset[3] = {
- (task.pass_denoising_data + DENOISING_PASS_COLOR) * (int)sizeof(float),
- (task.pass_denoising_data + DENOISING_PASS_ALBEDO) * (int)sizeof(float),
- (task.pass_denoising_data + DENOISING_PASS_NORMAL) * (int)sizeof(float)};
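-
- // Illustrative arithmetic: with task.pass_stride floats per pixel, the byte address of a
- // denoising input pass for pixel (x, y) works out to
- //   buffer + ((offset + x + y * stride) * pass_stride + pass_denoising_data + PASS) * sizeof(float)
- // which is what pixel_offset and pass_offset[] above decompose into.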
-
- // Start with the current tile pointer offset
- int input_stride = pixel_stride;
- device_ptr input_ptr = rtile.buffer + pixel_offset;
-
- // Copy tile data into a common buffer if necessary
- device_only_memory<float> input(this, "denoiser input", true);
- device_vector<TileInfo> tile_info_mem(this, "denoiser tile info", MEM_READ_ONLY);
-
- bool contiguous_memory = true;
- for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
- if (neighbors.tiles[i].buffer && neighbors.tiles[i].buffer != rtile.buffer) {
- contiguous_memory = false;
- }
- }
-
- if (contiguous_memory) {
- // Tiles are in contiguous memory, so the overlap offset can simply be subtracted
- input_ptr -= (overlap_offset.x + overlap_offset.y * rtile.stride) * pixel_stride;
- // Stride covers the whole width of the image and not just a single tile
- input_stride *= rtile.stride;
- }
- else {
- // Adjacent tiles are in separate memory regions, so they need to be copied into a single one
- input.alloc_to_device(rect_size.x * rect_size.y * task.pass_stride);
- // Start with the new input buffer
- input_ptr = input.device_pointer;
- // Stride covers the width of the new input buffer, which includes tile width and overlap
- input_stride *= rect_size.x;
-
- TileInfo *tile_info = tile_info_mem.alloc(1);
- for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
- tile_info->offsets[i] = neighbors.tiles[i].offset;
- tile_info->strides[i] = neighbors.tiles[i].stride;
- tile_info->buffers[i] = neighbors.tiles[i].buffer;
- }
- tile_info->x[0] = neighbors.tiles[3].x;
- tile_info->x[1] = neighbors.tiles[4].x;
- tile_info->x[2] = neighbors.tiles[5].x;
- tile_info->x[3] = neighbors.tiles[5].x + neighbors.tiles[5].w;
- tile_info->y[0] = neighbors.tiles[1].y;
- tile_info->y[1] = neighbors.tiles[4].y;
- tile_info->y[2] = neighbors.tiles[7].y;
- tile_info->y[3] = neighbors.tiles[7].y + neighbors.tiles[7].h;
- tile_info_mem.copy_to_device();
-
- void *args[] = {
- &input.device_pointer, &tile_info_mem.device_pointer, &rect.x, &task.pass_stride};
- launch_filter_kernel("kernel_cuda_filter_copy_input", rect_size.x, rect_size.y, args);
- }
-
-# if OPTIX_DENOISER_NO_PIXEL_STRIDE
- device_only_memory<float> input_rgb(this, "denoiser input rgb", true);
- input_rgb.alloc_to_device(rect_size.x * rect_size.y * 3 * task.denoising.input_passes);
-
- void *input_args[] = {&input_rgb.device_pointer,
- &input_ptr,
- &rect_size.x,
- &rect_size.y,
- &input_stride,
- &task.pass_stride,
- const_cast<int *>(pass_offset),
- &task.denoising.input_passes,
- &rtile.sample};
- launch_filter_kernel(
- "kernel_cuda_filter_convert_to_rgb", rect_size.x, rect_size.y, input_args);
-
- input_ptr = input_rgb.device_pointer;
- pixel_stride = 3 * sizeof(float);
- input_stride = rect_size.x * pixel_stride;
-# endif
-
- const bool recreate_denoiser = (denoiser == NULL) ||
- (task.denoising.input_passes != denoiser_input_passes);
- if (recreate_denoiser) {
- // Destroy existing handle before creating new one
- if (denoiser != NULL) {
- optixDenoiserDestroy(denoiser);
- }
-
- // Create OptiX denoiser handle on demand when it is first used
- OptixDenoiserOptions denoiser_options = {};
- assert(task.denoising.input_passes >= 1 && task.denoising.input_passes <= 3);
-# if OPTIX_ABI_VERSION >= 47
- denoiser_options.guideAlbedo = task.denoising.input_passes >= 2;
- denoiser_options.guideNormal = task.denoising.input_passes >= 3;
- check_result_optix_ret(optixDenoiserCreate(
- context, OPTIX_DENOISER_MODEL_KIND_HDR, &denoiser_options, &denoiser));
-# else
- denoiser_options.inputKind = static_cast<OptixDenoiserInputKind>(
- OPTIX_DENOISER_INPUT_RGB + (task.denoising.input_passes - 1));
-# if OPTIX_ABI_VERSION < 28
- denoiser_options.pixelFormat = OPTIX_PIXEL_FORMAT_FLOAT3;
-# endif
- check_result_optix_ret(optixDenoiserCreate(context, &denoiser_options, &denoiser));
- check_result_optix_ret(
- optixDenoiserSetModel(denoiser, OPTIX_DENOISER_MODEL_KIND_HDR, NULL, 0));
-# endif
-
- // OptiX denoiser handle was created with the requested number of input passes
- denoiser_input_passes = task.denoising.input_passes;
- }
-
- OptixDenoiserSizes sizes = {};
- check_result_optix_ret(
- optixDenoiserComputeMemoryResources(denoiser, rect_size.x, rect_size.y, &sizes));
-
-# if OPTIX_ABI_VERSION < 28
- const size_t scratch_size = sizes.recommendedScratchSizeInBytes;
-# else
- const size_t scratch_size = sizes.withOverlapScratchSizeInBytes;
-# endif
- const size_t scratch_offset = sizes.stateSizeInBytes;
-
- // Allocate denoiser state if tile size has changed since last setup
- if (recreate_denoiser || (denoiser_state.data_width != rect_size.x ||
- denoiser_state.data_height != rect_size.y)) {
- denoiser_state.alloc_to_device(scratch_offset + scratch_size);
-
- // Initialize denoiser state for the current tile size
- check_result_optix_ret(optixDenoiserSetup(denoiser,
- 0,
- rect_size.x,
- rect_size.y,
- denoiser_state.device_pointer,
- scratch_offset,
- denoiser_state.device_pointer + scratch_offset,
- scratch_size));
-
- denoiser_state.data_width = rect_size.x;
- denoiser_state.data_height = rect_size.y;
- }
-
- // Set up input and output layer information
- OptixImage2D input_layers[3] = {};
- OptixImage2D output_layers[1] = {};
-
- for (int i = 0; i < 3; ++i) {
-# if OPTIX_DENOISER_NO_PIXEL_STRIDE
- input_layers[i].data = input_ptr + (rect_size.x * rect_size.y * pixel_stride * i);
-# else
- input_layers[i].data = input_ptr + pass_offset[i];
-# endif
- input_layers[i].width = rect_size.x;
- input_layers[i].height = rect_size.y;
- input_layers[i].rowStrideInBytes = input_stride;
- input_layers[i].pixelStrideInBytes = pixel_stride;
- input_layers[i].format = OPTIX_PIXEL_FORMAT_FLOAT3;
- }
-
-# if OPTIX_DENOISER_NO_PIXEL_STRIDE
- output_layers[0].data = input_ptr;
- output_layers[0].width = rect_size.x;
- output_layers[0].height = rect_size.y;
- output_layers[0].rowStrideInBytes = input_stride;
- output_layers[0].pixelStrideInBytes = pixel_stride;
- int2 output_offset = overlap_offset;
- overlap_offset = make_int2(0, 0); // Not supported by denoiser API, so apply manually
-# else
- output_layers[0].data = target_tile.buffer + pixel_offset;
- output_layers[0].width = target_tile.w;
- output_layers[0].height = target_tile.h;
- output_layers[0].rowStrideInBytes = target_tile.stride * pixel_stride;
- output_layers[0].pixelStrideInBytes = pixel_stride;
-# endif
- output_layers[0].format = OPTIX_PIXEL_FORMAT_FLOAT3;
-
-# if OPTIX_ABI_VERSION >= 47
- OptixDenoiserLayer image_layers = {};
- image_layers.input = input_layers[0];
- image_layers.output = output_layers[0];
-
- OptixDenoiserGuideLayer guide_layers = {};
- guide_layers.albedo = input_layers[1];
- guide_layers.normal = input_layers[2];
-# endif
-
- // Finally run denoising
- OptixDenoiserParams params = {}; // All parameters are disabled/zero
-# if OPTIX_ABI_VERSION >= 47
- check_result_optix_ret(optixDenoiserInvoke(denoiser,
- NULL,
- &params,
- denoiser_state.device_pointer,
- scratch_offset,
- &guide_layers,
- &image_layers,
- 1,
- overlap_offset.x,
- overlap_offset.y,
- denoiser_state.device_pointer + scratch_offset,
- scratch_size));
-# else
- check_result_optix_ret(optixDenoiserInvoke(denoiser,
- NULL,
- &params,
- denoiser_state.device_pointer,
- scratch_offset,
- input_layers,
- task.denoising.input_passes,
- overlap_offset.x,
- overlap_offset.y,
- output_layers,
- denoiser_state.device_pointer + scratch_offset,
- scratch_size));
-# endif
-
-# if OPTIX_DENOISER_NO_PIXEL_STRIDE
- void *output_args[] = {&input_ptr,
- &target_tile.buffer,
- &output_offset.x,
- &output_offset.y,
- &rect_size.x,
- &rect_size.y,
- &target_tile.x,
- &target_tile.y,
- &target_tile.w,
- &target_tile.h,
- &target_tile.offset,
- &target_tile.stride,
- &task.pass_stride,
- &rtile.sample};
- launch_filter_kernel(
- "kernel_cuda_filter_convert_from_rgb", target_tile.w, target_tile.h, output_args);
-# endif
-
- check_result_cuda_ret(cuStreamSynchronize(0));
-
- task.unmap_neighbor_tiles(neighbors, this);
- }
- else {
- // Run CUDA denoising kernels
- DenoisingTask denoising(this, task);
- CUDADevice::denoise(rtile, denoising);
- }
-
- // Update task progress after the denoiser completed processing
- task.update_progress(&rtile, rtile.w * rtile.h);
-
- return true;
- }
-
- void launch_shader_eval(DeviceTask &task, int thread_index)
- {
- unsigned int rgen_index = PG_BACK;
- if (task.shader_eval_type >= SHADER_EVAL_BAKE)
- rgen_index = PG_BAKE;
- if (task.shader_eval_type == SHADER_EVAL_DISPLACE)
- rgen_index = PG_DISP;
-
- const CUDAContextScope scope(cuContext);
-
- device_ptr launch_params_ptr = launch_params.device_pointer +
- thread_index * launch_params.data_elements;
-
- for (int sample = 0; sample < task.num_samples; ++sample) {
- ShaderParams params;
- params.input = (uint4 *)task.shader_input;
- params.output = (float4 *)task.shader_output;
- params.type = task.shader_eval_type;
- params.filter = task.shader_filter;
- params.sx = task.shader_x;
- params.offset = task.offset;
- params.sample = sample;
-
- check_result_cuda(cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParams, shader),
- &params,
- sizeof(params),
- cuda_stream[thread_index]));
-
- OptixShaderBindingTable sbt_params = {};
- sbt_params.raygenRecord = sbt_data.device_pointer + rgen_index * sizeof(SbtRecord);
- sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord);
- sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
- sbt_params.missRecordCount = 1;
- sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord);
- sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
-# if OPTIX_ABI_VERSION >= 36
- sbt_params.hitgroupRecordCount = 5; // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL
-# else
- sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITS, PG_HITL
-# endif
- sbt_params.callablesRecordBase = sbt_data.device_pointer + PG_CALL * sizeof(SbtRecord);
- sbt_params.callablesRecordCount = 3;
- sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord);
-
- check_result_optix(optixLaunch(pipelines[PIP_SHADER_EVAL],
- cuda_stream[thread_index],
- launch_params_ptr,
- launch_params.data_elements,
- &sbt_params,
- task.shader_w,
- 1,
- 1));
-
- check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
-
- task.update_progress(NULL);
- }
- }
-
- bool build_optix_bvh(BVHOptiX *bvh,
- OptixBuildOperation operation,
- const OptixBuildInput &build_input,
- uint16_t num_motion_steps)
- {
- /* Allocate and build acceleration structures only one at a time, to prevent parallel builds
- * from running out of memory (since both original and compacted acceleration structure memory
- * may be allocated at the same time for the duration of this function). The builds would
- * otherwise happen on the same CUDA stream anyway. */
- static thread_mutex mutex;
- thread_scoped_lock lock(mutex);
-
- const CUDAContextScope scope(cuContext);
-
- const bool use_fast_trace_bvh = (bvh->params.bvh_type == SceneParams::BVH_STATIC);
-
- // Compute memory usage
- OptixAccelBufferSizes sizes = {};
- OptixAccelBuildOptions options = {};
- options.operation = operation;
- if (use_fast_trace_bvh) {
- VLOG(2) << "Using fast to trace OptiX BVH";
- options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION;
- }
- else {
- VLOG(2) << "Using fast to update OptiX BVH";
- options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_BUILD | OPTIX_BUILD_FLAG_ALLOW_UPDATE;
- }
-
- options.motionOptions.numKeys = num_motion_steps;
- options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH;
- options.motionOptions.timeBegin = 0.0f;
- options.motionOptions.timeEnd = 1.0f;
-
- check_result_optix_ret(
- optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes));
-
- // Allocate required output buffers
- device_only_memory<char> temp_mem(this, "optix temp as build mem", true);
- temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8);
- if (!temp_mem.device_pointer)
- return false; // Make sure temporary memory allocation succeeded
-
- // Acceleration structure memory has to be allocated on the device (not allowed to be on host)
- device_only_memory<char> &out_data = bvh->as_data;
- if (operation == OPTIX_BUILD_OPERATION_BUILD) {
- assert(out_data.device == this);
- out_data.alloc_to_device(sizes.outputSizeInBytes);
- if (!out_data.device_pointer)
- return false;
- }
- else {
- assert(out_data.device_pointer && out_data.device_size >= sizes.outputSizeInBytes);
- }
-
- // Finally build the acceleration structure
- OptixAccelEmitDesc compacted_size_prop = {};
- compacted_size_prop.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE;
- // A tiny space was allocated for this property at the end of the temporary buffer above
- // Make sure this pointer is 8-byte aligned
- compacted_size_prop.result = align_up(temp_mem.device_pointer + sizes.tempSizeInBytes, 8);
-
- OptixTraversableHandle out_handle = 0;
- check_result_optix_ret(optixAccelBuild(context,
- NULL,
- &options,
- &build_input,
- 1,
- temp_mem.device_pointer,
- sizes.tempSizeInBytes,
- out_data.device_pointer,
- sizes.outputSizeInBytes,
- &out_handle,
- use_fast_trace_bvh ? &compacted_size_prop : NULL,
- use_fast_trace_bvh ? 1 : 0));
- bvh->traversable_handle = static_cast<uint64_t>(out_handle);
-
- // Wait for all operations to finish
- check_result_cuda_ret(cuStreamSynchronize(NULL));
-
- // Compact the acceleration structure to save memory (only when using a fast-to-trace BVH,
- // since the OPTIX_BUILD_FLAG_ALLOW_COMPACTION flag is only set in that case).
- if (use_fast_trace_bvh) {
- uint64_t compacted_size = sizes.outputSizeInBytes;
- check_result_cuda_ret(
- cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size)));
-
- // Temporary memory is no longer needed, so free it now to make space
- temp_mem.free();
-
- // There is no point compacting if the size does not change
- if (compacted_size < sizes.outputSizeInBytes) {
- device_only_memory<char> compacted_data(this, "optix compacted as", false);
- compacted_data.alloc_to_device(compacted_size);
- if (!compacted_data.device_pointer)
- // Do not compact if memory allocation for compacted acceleration structure fails
- // Can just use the uncompacted one then, so succeed here regardless
- return true;
-
- check_result_optix_ret(optixAccelCompact(context,
- NULL,
- out_handle,
- compacted_data.device_pointer,
- compacted_size,
- &out_handle));
- bvh->traversable_handle = static_cast<uint64_t>(out_handle);
-
- // Wait for compaction to finish
- check_result_cuda_ret(cuStreamSynchronize(NULL));
-
- std::swap(out_data.device_size, compacted_data.device_size);
- std::swap(out_data.device_pointer, compacted_data.device_pointer);
- // Original acceleration structure memory is freed when 'compacted_data' goes out of scope
- }
- }
-
- return true;
- }
-
- void build_bvh(BVH *bvh, Progress &progress, bool refit) override
- {
- if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2) {
- /* CUDA is used for baking, so build a BVH appropriate for that. */
- Device::build_bvh(bvh, progress, refit);
- return;
- }
-
- const bool use_fast_trace_bvh = (bvh->params.bvh_type == SceneParams::BVH_STATIC);
-
- free_bvh_memory_delayed();
-
- BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
-
- progress.set_substatus("Building OptiX acceleration structure");
-
- if (!bvh->params.top_level) {
- assert(bvh->objects.size() == 1 && bvh->geometry.size() == 1);
-
- OptixBuildOperation operation = OPTIX_BUILD_OPERATION_BUILD;
- /* Refit is only possible when using the fast-to-update BVH (because the AS is built with
- * OPTIX_BUILD_FLAG_ALLOW_UPDATE only in that case, see above). */
- if (refit && !use_fast_trace_bvh) {
- assert(bvh_optix->traversable_handle != 0);
- operation = OPTIX_BUILD_OPERATION_UPDATE;
- }
- else {
- bvh_optix->as_data.free();
- bvh_optix->traversable_handle = 0;
- }
-
- // Build bottom level acceleration structures (BLAS)
- Geometry *const geom = bvh->geometry[0];
- if (geom->geometry_type == Geometry::HAIR) {
- // Build BLAS for curve primitives
- Hair *const hair = static_cast<Hair *const>(geom);
- if (hair->num_curves() == 0) {
- return;
- }
-
- const size_t num_segments = hair->num_segments();
-
- size_t num_motion_steps = 1;
- Attribute *motion_keys = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
- if (motion_blur && hair->get_use_motion_blur() && motion_keys) {
- num_motion_steps = hair->get_motion_steps();
- }
-
- device_vector<OptixAabb> aabb_data(this, "optix temp aabb data", MEM_READ_ONLY);
-# if OPTIX_ABI_VERSION >= 36
- device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
- device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
- // Four control points for each curve segment
- const size_t num_vertices = num_segments * 4;
- if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
- index_data.alloc(num_segments);
- vertex_data.alloc(num_vertices * num_motion_steps);
- }
- else
-# endif
- aabb_data.alloc(num_segments * num_motion_steps);
-
- // Get AABBs for each motion step
- for (size_t step = 0; step < num_motion_steps; ++step) {
- // The center step for motion vertices is not stored in the attribute
- const float3 *keys = hair->get_curve_keys().data();
- size_t center_step = (num_motion_steps - 1) / 2;
- if (step != center_step) {
- size_t attr_offset = (step > center_step) ? step - 1 : step;
- // Technically this is a float4 array, but sizeof(float3) == sizeof(float4)
- keys = motion_keys->data_float3() + attr_offset * hair->get_curve_keys().size();
- }
-
- for (size_t j = 0, i = 0; j < hair->num_curves(); ++j) {
- const Hair::Curve curve = hair->get_curve(j);
-# if OPTIX_ABI_VERSION >= 36
- const array<float> &curve_radius = hair->get_curve_radius();
-# endif
-
- for (int segment = 0; segment < curve.num_segments(); ++segment, ++i) {
-# if OPTIX_ABI_VERSION >= 36
- if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
- int k0 = curve.first_key + segment;
- int k1 = k0 + 1;
- int ka = max(k0 - 1, curve.first_key);
- int kb = min(k1 + 1, curve.first_key + curve.num_keys - 1);
-
- const float4 px = make_float4(keys[ka].x, keys[k0].x, keys[k1].x, keys[kb].x);
- const float4 py = make_float4(keys[ka].y, keys[k0].y, keys[k1].y, keys[kb].y);
- const float4 pz = make_float4(keys[ka].z, keys[k0].z, keys[k1].z, keys[kb].z);
- const float4 pw = make_float4(
- curve_radius[ka], curve_radius[k0], curve_radius[k1], curve_radius[kb]);
-
- // Convert Catmull-Rom curve data to B-spline control points
- static const float4 cr2bsp0 = make_float4(+7, -4, +5, -2) / 6.f;
- static const float4 cr2bsp1 = make_float4(-2, 11, -4, +1) / 6.f;
- static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f;
- static const float4 cr2bsp3 = make_float4(-2, +5, -4, +7) / 6.f;
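-
- // Illustrative note: the four rows above form the basis-change matrix from Catmull-Rom keys to
- // cubic B-spline control points (scaled by 1/6); each output control point below is the dot
- // product of one row with the four neighboring keys, applied per component (x, y, z, and the
- // radius stored in w).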
-
- index_data[i] = i * 4;
- float4 *const v = vertex_data.data() + step * num_vertices + index_data[i];
- v[0] = make_float4(
- dot(cr2bsp0, px), dot(cr2bsp0, py), dot(cr2bsp0, pz), dot(cr2bsp0, pw));
- v[1] = make_float4(
- dot(cr2bsp1, px), dot(cr2bsp1, py), dot(cr2bsp1, pz), dot(cr2bsp1, pw));
- v[2] = make_float4(
- dot(cr2bsp2, px), dot(cr2bsp2, py), dot(cr2bsp2, pz), dot(cr2bsp2, pw));
- v[3] = make_float4(
- dot(cr2bsp3, px), dot(cr2bsp3, py), dot(cr2bsp3, pz), dot(cr2bsp3, pw));
- }
- else
-# endif
- {
- BoundBox bounds = BoundBox::empty;
- curve.bounds_grow(segment, keys, hair->get_curve_radius().data(), bounds);
-
- const size_t index = step * num_segments + i;
- aabb_data[index].minX = bounds.min.x;
- aabb_data[index].minY = bounds.min.y;
- aabb_data[index].minZ = bounds.min.z;
- aabb_data[index].maxX = bounds.max.x;
- aabb_data[index].maxY = bounds.max.y;
- aabb_data[index].maxZ = bounds.max.z;
- }
- }
- }
- }
-
- // Upload AABB data to GPU
- aabb_data.copy_to_device();
-# if OPTIX_ABI_VERSION >= 36
- index_data.copy_to_device();
- vertex_data.copy_to_device();
-# endif
-
- vector<device_ptr> aabb_ptrs;
- aabb_ptrs.reserve(num_motion_steps);
-# if OPTIX_ABI_VERSION >= 36
- vector<device_ptr> width_ptrs;
- vector<device_ptr> vertex_ptrs;
- width_ptrs.reserve(num_motion_steps);
- vertex_ptrs.reserve(num_motion_steps);
-# endif
- for (size_t step = 0; step < num_motion_steps; ++step) {
- aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb));
-# if OPTIX_ABI_VERSION >= 36
- const device_ptr base_ptr = vertex_data.device_pointer +
- step * num_vertices * sizeof(float4);
- width_ptrs.push_back(base_ptr + 3 * sizeof(float)); // Offset by vertex size
- vertex_ptrs.push_back(base_ptr);
-# endif
- }
-
- // Force a single any-hit call, so shadow record-all behavior works correctly
- unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
- OptixBuildInput build_input = {};
-# if OPTIX_ABI_VERSION >= 36
- if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
- build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES;
- build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
- build_input.curveArray.numPrimitives = num_segments;
- build_input.curveArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
- build_input.curveArray.numVertices = num_vertices;
- build_input.curveArray.vertexStrideInBytes = sizeof(float4);
- build_input.curveArray.widthBuffers = (CUdeviceptr *)width_ptrs.data();
- build_input.curveArray.widthStrideInBytes = sizeof(float4);
- build_input.curveArray.indexBuffer = (CUdeviceptr)index_data.device_pointer;
- build_input.curveArray.indexStrideInBytes = sizeof(int);
- build_input.curveArray.flag = build_flags;
- build_input.curveArray.primitiveIndexOffset = hair->optix_prim_offset;
- }
- else
-# endif
- {
- // Disable the visibility test any-hit program, since visibility is already checked during
- // intersection. Trace calls that do require an any-hit program can force it with a ray flag.
- build_flags |= OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT;
-
- build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES;
-# if OPTIX_ABI_VERSION < 23
- build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
- build_input.aabbArray.numPrimitives = num_segments;
- build_input.aabbArray.strideInBytes = sizeof(OptixAabb);
- build_input.aabbArray.flags = &build_flags;
- build_input.aabbArray.numSbtRecords = 1;
- build_input.aabbArray.primitiveIndexOffset = hair->optix_prim_offset;
-# else
- build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
- build_input.customPrimitiveArray.numPrimitives = num_segments;
- build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb);
- build_input.customPrimitiveArray.flags = &build_flags;
- build_input.customPrimitiveArray.numSbtRecords = 1;
- build_input.customPrimitiveArray.primitiveIndexOffset = hair->optix_prim_offset;
-# endif
- }
-
- if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
- progress.set_error("Failed to build OptiX acceleration structure");
- }
- }
- else if (geom->geometry_type == Geometry::MESH || geom->geometry_type == Geometry::VOLUME) {
- // Build BLAS for triangle primitives
- Mesh *const mesh = static_cast<Mesh *const>(geom);
- if (mesh->num_triangles() == 0) {
- return;
- }
-
- const size_t num_verts = mesh->get_verts().size();
-
- size_t num_motion_steps = 1;
- Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
- if (motion_blur && mesh->get_use_motion_blur() && motion_keys) {
- num_motion_steps = mesh->get_motion_steps();
- }
-
- device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
- index_data.alloc(mesh->get_triangles().size());
- memcpy(index_data.data(),
- mesh->get_triangles().data(),
- mesh->get_triangles().size() * sizeof(int));
- device_vector<float3> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
- vertex_data.alloc(num_verts * num_motion_steps);
-
- for (size_t step = 0; step < num_motion_steps; ++step) {
- const float3 *verts = mesh->get_verts().data();
-
- size_t center_step = (num_motion_steps - 1) / 2;
- // The center step for motion vertices is not stored in the attribute
- if (step != center_step) {
- verts = motion_keys->data_float3() +
- (step > center_step ? step - 1 : step) * num_verts;
- }
-
- memcpy(vertex_data.data() + num_verts * step, verts, num_verts * sizeof(float3));
- }
-
- // Upload triangle data to GPU
- index_data.copy_to_device();
- vertex_data.copy_to_device();
-
- vector<device_ptr> vertex_ptrs;
- vertex_ptrs.reserve(num_motion_steps);
- for (size_t step = 0; step < num_motion_steps; ++step) {
- vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3));
- }
-
- // Force a single any-hit call, so shadow record-all behavior works correctly
- unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
- OptixBuildInput build_input = {};
- build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES;
- build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
- build_input.triangleArray.numVertices = num_verts;
- build_input.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3;
- build_input.triangleArray.vertexStrideInBytes = sizeof(float3);
- build_input.triangleArray.indexBuffer = index_data.device_pointer;
- build_input.triangleArray.numIndexTriplets = mesh->num_triangles();
- build_input.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3;
- build_input.triangleArray.indexStrideInBytes = 3 * sizeof(int);
- build_input.triangleArray.flags = &build_flags;
- // The SBT does not store per primitive data since Cycles already allocates separate
- // buffers for that purpose. OptiX does not allow this to be zero though, so just pass in
- // one and rely on that having the same meaning in this case.
- build_input.triangleArray.numSbtRecords = 1;
- build_input.triangleArray.primitiveIndexOffset = mesh->optix_prim_offset;
-
- if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
- progress.set_error("Failed to build OptiX acceleration structure");
- }
- }
- }
- else {
- unsigned int num_instances = 0;
- unsigned int max_num_instances = 0xFFFFFFFF;
-
- bvh_optix->as_data.free();
- bvh_optix->traversable_handle = 0;
- bvh_optix->motion_transform_data.free();
-
- optixDeviceContextGetProperty(context,
- OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID,
- &max_num_instances,
- sizeof(max_num_instances));
- // Do not count the first bit, which is used to distinguish instanced and non-instanced objects
- max_num_instances >>= 1;
- if (bvh->objects.size() > max_num_instances) {
- progress.set_error(
- "Failed to build OptiX acceleration structure because there are too many instances");
- return;
- }
-
- // Fill instance descriptions
-# if OPTIX_ABI_VERSION < 41
- device_vector<OptixAabb> aabbs(this, "optix tlas aabbs", MEM_READ_ONLY);
- aabbs.alloc(bvh->objects.size());
-# endif
- device_vector<OptixInstance> instances(this, "optix tlas instances", MEM_READ_ONLY);
- instances.alloc(bvh->objects.size());
-
- // Calculate total motion transform size and allocate memory for them
- size_t motion_transform_offset = 0;
- if (motion_blur) {
- size_t total_motion_transform_size = 0;
- for (Object *const ob : bvh->objects) {
- if (ob->is_traceable() && ob->use_motion()) {
- total_motion_transform_size = align_up(total_motion_transform_size,
- OPTIX_TRANSFORM_BYTE_ALIGNMENT);
- const size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
- total_motion_transform_size = total_motion_transform_size +
- sizeof(OptixSRTMotionTransform) +
- motion_keys * sizeof(OptixSRTData);
- }
- }
-
- assert(bvh_optix->motion_transform_data.device == this);
- bvh_optix->motion_transform_data.alloc_to_device(total_motion_transform_size);
- }
-
- for (Object *ob : bvh->objects) {
- // Skip non-traceable objects
- if (!ob->is_traceable())
- continue;
-
- BVHOptiX *const blas = static_cast<BVHOptiX *>(ob->get_geometry()->bvh);
- OptixTraversableHandle handle = blas->traversable_handle;
-
-# if OPTIX_ABI_VERSION < 41
- OptixAabb &aabb = aabbs[num_instances];
- aabb.minX = ob->bounds.min.x;
- aabb.minY = ob->bounds.min.y;
- aabb.minZ = ob->bounds.min.z;
- aabb.maxX = ob->bounds.max.x;
- aabb.maxY = ob->bounds.max.y;
- aabb.maxZ = ob->bounds.max.z;
-# endif
-
- OptixInstance &instance = instances[num_instances++];
- memset(&instance, 0, sizeof(instance));
-
- // Clear transform to identity matrix
- instance.transform[0] = 1.0f;
- instance.transform[5] = 1.0f;
- instance.transform[10] = 1.0f;
-
- // Set user instance ID to object index (but leave low bit blank)
- instance.instanceId = ob->get_device_index() << 1;
-
- // At least one bit has to be set in the mask, or else the instance would always be culled
- instance.visibilityMask = 1;
-
- if (ob->get_geometry()->has_volume) {
- // Volumes have a special bit set in the visibility mask so a trace can mask only volumes
- instance.visibilityMask |= 2;
- }
-
- if (ob->get_geometry()->geometry_type == Geometry::HAIR) {
- // Same applies to curves (so they can be skipped in local trace calls)
- instance.visibilityMask |= 4;
-
-# if OPTIX_ABI_VERSION >= 36
- if (motion_blur && ob->get_geometry()->has_motion_blur() &&
- DebugFlags().optix.curves_api &&
- static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) {
- // Select between motion blur and non-motion blur built-in intersection module
- instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
- }
-# endif
- }
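-
- // Illustrative summary of the visibility mask bits assigned above:
- //   bit 0: set on every instance (so it is never unconditionally culled)
- //   bit 1: instance contains volume geometry
- //   bit 2: instance contains curve/hair geometry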
-
- // Insert motion traversable if object has motion
- if (motion_blur && ob->use_motion()) {
- size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
- size_t motion_transform_size = sizeof(OptixSRTMotionTransform) +
- motion_keys * sizeof(OptixSRTData);
-
- const CUDAContextScope scope(cuContext);
-
- motion_transform_offset = align_up(motion_transform_offset,
- OPTIX_TRANSFORM_BYTE_ALIGNMENT);
- CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data.device_pointer +
- motion_transform_offset;
- motion_transform_offset += motion_transform_size;
-
- // Allocate host side memory for motion transform and fill it with transform data
- OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>(
- new uint8_t[motion_transform_size]);
- motion_transform.child = handle;
- motion_transform.motionOptions.numKeys = ob->get_motion().size();
- motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE;
- motion_transform.motionOptions.timeBegin = 0.0f;
- motion_transform.motionOptions.timeEnd = 1.0f;
-
- OptixSRTData *const srt_data = motion_transform.srtData;
- array<DecomposedTransform> decomp(ob->get_motion().size());
- transform_motion_decompose(
- decomp.data(), ob->get_motion().data(), ob->get_motion().size());
-
- for (size_t i = 0; i < ob->get_motion().size(); ++i) {
- // Scale
- srt_data[i].sx = decomp[i].y.w; // scale.x.x
- srt_data[i].sy = decomp[i].z.w; // scale.y.y
- srt_data[i].sz = decomp[i].w.w; // scale.z.z
-
- // Shear
- srt_data[i].a = decomp[i].z.x; // scale.x.y
- srt_data[i].b = decomp[i].z.y; // scale.x.z
- srt_data[i].c = decomp[i].w.x; // scale.y.z
- assert(decomp[i].z.z == 0.0f); // scale.y.x
- assert(decomp[i].w.y == 0.0f); // scale.z.x
- assert(decomp[i].w.z == 0.0f); // scale.z.y
-
- // Pivot point
- srt_data[i].pvx = 0.0f;
- srt_data[i].pvy = 0.0f;
- srt_data[i].pvz = 0.0f;
-
- // Rotation
- srt_data[i].qx = decomp[i].x.x;
- srt_data[i].qy = decomp[i].x.y;
- srt_data[i].qz = decomp[i].x.z;
- srt_data[i].qw = decomp[i].x.w;
-
- // Translation
- srt_data[i].tx = decomp[i].y.x;
- srt_data[i].ty = decomp[i].y.y;
- srt_data[i].tz = decomp[i].y.z;
- }
-
- // Upload motion transform to GPU
- cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size);
- delete[] reinterpret_cast<uint8_t *>(&motion_transform);
-
- // Disable instance transform if object uses motion transform already
- instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
-
- // Get traversable handle to motion transform
- optixConvertPointerToTraversableHandle(context,
- motion_transform_gpu,
- OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM,
- &instance.traversableHandle);
- }
- else {
- instance.traversableHandle = handle;
-
- if (ob->get_geometry()->is_instanced()) {
- // Set transform matrix
- memcpy(instance.transform, &ob->get_tfm(), sizeof(instance.transform));
- }
- else {
- // Disable instance transform if geometry already has it applied to vertex data
- instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
- // Non-instanced objects read ID from 'prim_object', so distinguish
- // them from instanced objects with the low bit set
- instance.instanceId |= 1;
- }
- }
- }
-
- // Upload instance descriptions
-# if OPTIX_ABI_VERSION < 41
- aabbs.resize(num_instances);
- aabbs.copy_to_device();
-# endif
- instances.resize(num_instances);
- instances.copy_to_device();
-
- // Build top-level acceleration structure (TLAS)
- OptixBuildInput build_input = {};
- build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES;
-# if OPTIX_ABI_VERSION < 41 // Instance AABBs no longer need to be set since OptiX 7.2
- build_input.instanceArray.aabbs = aabbs.device_pointer;
- build_input.instanceArray.numAabbs = num_instances;
-# endif
- build_input.instanceArray.instances = instances.device_pointer;
- build_input.instanceArray.numInstances = num_instances;
-
- if (!build_optix_bvh(bvh_optix, OPTIX_BUILD_OPERATION_BUILD, build_input, 0)) {
- progress.set_error("Failed to build OptiX acceleration structure");
- }
- tlas_handle = bvh_optix->traversable_handle;
- }
- }
-
- void release_optix_bvh(BVH *bvh) override
- {
- thread_scoped_lock lock(delayed_free_bvh_mutex);
- /* Do delayed free of BVH memory, since the geometry holding the BVH might be deleted
-  * while the GPU is still rendering. */
- BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
-
- delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->as_data));
- delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->motion_transform_data));
- bvh_optix->traversable_handle = 0;
- }
-
- void free_bvh_memory_delayed()
- {
- thread_scoped_lock lock(delayed_free_bvh_mutex);
- delayed_free_bvh_memory.free_memory();
- }
-
- void const_copy_to(const char *name, void *host, size_t size) override
- {
- // Set constant memory for CUDA module
- // TODO(pmours): This is only used for tonemapping (see 'film_convert').
- // Could be removed by moving those functions to filter CUDA module.
- CUDADevice::const_copy_to(name, host, size);
-
- if (strcmp(name, "__data") == 0) {
- assert(size <= sizeof(KernelData));
-
- // Update traversable handle (since it is different for each device on multi devices)
- KernelData *const data = (KernelData *)host;
- *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle;
-
- update_launch_params(offsetof(KernelParams, data), host, size);
- return;
- }
-
- // Update data storage pointers in launch parameters
-# define KERNEL_TEX(data_type, tex_name) \
- if (strcmp(name, #tex_name) == 0) { \
- update_launch_params(offsetof(KernelParams, tex_name), host, size); \
- return; \
- }
-# include "kernel/kernel_textures.h"
-# undef KERNEL_TEX
- }
-
- void update_launch_params(size_t offset, void *data, size_t data_size)
- {
- const CUDAContextScope scope(cuContext);
-
- for (int i = 0; i < info.cpu_threads; ++i)
- check_result_cuda(
- cuMemcpyHtoD(launch_params.device_pointer + i * launch_params.data_elements + offset,
- data,
- data_size));
- }
-
- void task_add(DeviceTask &task) override
- {
- // Upload texture information to device if it has changed since last launch
- load_texture_info();
-
- if (task.type == DeviceTask::FILM_CONVERT) {
- // Execute in main thread because of OpenGL access
- film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
- return;
- }
-
- if (task.type == DeviceTask::DENOISE_BUFFER) {
- // Execute denoising in a single thread (e.g. to avoid race conditions during creation)
- task_pool.push([=] {
- DeviceTask task_copy = task;
- thread_run(task_copy, 0);
- });
- return;
- }
-
- // Split task into smaller ones
- list<DeviceTask> tasks;
- task.split(tasks, info.cpu_threads);
-
- // Queue tasks in internal task pool
- int task_index = 0;
- for (DeviceTask &task : tasks) {
- task_pool.push([=] {
- // Using task index parameter instead of thread index, since number of CUDA streams may
- // differ from number of threads
- DeviceTask task_copy = task;
- thread_run(task_copy, task_index);
- });
- task_index++;
- }
- }
-
- void task_wait() override
- {
- // Wait for all queued tasks to finish
- task_pool.wait_work();
- }
-
- void task_cancel() override
- {
- // Cancel any remaining tasks in the internal pool
- task_pool.cancel();
- }
-};
-
-bool device_optix_init()
-{
- if (g_optixFunctionTable.optixDeviceContextCreate != NULL)
- return true; // Already initialized function table
-
- // Need to initialize CUDA as well
- if (!device_cuda_init())
- return false;
-
- const OptixResult result = optixInit();
-
- if (result == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) {
- VLOG(1) << "OptiX initialization failed because the installed NVIDIA driver is too old. "
- "Please update to the latest driver first!";
- return false;
- }
- else if (result != OPTIX_SUCCESS) {
- VLOG(1) << "OptiX initialization failed with error code " << (unsigned int)result;
- return false;
- }
-
- // Loaded OptiX successfully!
- return true;
-}
-
-void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices)
-{
- devices.reserve(cuda_devices.size());
-
- // Simply add all supported CUDA devices as OptiX devices again
- for (DeviceInfo info : cuda_devices) {
- assert(info.type == DEVICE_CUDA);
-
- int major;
- cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, info.num);
- if (major < 5) {
- continue; // Only Maxwell and up are supported by OptiX
- }
-
- info.type = DEVICE_OPTIX;
- info.id += "_OptiX";
- info.denoisers |= DENOISER_OPTIX;
- info.has_branched_path = false;
-
- devices.push_back(info);
- }
-}
-
-Device *device_optix_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
-{
- return new OptiXDevice(info, stats, profiler, background);
-}
-
-CCL_NAMESPACE_END
-
-#endif
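The TLAS build removed above packs per-object metadata into OptixInstance: bit 0 of the visibility mask is always set (otherwise the instance would be culled), bit 1 marks volumes and bit 2 marks hair/curves, while the instance ID stores the object's device index shifted left by one, with the low bit set for non-instanced objects that read their ID from 'prim_object'. A minimal standalone sketch of that bit packing follows; the helper names are illustrative and are not part of the Cycles API.

#include <cassert>
#include <cstdint>

// Visibility-mask bits as used by the removed OptiX backend above.
enum : uint32_t {
  VISIBILITY_ALL = 1u << 0,     // every instance sets this, otherwise it is always culled
  VISIBILITY_VOLUME = 1u << 1,  // additionally set for volume geometry
  VISIBILITY_CURVE = 1u << 2,   // additionally set for hair/curve geometry
};

// Device index goes into the upper bits of the instance ID; the low bit flags
// non-instanced objects that read their ID from 'prim_object' instead.
static uint32_t pack_instance_id(uint32_t device_index, bool is_instanced)
{
  uint32_t id = device_index << 1;
  if (!is_instanced) {
    id |= 1;
  }
  return id;
}

static uint32_t pack_visibility(bool has_volume, bool is_curve)
{
  uint32_t mask = VISIBILITY_ALL;
  if (has_volume) {
    mask |= VISIBILITY_VOLUME;
  }
  if (is_curve) {
    mask |= VISIBILITY_CURVE;
  }
  return mask;
}

int main()
{
  /* Object with device index 7, instanced, curve geometry without volumes. */
  assert(pack_instance_id(7, true) == 14);
  assert(pack_visibility(false, true) == (VISIBILITY_ALL | VISIBILITY_CURVE));
  return 0;
}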
diff --git a/intern/cycles/device/device_queue.cpp b/intern/cycles/device/device_queue.cpp
new file mode 100644
index 00000000000..a89ba68d62c
--- /dev/null
+++ b/intern/cycles/device/device_queue.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/device_queue.h"
+
+#include "util/util_algorithm.h"
+#include "util/util_logging.h"
+#include "util/util_time.h"
+
+#include <iomanip>
+
+CCL_NAMESPACE_BEGIN
+
+DeviceQueue::DeviceQueue(Device *device)
+ : device(device), last_kernels_enqueued_(0), last_sync_time_(0.0)
+{
+ DCHECK_NE(device, nullptr);
+}
+
+DeviceQueue::~DeviceQueue()
+{
+ if (VLOG_IS_ON(3)) {
+ /* Print kernel execution times sorted by time. */
+ vector<pair<DeviceKernelMask, double>> stats_sorted;
+ for (const auto &stat : stats_kernel_time_) {
+ stats_sorted.push_back(stat);
+ }
+
+ sort(stats_sorted.begin(),
+ stats_sorted.end(),
+ [](const pair<DeviceKernelMask, double> &a, const pair<DeviceKernelMask, double> &b) {
+ return a.second > b.second;
+ });
+
+ VLOG(3) << "GPU queue stats:";
+ for (const auto &[mask, time] : stats_sorted) {
+ VLOG(3) << " " << std::setfill(' ') << std::setw(10) << std::fixed << std::setprecision(5)
+ << std::right << time << "s: " << device_kernel_mask_as_string(mask);
+ }
+ }
+}
+
+void DeviceQueue::debug_init_execution()
+{
+ if (VLOG_IS_ON(3)) {
+ last_sync_time_ = time_dt();
+ last_kernels_enqueued_ = 0;
+ }
+}
+
+void DeviceQueue::debug_enqueue(DeviceKernel kernel, const int work_size)
+{
+ if (VLOG_IS_ON(3)) {
+ VLOG(4) << "GPU queue launch " << device_kernel_as_string(kernel) << ", work_size "
+ << work_size;
+ last_kernels_enqueued_ |= (uint64_t(1) << (uint64_t)kernel);
+ }
+}
+
+void DeviceQueue::debug_synchronize()
+{
+ if (VLOG_IS_ON(3)) {
+ const double new_time = time_dt();
+ const double elapsed_time = new_time - last_sync_time_;
+ VLOG(4) << "GPU queue synchronize, elapsed " << std::setw(10) << elapsed_time << "s";
+
+ stats_kernel_time_[last_kernels_enqueued_] += elapsed_time;
+
+ last_sync_time_ = new_time;
+ last_kernels_enqueued_ = 0;
+ }
+}
+
+CCL_NAMESPACE_END
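The destructor and debug helpers above accumulate, for each combination of kernels enqueued between two synchronize calls, the elapsed wall-clock time keyed by a bitmask. A small self-contained sketch of that bookkeeping; KernelMask and QueueStats are simplified stand-in names, not the real DeviceKernelMask machinery.

#include <cstdint>
#include <map>

using KernelMask = uint64_t;

struct QueueStats {
  KernelMask last_enqueued = 0;
  double last_sync_time = 0.0;
  std::map<KernelMask, double> kernel_time;

  void enqueue(int kernel)
  {
    /* Remember which kernels ran together since the last synchronize. */
    last_enqueued |= (uint64_t(1) << uint64_t(kernel));
  }

  void synchronize(double now)
  {
    /* Attribute the elapsed time to the combination of kernels just executed. */
    kernel_time[last_enqueued] += now - last_sync_time;
    last_sync_time = now;
    last_enqueued = 0;
  }
};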
diff --git a/intern/cycles/device/device_queue.h b/intern/cycles/device/device_queue.h
new file mode 100644
index 00000000000..edda3e61d51
--- /dev/null
+++ b/intern/cycles/device/device_queue.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "device/device_kernel.h"
+
+#include "device/device_graphics_interop.h"
+#include "util/util_logging.h"
+#include "util/util_map.h"
+#include "util/util_unique_ptr.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class device_memory;
+
+struct KernelWorkTile;
+
+/* Abstraction of a command queue for a device.
+ * Provides an API to schedule kernel execution in a specific queue with the minimal possible
+ * overhead from the driver side.
+ *
+ * This class encapsulates all properties needed for command execution. */
+class DeviceQueue {
+ public:
+ virtual ~DeviceQueue();
+
+ /* Number of concurrent states to process for the integrator,
+  * based on the number of cores and/or available memory. */
+ virtual int num_concurrent_states(const size_t state_size) const = 0;
+
+ /* Number of states which keep the device occupied with work without losing performance.
+  * The renderer will add more work (when available) when the number of active paths falls below
+  * this value. */
+ virtual int num_concurrent_busy_states() const = 0;
+
+ /* Initialize execution of kernels on this queue.
+ *
+ * Will, for example, load all data required by the kernels from Device to global or path state.
+ *
+ * Use this method after device synchronization has finished before enqueueing any kernels. */
+ virtual void init_execution() = 0;
+
+ /* Test if an optional device kernel is available. */
+ virtual bool kernel_available(DeviceKernel kernel) const = 0;
+
+ /* Enqueue kernel execution.
+ *
+ * Execute the kernel work_size times on the device.
+  * Supported argument types:
+ * - int: pass pointer to the int
+ * - device memory: pass pointer to device_memory.device_pointer
+ * Return false if there was an error executing this or a previous kernel. */
+ virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) = 0;
+
+ /* Wait until all enqueued kernels have finished execution.
+ * Return false if there was an error executing any of the enqueued kernels. */
+ virtual bool synchronize() = 0;
+
+ /* Copy memory to/from device as part of the command queue, to ensure
+ * operations are done in order without having to synchronize. */
+ virtual void zero_to_device(device_memory &mem) = 0;
+ virtual void copy_to_device(device_memory &mem) = 0;
+ virtual void copy_from_device(device_memory &mem) = 0;
+
+ /* Graphics resources interoperability.
+ *
+  * Interoperability here means that the device is capable of computing the result
+  * directly into an OpenGL (or other graphics library) buffer. */
+
+ /* Create a graphics interoperability context which takes care of mapping a graphics
+  * resource as a buffer writable by kernels of this device. */
+ virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create()
+ {
+    LOG(FATAL) << "Requested GPU interop from a device which does not support it.";
+ return nullptr;
+ }
+
+ /* Device this queue has been created for. */
+ Device *device;
+
+ protected:
+ /* Hide construction so that allocation via `Device` API is enforced. */
+ explicit DeviceQueue(Device *device);
+
+ /* Implementations call these from the corresponding methods to generate debugging logs. */
+ void debug_init_execution();
+ void debug_enqueue(DeviceKernel kernel, const int work_size);
+ void debug_synchronize();
+
+ /* Combination of kernels enqueued together since the last synchronize. */
+ DeviceKernelMask last_kernels_enqueued_;
+ /* Time of synchronize call. */
+ double last_sync_time_;
+ /* Accumulated execution time for combinations of kernels launched together. */
+ map<DeviceKernelMask, double> stats_kernel_time_;
+};
+
+CCL_NAMESPACE_END
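As a rough illustration of how the new interface is meant to be driven, here is a hedged usage sketch. The helper run_kernel_once and its argument layout are hypothetical; only the DeviceQueue calls themselves come from the header above, and the argument convention follows the comment on enqueue().

/* Sketch only: assumes a DeviceQueue created via the owning Device, and a kernel whose
 * arguments are (int work_size, device_ptr buffer). */
#include "device/device_memory.h"
#include "device/device_queue.h"

CCL_NAMESPACE_BEGIN

static bool run_kernel_once(DeviceQueue *queue,
                            DeviceKernel kernel,
                            int work_size,
                            device_memory &buffer)
{
  /* Load kernel globals etc. after any prior device synchronization. */
  queue->init_execution();

  /* Arguments are passed as pointers: int by pointer, memory by pointer to device_pointer. */
  void *args[] = {&work_size, &buffer.device_pointer};
  if (!queue->enqueue(kernel, work_size, args)) {
    return false;
  }

  /* Block until all enqueued kernels finish and report any execution error. */
  return queue->synchronize();
}

CCL_NAMESPACE_END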
diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp
deleted file mode 100644
index 9889f688aaa..00000000000
--- a/intern/cycles/device/device_split_kernel.cpp
+++ /dev/null
@@ -1,389 +0,0 @@
-/*
- * Copyright 2011-2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "device/device_split_kernel.h"
-
-#include "kernel/kernel_types.h"
-#include "kernel/split/kernel_split_data_types.h"
-
-#include "util/util_logging.h"
-#include "util/util_time.h"
-
-CCL_NAMESPACE_BEGIN
-
-static const double alpha = 0.1; /* alpha for rolling average */
-
-DeviceSplitKernel::DeviceSplitKernel(Device *device)
- : device(device),
- split_data(device, "split_data"),
- ray_state(device, "ray_state", MEM_READ_WRITE),
- queue_index(device, "queue_index"),
- use_queues_flag(device, "use_queues_flag"),
- work_pool_wgs(device, "work_pool_wgs"),
- kernel_data_initialized(false)
-{
- avg_time_per_sample = 0.0;
-
- kernel_path_init = NULL;
- kernel_scene_intersect = NULL;
- kernel_lamp_emission = NULL;
- kernel_do_volume = NULL;
- kernel_queue_enqueue = NULL;
- kernel_indirect_background = NULL;
- kernel_shader_setup = NULL;
- kernel_shader_sort = NULL;
- kernel_shader_eval = NULL;
- kernel_holdout_emission_blurring_pathtermination_ao = NULL;
- kernel_subsurface_scatter = NULL;
- kernel_direct_lighting = NULL;
- kernel_shadow_blocked_ao = NULL;
- kernel_shadow_blocked_dl = NULL;
- kernel_enqueue_inactive = NULL;
- kernel_next_iteration_setup = NULL;
- kernel_indirect_subsurface = NULL;
- kernel_buffer_update = NULL;
- kernel_adaptive_stopping = NULL;
- kernel_adaptive_filter_x = NULL;
- kernel_adaptive_filter_y = NULL;
- kernel_adaptive_adjust_samples = NULL;
-}
-
-DeviceSplitKernel::~DeviceSplitKernel()
-{
- split_data.free();
- ray_state.free();
- use_queues_flag.free();
- queue_index.free();
- work_pool_wgs.free();
-
- delete kernel_path_init;
- delete kernel_scene_intersect;
- delete kernel_lamp_emission;
- delete kernel_do_volume;
- delete kernel_queue_enqueue;
- delete kernel_indirect_background;
- delete kernel_shader_setup;
- delete kernel_shader_sort;
- delete kernel_shader_eval;
- delete kernel_holdout_emission_blurring_pathtermination_ao;
- delete kernel_subsurface_scatter;
- delete kernel_direct_lighting;
- delete kernel_shadow_blocked_ao;
- delete kernel_shadow_blocked_dl;
- delete kernel_enqueue_inactive;
- delete kernel_next_iteration_setup;
- delete kernel_indirect_subsurface;
- delete kernel_buffer_update;
- delete kernel_adaptive_stopping;
- delete kernel_adaptive_filter_x;
- delete kernel_adaptive_filter_y;
- delete kernel_adaptive_adjust_samples;
-}
-
-bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures &requested_features)
-{
-#define LOAD_KERNEL(name) \
- kernel_##name = get_split_kernel_function(#name, requested_features); \
- if (!kernel_##name) { \
- device->set_error(string("Split kernel error: failed to load kernel_") + #name); \
- return false; \
- }
-
- LOAD_KERNEL(path_init);
- LOAD_KERNEL(scene_intersect);
- LOAD_KERNEL(lamp_emission);
- if (requested_features.use_volume) {
- LOAD_KERNEL(do_volume);
- }
- LOAD_KERNEL(queue_enqueue);
- LOAD_KERNEL(indirect_background);
- LOAD_KERNEL(shader_setup);
- LOAD_KERNEL(shader_sort);
- LOAD_KERNEL(shader_eval);
- LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao);
- LOAD_KERNEL(subsurface_scatter);
- LOAD_KERNEL(direct_lighting);
- LOAD_KERNEL(shadow_blocked_ao);
- LOAD_KERNEL(shadow_blocked_dl);
- LOAD_KERNEL(enqueue_inactive);
- LOAD_KERNEL(next_iteration_setup);
- LOAD_KERNEL(indirect_subsurface);
- LOAD_KERNEL(buffer_update);
- LOAD_KERNEL(adaptive_stopping);
- LOAD_KERNEL(adaptive_filter_x);
- LOAD_KERNEL(adaptive_filter_y);
- LOAD_KERNEL(adaptive_adjust_samples);
-
-#undef LOAD_KERNEL
-
- /* Re-initialize kernel-dependent data when kernels change. */
- kernel_data_initialized = false;
-
- return true;
-}
-
-size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory &kg,
- device_memory &data,
- uint64_t max_buffer_size)
-{
- uint64_t size_per_element = state_buffer_size(kg, data, 1024) / 1024;
- VLOG(1) << "Split state element size: " << string_human_readable_number(size_per_element)
- << " bytes. (" << string_human_readable_size(size_per_element) << ").";
- return max_buffer_size / size_per_element;
-}
-
-bool DeviceSplitKernel::path_trace(DeviceTask &task,
- RenderTile &tile,
- device_memory &kgbuffer,
- device_memory &kernel_data)
-{
- if (device->have_error()) {
- return false;
- }
-
- /* Allocate all required global memory once. */
- if (!kernel_data_initialized) {
- kernel_data_initialized = true;
-
- /* Set local size */
- int2 lsize = split_kernel_local_size();
- local_size[0] = lsize[0];
- local_size[1] = lsize[1];
-
- /* Set global size */
- int2 gsize = split_kernel_global_size(kgbuffer, kernel_data, task);
-
- /* Make sure that set work size is a multiple of local
- * work size dimensions.
- */
- global_size[0] = round_up(gsize[0], local_size[0]);
- global_size[1] = round_up(gsize[1], local_size[1]);
-
- int num_global_elements = global_size[0] * global_size[1];
- assert(num_global_elements % WORK_POOL_SIZE == 0);
-
- /* Calculate max groups */
-
- /* Denotes the maximum work groups possible w.r.t. current requested tile size. */
- unsigned int work_pool_size = (device->info.type == DEVICE_CPU) ? WORK_POOL_SIZE_CPU :
- WORK_POOL_SIZE_GPU;
- unsigned int max_work_groups = num_global_elements / work_pool_size + 1;
-
- /* Allocate work_pool_wgs memory. */
- work_pool_wgs.alloc_to_device(max_work_groups);
- queue_index.alloc_to_device(NUM_QUEUES);
- use_queues_flag.alloc_to_device(1);
- split_data.alloc_to_device(state_buffer_size(kgbuffer, kernel_data, num_global_elements));
- ray_state.alloc(num_global_elements);
- }
-
- /* Number of elements in the global state buffer */
- int num_global_elements = global_size[0] * global_size[1];
-
-#define ENQUEUE_SPLIT_KERNEL(name, global_size, local_size) \
- if (device->have_error()) { \
- return false; \
- } \
- if (!kernel_##name->enqueue( \
- KernelDimensions(global_size, local_size), kgbuffer, kernel_data)) { \
- return false; \
- }
-
- tile.sample = tile.start_sample;
-
- /* for exponential increase between tile updates */
- int time_multiplier = 1;
-
- while (tile.sample < tile.start_sample + tile.num_samples) {
- /* to keep track of how long it takes to run a number of samples */
- double start_time = time_dt();
-
- /* initial guess to start rolling average */
- const int initial_num_samples = 1;
- /* approx number of samples per second */
- const int samples_per_second = (avg_time_per_sample > 0.0) ?
- int(double(time_multiplier) / avg_time_per_sample) + 1 :
- initial_num_samples;
-
- RenderTile subtile = tile;
- subtile.start_sample = tile.sample;
- subtile.num_samples = samples_per_second;
-
- if (task.adaptive_sampling.use) {
- subtile.num_samples = task.adaptive_sampling.align_samples(subtile.start_sample,
- subtile.num_samples);
- }
-
- /* Don't go beyond requested number of samples. */
- subtile.num_samples = min(subtile.num_samples,
- tile.start_sample + tile.num_samples - tile.sample);
-
- if (device->have_error()) {
- return false;
- }
-
- /* Reset state memory here, as the global size for the data_init
-  * kernel might not be large enough to do it in the kernel.
-  */
- work_pool_wgs.zero_to_device();
- split_data.zero_to_device();
- ray_state.zero_to_device();
-
- if (!enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size),
- subtile,
- num_global_elements,
- kgbuffer,
- kernel_data,
- split_data,
- ray_state,
- queue_index,
- use_queues_flag,
- work_pool_wgs)) {
- return false;
- }
-
- ENQUEUE_SPLIT_KERNEL(path_init, global_size, local_size);
-
- bool activeRaysAvailable = true;
- double cancel_time = DBL_MAX;
-
- while (activeRaysAvailable) {
- /* Do path-iteration on the host [Enqueue path-iteration kernels]. */
- for (int PathIter = 0; PathIter < 16; PathIter++) {
- ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size);
- if (kernel_do_volume) {
- ENQUEUE_SPLIT_KERNEL(do_volume, global_size, local_size);
- }
- ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(indirect_background, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(shader_setup, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(shader_sort, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(
- holdout_emission_blurring_pathtermination_ao, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(subsurface_scatter, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(shadow_blocked_ao, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(shadow_blocked_dl, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(enqueue_inactive, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(indirect_subsurface, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(buffer_update, global_size, local_size);
-
- if (task.get_cancel() && cancel_time == DBL_MAX) {
- /* Wait up to twice as many seconds for current samples to finish
- * to avoid artifacts in render result from ending too soon.
- */
- cancel_time = time_dt() + 2.0 * time_multiplier;
- }
-
- if (time_dt() > cancel_time) {
- return true;
- }
- }
-
- /* Decide if we should exit path-iteration in host. */
- ray_state.copy_from_device(0, global_size[0] * global_size[1], 1);
-
- activeRaysAvailable = false;
-
- for (int rayStateIter = 0; rayStateIter < global_size[0] * global_size[1]; ++rayStateIter) {
- if (!IS_STATE(ray_state.data(), rayStateIter, RAY_INACTIVE)) {
- if (IS_STATE(ray_state.data(), rayStateIter, RAY_INVALID)) {
- /* Something went wrong, abort to avoid looping endlessly. */
- device->set_error("Split kernel error: invalid ray state");
- return false;
- }
-
- /* Not all rays are RAY_INACTIVE. */
- activeRaysAvailable = true;
- break;
- }
- }
-
- if (time_dt() > cancel_time) {
- return true;
- }
- }
-
- int filter_sample = tile.sample + subtile.num_samples - 1;
- if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
- size_t buffer_size[2];
- buffer_size[0] = round_up(tile.w, local_size[0]);
- buffer_size[1] = round_up(tile.h, local_size[1]);
- kernel_adaptive_stopping->enqueue(
- KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
- buffer_size[0] = round_up(tile.h, local_size[0]);
- buffer_size[1] = round_up(1, local_size[1]);
- kernel_adaptive_filter_x->enqueue(
- KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
- buffer_size[0] = round_up(tile.w, local_size[0]);
- buffer_size[1] = round_up(1, local_size[1]);
- kernel_adaptive_filter_y->enqueue(
- KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
- }
-
- double time_per_sample = ((time_dt() - start_time) / subtile.num_samples);
-
- if (avg_time_per_sample == 0.0) {
- /* start rolling average */
- avg_time_per_sample = time_per_sample;
- }
- else {
- avg_time_per_sample = alpha * time_per_sample + (1.0 - alpha) * avg_time_per_sample;
- }
-
-#undef ENQUEUE_SPLIT_KERNEL
-
- tile.sample += subtile.num_samples;
- task.update_progress(&tile, tile.w * tile.h * subtile.num_samples);
-
- time_multiplier = min(time_multiplier << 1, 10);
-
- if (task.get_cancel()) {
- return true;
- }
- }
-
- if (task.adaptive_sampling.use) {
- /* Reset the start samples. */
- RenderTile subtile = tile;
- subtile.start_sample = tile.start_sample;
- subtile.num_samples = tile.sample - tile.start_sample;
- enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size),
- subtile,
- num_global_elements,
- kgbuffer,
- kernel_data,
- split_data,
- ray_state,
- queue_index,
- use_queues_flag,
- work_pool_wgs);
- size_t buffer_size[2];
- buffer_size[0] = round_up(tile.w, local_size[0]);
- buffer_size[1] = round_up(tile.h, local_size[1]);
- kernel_adaptive_adjust_samples->enqueue(
- KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
- }
-
- return true;
-}
-
-CCL_NAMESPACE_END
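The removed path_trace loop sized each subtile from a rolling estimate: samples_per_second is roughly time_multiplier / avg_time_per_sample, and after each batch the average is updated as an exponential moving average with alpha = 0.1 while the update interval doubles up to 10 seconds. A minimal numeric sketch of that estimator; the constant per-sample cost is made up for illustration.

#include <algorithm>
#include <cstdio>

int main()
{
  const double alpha = 0.1;       /* smoothing factor, as in the removed code */
  double avg_time_per_sample = 0; /* seconds per sample, 0 means "no estimate yet" */
  int time_multiplier = 1;        /* target seconds between tile updates */

  /* Pretend every sample really costs 0.05 seconds and run a few batches. */
  const double true_cost = 0.05;
  for (int batch = 0; batch < 5; batch++) {
    const int num_samples = (avg_time_per_sample > 0.0) ?
                                int(double(time_multiplier) / avg_time_per_sample) + 1 :
                                1;
    const double elapsed = num_samples * true_cost;
    const double time_per_sample = elapsed / num_samples;

    /* Start the rolling average on the first batch, then blend with weight alpha. */
    avg_time_per_sample = (avg_time_per_sample == 0.0) ?
                              time_per_sample :
                              alpha * time_per_sample + (1.0 - alpha) * avg_time_per_sample;

    time_multiplier = std::min(time_multiplier << 1, 10);
    printf("batch %d: %d samples, avg %.3f s/sample\n", batch, num_samples, avg_time_per_sample);
  }
  return 0;
}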
diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h
deleted file mode 100644
index 07a21b10299..00000000000
--- a/intern/cycles/device/device_split_kernel.h
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Copyright 2011-2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __DEVICE_SPLIT_KERNEL_H__
-#define __DEVICE_SPLIT_KERNEL_H__
-
-#include "device/device.h"
-#include "render/buffers.h"
-
-CCL_NAMESPACE_BEGIN
-
-/* When allocating global memory in chunks, we may not be able to
- * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes in chunks,
- * since some bytes may be needed for aligning the chunks of memory.
- * This is the amount of memory that we dedicate for that purpose.
- */
-#define DATA_ALLOCATION_MEM_FACTOR 5000000 // 5MB
-
-/* Types used for split kernel */
-
-class KernelDimensions {
- public:
- size_t global_size[2];
- size_t local_size[2];
-
- KernelDimensions(size_t global_size_[2], size_t local_size_[2])
- {
- memcpy(global_size, global_size_, sizeof(global_size));
- memcpy(local_size, local_size_, sizeof(local_size));
- }
-};
-
-class SplitKernelFunction {
- public:
- virtual ~SplitKernelFunction()
- {
- }
-
- /* enqueue the kernel, returns false if there is an error */
- virtual bool enqueue(const KernelDimensions &dim, device_memory &kg, device_memory &data) = 0;
-};
-
-class DeviceSplitKernel {
- private:
- Device *device;
-
- SplitKernelFunction *kernel_path_init;
- SplitKernelFunction *kernel_scene_intersect;
- SplitKernelFunction *kernel_lamp_emission;
- SplitKernelFunction *kernel_do_volume;
- SplitKernelFunction *kernel_queue_enqueue;
- SplitKernelFunction *kernel_indirect_background;
- SplitKernelFunction *kernel_shader_setup;
- SplitKernelFunction *kernel_shader_sort;
- SplitKernelFunction *kernel_shader_eval;
- SplitKernelFunction *kernel_holdout_emission_blurring_pathtermination_ao;
- SplitKernelFunction *kernel_subsurface_scatter;
- SplitKernelFunction *kernel_direct_lighting;
- SplitKernelFunction *kernel_shadow_blocked_ao;
- SplitKernelFunction *kernel_shadow_blocked_dl;
- SplitKernelFunction *kernel_enqueue_inactive;
- SplitKernelFunction *kernel_next_iteration_setup;
- SplitKernelFunction *kernel_indirect_subsurface;
- SplitKernelFunction *kernel_buffer_update;
- SplitKernelFunction *kernel_adaptive_stopping;
- SplitKernelFunction *kernel_adaptive_filter_x;
- SplitKernelFunction *kernel_adaptive_filter_y;
- SplitKernelFunction *kernel_adaptive_adjust_samples;
-
- /* Global memory variables [porting]; this memory is used for
-  * co-operation between different kernels; data written by one
-  * kernel will be available to another kernel via this global
-  * memory.
-  */
- device_only_memory<uchar> split_data;
- device_vector<uchar> ray_state;
- device_only_memory<int>
- queue_index; /* Array of size num_queues that tracks the size of each queue. */
-
- /* Flag to make the scene_intersect and lamp_emission kernels use queues. */
- device_only_memory<char> use_queues_flag;
-
- /* Approximate time it takes to complete one sample */
- double avg_time_per_sample;
-
- /* Work pool with respect to each work group. */
- device_only_memory<unsigned int> work_pool_wgs;
-
- /* Cached kernel-dependent data, initialized once. */
- bool kernel_data_initialized;
- size_t local_size[2];
- size_t global_size[2];
-
- public:
- explicit DeviceSplitKernel(Device *device);
- virtual ~DeviceSplitKernel();
-
- bool load_kernels(const DeviceRequestedFeatures &requested_features);
- bool path_trace(DeviceTask &task,
- RenderTile &rtile,
- device_memory &kgbuffer,
- device_memory &kernel_data);
-
- virtual uint64_t state_buffer_size(device_memory &kg,
- device_memory &data,
- size_t num_threads) = 0;
- size_t max_elements_for_max_buffer_size(device_memory &kg,
- device_memory &data,
- uint64_t max_buffer_size);
-
- virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim,
- RenderTile &rtile,
- int num_global_elements,
- device_memory &kernel_globals,
- device_memory &kernel_data_,
- device_memory &split_data,
- device_memory &ray_state,
- device_memory &queue_index,
- device_memory &use_queues_flag,
- device_memory &work_pool_wgs) = 0;
-
- virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name,
- const DeviceRequestedFeatures &) = 0;
- virtual int2 split_kernel_local_size() = 0;
- virtual int2 split_kernel_global_size(device_memory &kg,
- device_memory &data,
- DeviceTask &task) = 0;
-};
-
-CCL_NAMESPACE_END
-
-#endif /* __DEVICE_SPLIT_KERNEL_H__ */
diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp
deleted file mode 100644
index 55fbaa31e42..00000000000
--- a/intern/cycles/device/device_task.cpp
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <stdlib.h>
-#include <string.h>
-
-#include "device/device_task.h"
-
-#include "render/buffers.h"
-
-#include "util/util_algorithm.h"
-#include "util/util_time.h"
-
-CCL_NAMESPACE_BEGIN
-
-/* Device Task */
-
-DeviceTask::DeviceTask(Type type_)
- : type(type_),
- x(0),
- y(0),
- w(0),
- h(0),
- rgba_byte(0),
- rgba_half(0),
- buffer(0),
- sample(0),
- num_samples(1),
- shader_input(0),
- shader_output(0),
- shader_eval_type(0),
- shader_filter(0),
- shader_x(0),
- shader_w(0),
- buffers(nullptr),
- tile_types(0),
- denoising_from_render(false),
- pass_stride(0),
- frame_stride(0),
- target_pass_stride(0),
- pass_denoising_data(0),
- pass_denoising_clean(0),
- need_finish_queue(false),
- integrator_branched(false)
-{
- last_update_time = time_dt();
-}
-
-int DeviceTask::get_subtask_count(int num, int max_size) const
-{
- if (max_size != 0) {
- int max_size_num;
-
- if (type == SHADER) {
- max_size_num = (shader_w + max_size - 1) / max_size;
- }
- else {
- max_size = max(1, max_size / w);
- max_size_num = (h + max_size - 1) / max_size;
- }
-
- num = max(max_size_num, num);
- }
-
- if (type == SHADER) {
- num = min(shader_w, num);
- }
- else if (type == RENDER) {
- }
- else {
- num = min(h, num);
- }
-
- return num;
-}
-
-void DeviceTask::split(list<DeviceTask> &tasks, int num, int max_size) const
-{
- num = get_subtask_count(num, max_size);
-
- if (type == SHADER) {
- for (int i = 0; i < num; i++) {
- int tx = shader_x + (shader_w / num) * i;
- int tw = (i == num - 1) ? shader_w - i * (shader_w / num) : shader_w / num;
-
- DeviceTask task = *this;
-
- task.shader_x = tx;
- task.shader_w = tw;
-
- tasks.push_back(task);
- }
- }
- else if (type == RENDER) {
- for (int i = 0; i < num; i++)
- tasks.push_back(*this);
- }
- else {
- for (int i = 0; i < num; i++) {
- int ty = y + (h / num) * i;
- int th = (i == num - 1) ? h - i * (h / num) : h / num;
-
- DeviceTask task = *this;
-
- task.y = ty;
- task.h = th;
-
- tasks.push_back(task);
- }
- }
-}
-
-void DeviceTask::update_progress(RenderTile *rtile, int pixel_samples)
-{
- if (type == FILM_CONVERT)
- return;
-
- if (update_progress_sample) {
- if (pixel_samples == -1) {
- pixel_samples = shader_w;
- }
- update_progress_sample(pixel_samples, rtile ? rtile->sample : 0);
- }
-
- if (update_tile_sample) {
- double current_time = time_dt();
-
- if (current_time - last_update_time >= 1.0) {
- update_tile_sample(*rtile);
-
- last_update_time = current_time;
- }
- }
-}
-
-/* Adaptive Sampling */
-
-AdaptiveSampling::AdaptiveSampling() : use(true), adaptive_step(0), min_samples(0)
-{
-}
-
-/* Render samples in steps that align with the adaptive filtering. */
-int AdaptiveSampling::align_samples(int sample, int num_samples) const
-{
- int end_sample = sample + num_samples;
-
- /* Round down end sample to the nearest sample that needs filtering. */
- end_sample &= ~(adaptive_step - 1);
-
- if (end_sample <= sample) {
- /* In order to reach the next sample that needs filtering, we'd need
- * to increase num_samples. We don't do that in this function, so
- * just keep it as is and don't filter this time around. */
- return num_samples;
- }
- return end_sample - sample;
-}
-
-bool AdaptiveSampling::need_filter(int sample) const
-{
- if (sample > min_samples) {
- return (sample & (adaptive_step - 1)) == (adaptive_step - 1);
- }
- else {
- return false;
- }
-}
-
-CCL_NAMESPACE_END
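align_samples above rounds the end sample down to a multiple of adaptive_step via the bitmask trick, which assumes adaptive_step is a power of two, and need_filter fires on the last sample of each step once min_samples has been exceeded. A small worked sketch under that power-of-two assumption; the concrete numbers are illustrative.

#include <cassert>

int main()
{
  const int adaptive_step = 4; /* must be a power of two for the & ~(step - 1) trick */

  /* Requesting 7 samples starting at sample 5: end would be 12, already a multiple of 4,
   * so all 7 samples are rendered before the next filter pass. */
  int sample = 5, num_samples = 7;
  int end_sample = (sample + num_samples) & ~(adaptive_step - 1); /* 12 */
  assert(end_sample - sample == 7);

  /* Requesting 2 samples starting at sample 5: end rounds down to 4, which is not past
   * the start, so the batch is left as-is and no filtering happens this time around. */
  sample = 5;
  num_samples = 2;
  end_sample = (sample + num_samples) & ~(adaptive_step - 1); /* 4 <= 5 */
  assert(end_sample <= sample);

  /* need_filter is true on samples 3, 7, 11, ... once past min_samples. */
  assert((7 & (adaptive_step - 1)) == adaptive_step - 1);
  return 0;
}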
diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h
deleted file mode 100644
index 3f7cf47b692..00000000000
--- a/intern/cycles/device/device_task.h
+++ /dev/null
@@ -1,188 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __DEVICE_TASK_H__
-#define __DEVICE_TASK_H__
-
-#include "device/device_memory.h"
-
-#include "util/util_function.h"
-#include "util/util_list.h"
-
-CCL_NAMESPACE_BEGIN
-
-/* Device Task */
-
-class Device;
-class RenderBuffers;
-class RenderTile;
-class RenderTileNeighbors;
-class Tile;
-
-enum DenoiserType {
- DENOISER_NLM = 1,
- DENOISER_OPTIX = 2,
- DENOISER_OPENIMAGEDENOISE = 4,
- DENOISER_NUM,
-
- DENOISER_NONE = 0,
- DENOISER_ALL = ~0,
-};
-
-enum DenoiserInput {
- DENOISER_INPUT_RGB = 1,
- DENOISER_INPUT_RGB_ALBEDO = 2,
- DENOISER_INPUT_RGB_ALBEDO_NORMAL = 3,
-
- DENOISER_INPUT_NUM,
-};
-
-typedef int DenoiserTypeMask;
-
-class DenoiseParams {
- public:
- /* Apply denoiser to image. */
- bool use;
- /* Output denoising data passes (possibly without applying the denoiser). */
- bool store_passes;
-
- /* Denoiser type. */
- DenoiserType type;
-
- /* Viewport start sample. */
- int start_sample;
-
- /** Native Denoiser. */
-
- /* Pixel radius for neighboring pixels to take into account. */
- int radius;
- /* Controls neighbor pixel weighting for the denoising filter. */
- float strength;
- /* Preserve more or less detail based on feature passes. */
- float feature_strength;
- /* When removing pixels that don't carry information,
- * use a relative threshold instead of an absolute one. */
- bool relative_pca;
- /* How many frames before and after the current center frame are included. */
- int neighbor_frames;
- /* Clamp the input to the range of +-1e8. Should be enough for any legitimate data. */
- bool clamp_input;
-
- /** OIDN/Optix Denoiser. */
-
- /* Passes handed over to the OIDN/OptiX denoiser (default to color + albedo). */
- DenoiserInput input_passes;
-
- DenoiseParams()
- {
- use = false;
- store_passes = false;
-
- type = DENOISER_NLM;
-
- radius = 8;
- strength = 0.5f;
- feature_strength = 0.5f;
- relative_pca = false;
- neighbor_frames = 2;
- clamp_input = true;
-
- /* Default to color + albedo only, since normal input does not always have the desired effect
- * when denoising with OptiX. */
- input_passes = DENOISER_INPUT_RGB_ALBEDO;
-
- start_sample = 0;
- }
-
- /* Test if a denoising task needs to run, also to prefilter passes for the native
- * denoiser when we are not applying denoising to the combined image. */
- bool need_denoising_task() const
- {
- return (use || (store_passes && type == DENOISER_NLM));
- }
-};
-
-class AdaptiveSampling {
- public:
- AdaptiveSampling();
-
- int align_samples(int sample, int num_samples) const;
- bool need_filter(int sample) const;
-
- bool use;
- int adaptive_step;
- int min_samples;
-};
-
-class DeviceTask {
- public:
- typedef enum { RENDER, FILM_CONVERT, SHADER, DENOISE_BUFFER } Type;
- Type type;
-
- int x, y, w, h;
- device_ptr rgba_byte;
- device_ptr rgba_half;
- device_ptr buffer;
- int sample;
- int num_samples;
- int offset, stride;
-
- device_ptr shader_input;
- device_ptr shader_output;
- int shader_eval_type;
- int shader_filter;
- int shader_x, shader_w;
-
- RenderBuffers *buffers;
-
- explicit DeviceTask(Type type = RENDER);
-
- int get_subtask_count(int num, int max_size = 0) const;
- void split(list<DeviceTask> &tasks, int num, int max_size = 0) const;
-
- void update_progress(RenderTile *rtile, int pixel_samples = -1);
-
- function<bool(Device *device, RenderTile &, uint)> acquire_tile;
- function<void(long, int)> update_progress_sample;
- function<void(RenderTile &)> update_tile_sample;
- function<void(RenderTile &)> release_tile;
- function<bool()> get_cancel;
- function<bool()> get_tile_stolen;
- function<void(RenderTileNeighbors &, Device *)> map_neighbor_tiles;
- function<void(RenderTileNeighbors &, Device *)> unmap_neighbor_tiles;
-
- uint tile_types;
- DenoiseParams denoising;
- bool denoising_from_render;
- vector<int> denoising_frames;
-
- int pass_stride;
- int frame_stride;
- int target_pass_stride;
- int pass_denoising_data;
- int pass_denoising_clean;
-
- bool need_finish_queue;
- bool integrator_branched;
- AdaptiveSampling adaptive_sampling;
-
- protected:
- double last_update_time;
-};
-
-CCL_NAMESPACE_END
-
-#endif /* __DEVICE_TASK_H__ */
diff --git a/intern/cycles/device/device_dummy.cpp b/intern/cycles/device/dummy/device.cpp
index 5112fc152e5..678276ed025 100644
--- a/intern/cycles/device/device_dummy.cpp
+++ b/intern/cycles/device/dummy/device.cpp
@@ -14,8 +14,10 @@
* limitations under the License.
*/
+#include "device/dummy/device.h"
+
#include "device/device.h"
-#include "device/device_intern.h"
+#include "device/device_queue.h"
CCL_NAMESPACE_BEGIN
@@ -23,8 +25,8 @@ CCL_NAMESPACE_BEGIN
class DummyDevice : public Device {
public:
- DummyDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
- : Device(info_, stats_, profiler_, background_)
+ DummyDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_)
+ : Device(info_, stats_, profiler_)
{
error_msg = info.error_msg;
}
@@ -61,23 +63,11 @@ class DummyDevice : public Device {
virtual void const_copy_to(const char *, void *, size_t) override
{
}
-
- virtual void task_add(DeviceTask &) override
- {
- }
-
- virtual void task_wait() override
- {
- }
-
- virtual void task_cancel() override
- {
- }
};
-Device *device_dummy_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
+Device *device_dummy_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
{
- return new DummyDevice(info, stats, profiler, background);
+ return new DummyDevice(info, stats, profiler);
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl b/intern/cycles/device/dummy/device.h
index 8afaa686e28..832a9568129 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl
+++ b/intern/cycles/device/dummy/device.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2011-2017 Blender Foundation
+ * Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,11 +14,18 @@
* limitations under the License.
*/
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_do_volume.h"
+#pragma once
-#define KERNEL_NAME do_volume
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
+#include "util/util_string.h"
+#include "util/util_vector.h"
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class DeviceInfo;
+class Profiler;
+class Stats;
+
+Device *device_dummy_create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/multi/device.cpp b/intern/cycles/device/multi/device.cpp
new file mode 100644
index 00000000000..6dbcce2d9a5
--- /dev/null
+++ b/intern/cycles/device/multi/device.cpp
@@ -0,0 +1,423 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/multi/device.h"
+
+#include <sstream>
+#include <stdlib.h>
+
+#include "bvh/bvh_multi.h"
+
+#include "device/device.h"
+#include "device/device_queue.h"
+
+#include "render/buffers.h"
+#include "render/geometry.h"
+
+#include "util/util_foreach.h"
+#include "util/util_list.h"
+#include "util/util_logging.h"
+#include "util/util_map.h"
+#include "util/util_time.h"
+
+CCL_NAMESPACE_BEGIN
+
+class MultiDevice : public Device {
+ public:
+ struct SubDevice {
+ Stats stats;
+ Device *device;
+ map<device_ptr, device_ptr> ptr_map;
+ int peer_island_index = -1;
+ };
+
+ list<SubDevice> devices;
+ device_ptr unique_key;
+ vector<vector<SubDevice *>> peer_islands;
+
+ MultiDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+ : Device(info, stats, profiler), unique_key(1)
+ {
+ foreach (const DeviceInfo &subinfo, info.multi_devices) {
+ /* Always add CPU devices at the back since GPU devices can change
+  * host memory pointers, which the CPU uses as device pointers. */
+ SubDevice *sub;
+ if (subinfo.type == DEVICE_CPU) {
+ devices.emplace_back();
+ sub = &devices.back();
+ }
+ else {
+ devices.emplace_front();
+ sub = &devices.front();
+ }
+
+ /* The pointer to 'sub->stats' will stay valid even after new devices
+ * are added, since 'devices' is a linked list. */
+ sub->device = Device::create(subinfo, sub->stats, profiler);
+ }
+
+ /* Build a list of peer islands for the available render devices */
+ foreach (SubDevice &sub, devices) {
+ /* First ensure that every device is in at least one peer island */
+ if (sub.peer_island_index < 0) {
+ peer_islands.emplace_back();
+ sub.peer_island_index = (int)peer_islands.size() - 1;
+ peer_islands[sub.peer_island_index].push_back(&sub);
+ }
+
+ if (!info.has_peer_memory) {
+ continue;
+ }
+
+ /* Second check peer access between devices and fill up the islands accordingly */
+ foreach (SubDevice &peer_sub, devices) {
+ if (peer_sub.peer_island_index < 0 &&
+ peer_sub.device->info.type == sub.device->info.type &&
+ peer_sub.device->check_peer_access(sub.device)) {
+ peer_sub.peer_island_index = sub.peer_island_index;
+ peer_islands[sub.peer_island_index].push_back(&peer_sub);
+ }
+ }
+ }
+ }
+
+ ~MultiDevice()
+ {
+ foreach (SubDevice &sub, devices)
+ delete sub.device;
+ }
+
+ const string &error_message() override
+ {
+ error_msg.clear();
+
+ foreach (SubDevice &sub, devices)
+ error_msg += sub.device->error_message();
+
+ return error_msg;
+ }
+
+ virtual bool show_samples() const override
+ {
+ if (devices.size() > 1) {
+ return false;
+ }
+ return devices.front().device->show_samples();
+ }
+
+ virtual BVHLayoutMask get_bvh_layout_mask() const override
+ {
+ BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_ALL;
+ BVHLayoutMask bvh_layout_mask_all = BVH_LAYOUT_NONE;
+ foreach (const SubDevice &sub_device, devices) {
+ BVHLayoutMask device_bvh_layout_mask = sub_device.device->get_bvh_layout_mask();
+ bvh_layout_mask &= device_bvh_layout_mask;
+ bvh_layout_mask_all |= device_bvh_layout_mask;
+ }
+
+ /* With multiple OptiX devices, every device needs its own acceleration structure */
+ if (bvh_layout_mask == BVH_LAYOUT_OPTIX) {
+ return BVH_LAYOUT_MULTI_OPTIX;
+ }
+
+ /* When devices do not share a common BVH layout, fall back to creating one for each */
+ const BVHLayoutMask BVH_LAYOUT_OPTIX_EMBREE = (BVH_LAYOUT_OPTIX | BVH_LAYOUT_EMBREE);
+ if ((bvh_layout_mask_all & BVH_LAYOUT_OPTIX_EMBREE) == BVH_LAYOUT_OPTIX_EMBREE) {
+ return BVH_LAYOUT_MULTI_OPTIX_EMBREE;
+ }
+
+ return bvh_layout_mask;
+ }
+
+ bool load_kernels(const uint kernel_features) override
+ {
+ foreach (SubDevice &sub, devices)
+ if (!sub.device->load_kernels(kernel_features))
+ return false;
+
+ return true;
+ }
+
+ void build_bvh(BVH *bvh, Progress &progress, bool refit) override
+ {
+ /* Try to build and share a single acceleration structure, if possible */
+ if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2 || bvh->params.bvh_layout == BVH_LAYOUT_EMBREE) {
+ devices.back().device->build_bvh(bvh, progress, refit);
+ return;
+ }
+
+ assert(bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX ||
+ bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE);
+
+ BVHMulti *const bvh_multi = static_cast<BVHMulti *>(bvh);
+ bvh_multi->sub_bvhs.resize(devices.size());
+
+ vector<BVHMulti *> geom_bvhs;
+ geom_bvhs.reserve(bvh->geometry.size());
+ foreach (Geometry *geom, bvh->geometry) {
+ geom_bvhs.push_back(static_cast<BVHMulti *>(geom->bvh));
+ }
+
+ /* Broadcast acceleration structure build to all render devices */
+ size_t i = 0;
+ foreach (SubDevice &sub, devices) {
+ /* Change geometry BVH pointers to the sub BVH */
+ for (size_t k = 0; k < bvh->geometry.size(); ++k) {
+ bvh->geometry[k]->bvh = geom_bvhs[k]->sub_bvhs[i];
+ }
+
+ if (!bvh_multi->sub_bvhs[i]) {
+ BVHParams params = bvh->params;
+ if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX)
+ params.bvh_layout = BVH_LAYOUT_OPTIX;
+ else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE)
+ params.bvh_layout = sub.device->info.type == DEVICE_OPTIX ? BVH_LAYOUT_OPTIX :
+ BVH_LAYOUT_EMBREE;
+
+ /* Skip building a bottom level acceleration structure for non-instanced geometry on Embree
+ * (since they are put into the top level directly, see bvh_embree.cpp) */
+ if (!params.top_level && params.bvh_layout == BVH_LAYOUT_EMBREE &&
+ !bvh->geometry[0]->is_instanced()) {
+ i++;
+ continue;
+ }
+
+ bvh_multi->sub_bvhs[i] = BVH::create(params, bvh->geometry, bvh->objects, sub.device);
+ }
+
+ sub.device->build_bvh(bvh_multi->sub_bvhs[i], progress, refit);
+ i++;
+ }
+
+ /* Change geometry BVH pointers back to the multi BVH. */
+ for (size_t k = 0; k < bvh->geometry.size(); ++k) {
+ bvh->geometry[k]->bvh = geom_bvhs[k];
+ }
+ }
+
+ virtual void *get_cpu_osl_memory() override
+ {
+ if (devices.size() > 1) {
+ return NULL;
+ }
+ return devices.front().device->get_cpu_osl_memory();
+ }
+
+ bool is_resident(device_ptr key, Device *sub_device) override
+ {
+ foreach (SubDevice &sub, devices) {
+ if (sub.device == sub_device) {
+ return find_matching_mem_device(key, sub)->device == sub_device;
+ }
+ }
+ return false;
+ }
+
+ SubDevice *find_matching_mem_device(device_ptr key, SubDevice &sub)
+ {
+ assert(key != 0 && (sub.peer_island_index >= 0 || sub.ptr_map.find(key) != sub.ptr_map.end()));
+
+ /* Get the memory owner of this key (first try current device, then peer devices) */
+ SubDevice *owner_sub = &sub;
+ if (owner_sub->ptr_map.find(key) == owner_sub->ptr_map.end()) {
+ foreach (SubDevice *island_sub, peer_islands[sub.peer_island_index]) {
+ if (island_sub != owner_sub &&
+ island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) {
+ owner_sub = island_sub;
+ }
+ }
+ }
+ return owner_sub;
+ }
+
+ SubDevice *find_suitable_mem_device(device_ptr key, const vector<SubDevice *> &island)
+ {
+ assert(!island.empty());
+
+ /* Get the memory owner of this key or the device with the lowest memory usage when new */
+ SubDevice *owner_sub = island.front();
+ foreach (SubDevice *island_sub, island) {
+ if (key ? (island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) :
+ (island_sub->device->stats.mem_used < owner_sub->device->stats.mem_used)) {
+ owner_sub = island_sub;
+ }
+ }
+ return owner_sub;
+ }
+
+ inline device_ptr find_matching_mem(device_ptr key, SubDevice &sub)
+ {
+ return find_matching_mem_device(key, sub)->ptr_map[key];
+ }
+
+ void mem_alloc(device_memory &mem) override
+ {
+ device_ptr key = unique_key++;
+
+ assert(mem.type == MEM_READ_ONLY || mem.type == MEM_READ_WRITE || mem.type == MEM_DEVICE_ONLY);
+ /* The remaining memory types can be distributed across devices */
+ foreach (const vector<SubDevice *> &island, peer_islands) {
+ SubDevice *owner_sub = find_suitable_mem_device(key, island);
+ mem.device = owner_sub->device;
+ mem.device_pointer = 0;
+ mem.device_size = 0;
+
+ owner_sub->device->mem_alloc(mem);
+ owner_sub->ptr_map[key] = mem.device_pointer;
+ }
+
+ mem.device = this;
+ mem.device_pointer = key;
+ stats.mem_alloc(mem.device_size);
+ }
+
+ void mem_copy_to(device_memory &mem) override
+ {
+ device_ptr existing_key = mem.device_pointer;
+ device_ptr key = (existing_key) ? existing_key : unique_key++;
+ size_t existing_size = mem.device_size;
+
+ /* The tile buffers are allocated on each device (see below), so copy to all of them */
+ foreach (const vector<SubDevice *> &island, peer_islands) {
+ SubDevice *owner_sub = find_suitable_mem_device(existing_key, island);
+ mem.device = owner_sub->device;
+ mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0;
+ mem.device_size = existing_size;
+
+ owner_sub->device->mem_copy_to(mem);
+ owner_sub->ptr_map[key] = mem.device_pointer;
+
+ if (mem.type == MEM_GLOBAL || mem.type == MEM_TEXTURE) {
+ /* Need to create texture objects and update pointer in kernel globals on all devices */
+ foreach (SubDevice *island_sub, island) {
+ if (island_sub != owner_sub) {
+ island_sub->device->mem_copy_to(mem);
+ }
+ }
+ }
+ }
+
+ mem.device = this;
+ mem.device_pointer = key;
+ stats.mem_alloc(mem.device_size - existing_size);
+ }
+
+ void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override
+ {
+ device_ptr key = mem.device_pointer;
+ int i = 0, sub_h = h / devices.size();
+
+ foreach (SubDevice &sub, devices) {
+ int sy = y + i * sub_h;
+ int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h;
+
+ SubDevice *owner_sub = find_matching_mem_device(key, sub);
+ mem.device = owner_sub->device;
+ mem.device_pointer = owner_sub->ptr_map[key];
+
+ owner_sub->device->mem_copy_from(mem, sy, w, sh, elem);
+ i++;
+ }
+
+ mem.device = this;
+ mem.device_pointer = key;
+ }
+
+ void mem_zero(device_memory &mem) override
+ {
+ device_ptr existing_key = mem.device_pointer;
+ device_ptr key = (existing_key) ? existing_key : unique_key++;
+ size_t existing_size = mem.device_size;
+
+ foreach (const vector<SubDevice *> &island, peer_islands) {
+ SubDevice *owner_sub = find_suitable_mem_device(existing_key, island);
+ mem.device = owner_sub->device;
+ mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0;
+ mem.device_size = existing_size;
+
+ owner_sub->device->mem_zero(mem);
+ owner_sub->ptr_map[key] = mem.device_pointer;
+ }
+
+ mem.device = this;
+ mem.device_pointer = key;
+ stats.mem_alloc(mem.device_size - existing_size);
+ }
+
+ void mem_free(device_memory &mem) override
+ {
+ device_ptr key = mem.device_pointer;
+ size_t existing_size = mem.device_size;
+
+ /* Free memory that was allocated for all devices (see above) on each device */
+ foreach (const vector<SubDevice *> &island, peer_islands) {
+ SubDevice *owner_sub = find_matching_mem_device(key, *island.front());
+ mem.device = owner_sub->device;
+ mem.device_pointer = owner_sub->ptr_map[key];
+ mem.device_size = existing_size;
+
+ owner_sub->device->mem_free(mem);
+ owner_sub->ptr_map.erase(owner_sub->ptr_map.find(key));
+
+ if (mem.type == MEM_TEXTURE) {
+ /* Free texture objects on all devices */
+ foreach (SubDevice *island_sub, island) {
+ if (island_sub != owner_sub) {
+ island_sub->device->mem_free(mem);
+ }
+ }
+ }
+ }
+
+ mem.device = this;
+ mem.device_pointer = 0;
+ mem.device_size = 0;
+ stats.mem_free(existing_size);
+ }
+
+ void const_copy_to(const char *name, void *host, size_t size) override
+ {
+ foreach (SubDevice &sub, devices)
+ sub.device->const_copy_to(name, host, size);
+ }
+
+ int device_number(Device *sub_device) override
+ {
+ int i = 0;
+
+ foreach (SubDevice &sub, devices) {
+ if (sub.device == sub_device)
+ return i;
+ i++;
+ }
+
+ return -1;
+ }
+
+ virtual void foreach_device(const function<void(Device *)> &callback) override
+ {
+ foreach (SubDevice &sub, devices) {
+ sub.device->foreach_device(callback);
+ }
+ }
+};
+
+Device *device_multi_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+{
+ return new MultiDevice(info, stats, profiler);
+}
+
+CCL_NAMESPACE_END
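find_suitable_mem_device above keeps existing allocations with their current owner and sends new allocations to the island member reporting the lowest memory usage. A standalone sketch of just that selection rule; FakeSubDevice and pick_owner are illustrative stand-ins for the real SubDevice and member function.

#include <cstddef>
#include <cstdint>
#include <map>
#include <vector>

struct FakeSubDevice {
  size_t mem_used = 0;                   /* stands in for device->stats.mem_used */
  std::map<uint64_t, uint64_t> ptr_map;  /* multi-device key -> device-local pointer */
};

static FakeSubDevice *pick_owner(uint64_t key, const std::vector<FakeSubDevice *> &island)
{
  FakeSubDevice *owner = island.front();
  for (FakeSubDevice *sub : island) {
    /* Existing allocations stay with their owner; new ones (key == 0 here) go to the
     * device in the island that currently uses the least memory. */
    if (key ? (sub->ptr_map.count(key) != 0) : (sub->mem_used < owner->mem_used)) {
      owner = sub;
    }
  }
  return owner;
}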
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl b/intern/cycles/device/multi/device.h
index 192d01444ba..6e121014a1f 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl
+++ b/intern/cycles/device/multi/device.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2011-2017 Blender Foundation
+ * Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,11 +14,18 @@
* limitations under the License.
*/
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_indirect_background.h"
+#pragma once
-#define KERNEL_NAME indirect_background
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
+#include "util/util_string.h"
+#include "util/util_vector.h"
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class DeviceInfo;
+class Profiler;
+class Stats;
+
+Device *device_multi_create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/opencl/device_opencl.h b/intern/cycles/device/opencl/device_opencl.h
deleted file mode 100644
index a65e764b0d4..00000000000
--- a/intern/cycles/device/opencl/device_opencl.h
+++ /dev/null
@@ -1,658 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_OPENCL
-
-# include "device/device.h"
-# include "device/device_denoising.h"
-# include "device/device_split_kernel.h"
-
-# include "util/util_map.h"
-# include "util/util_param.h"
-# include "util/util_string.h"
-# include "util/util_task.h"
-
-# include "clew.h"
-
-# include "device/opencl/memory_manager.h"
-
-CCL_NAMESPACE_BEGIN
-
-/* Disable workarounds, seems to be working fine on latest drivers. */
-# define CYCLES_DISABLE_DRIVER_WORKAROUNDS
-
-/* Define CYCLES_DISABLE_DRIVER_WORKAROUNDS to disable workarounds for testing. */
-# ifndef CYCLES_DISABLE_DRIVER_WORKAROUNDS
-/* Work around AMD driver hangs by ensuring each command is finished before doing anything else. */
-# undef clEnqueueNDRangeKernel
-# define clEnqueueNDRangeKernel(a, b, c, d, e, f, g, h, i) \
- CLEW_GET_FUN(__clewEnqueueNDRangeKernel)(a, b, c, d, e, f, g, h, i); \
- clFinish(a);
-
-# undef clEnqueueWriteBuffer
-# define clEnqueueWriteBuffer(a, b, c, d, e, f, g, h, i) \
- CLEW_GET_FUN(__clewEnqueueWriteBuffer)(a, b, c, d, e, f, g, h, i); \
- clFinish(a);
-
-# undef clEnqueueReadBuffer
-# define clEnqueueReadBuffer(a, b, c, d, e, f, g, h, i) \
- CLEW_GET_FUN(__clewEnqueueReadBuffer)(a, b, c, d, e, f, g, h, i); \
- clFinish(a);
-# endif /* CYCLES_DISABLE_DRIVER_WORKAROUNDS */
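
When the workarounds are not disabled, the macros above rewrite every enqueue into the raw clew
entry point followed by an immediate clFinish on the same queue. A call such as

    clEnqueueWriteBuffer(queue, buffer, CL_TRUE, 0, size, host_ptr, 0, NULL, NULL);

expands to

    CLEW_GET_FUN(__clewEnqueueWriteBuffer)(queue, buffer, CL_TRUE, 0, size, host_ptr, 0, NULL, NULL);
    clFinish(queue);

which serializes the queue after each command, the AMD driver-hang workaround described above.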
-
-# define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p))
-
-struct OpenCLPlatformDevice {
- OpenCLPlatformDevice(cl_platform_id platform_id,
- const string &platform_name,
- cl_device_id device_id,
- cl_device_type device_type,
- const string &device_name,
- const string &hardware_id,
- const string &device_extensions)
- : platform_id(platform_id),
- platform_name(platform_name),
- device_id(device_id),
- device_type(device_type),
- device_name(device_name),
- hardware_id(hardware_id),
- device_extensions(device_extensions)
- {
- }
- cl_platform_id platform_id;
- string platform_name;
- cl_device_id device_id;
- cl_device_type device_type;
- string device_name;
- string hardware_id;
- string device_extensions;
-};
-
-/* Contains all static OpenCL helper functions. */
-class OpenCLInfo {
- public:
- static cl_device_type device_type();
- static bool use_debug();
- static bool device_supported(const string &platform_name, const cl_device_id device_id);
- static bool platform_version_check(cl_platform_id platform, string *error = NULL);
- static bool device_version_check(cl_device_id device, string *error = NULL);
- static bool get_device_version(cl_device_id device,
- int *r_major,
- int *r_minor,
- string *error = NULL);
- static string get_hardware_id(const string &platform_name, cl_device_id device_id);
- static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices);
-
- /* ** Some handy shortcuts to low level cl*GetInfo() functions. ** */
-
- /* Platform information. */
- static bool get_num_platforms(cl_uint *num_platforms, cl_int *error = NULL);
- static cl_uint get_num_platforms();
-
- static bool get_platforms(vector<cl_platform_id> *platform_ids, cl_int *error = NULL);
- static vector<cl_platform_id> get_platforms();
-
- static bool get_platform_name(cl_platform_id platform_id, string *platform_name);
- static string get_platform_name(cl_platform_id platform_id);
-
- static bool get_num_platform_devices(cl_platform_id platform_id,
- cl_device_type device_type,
- cl_uint *num_devices,
- cl_int *error = NULL);
- static cl_uint get_num_platform_devices(cl_platform_id platform_id, cl_device_type device_type);
-
- static bool get_platform_devices(cl_platform_id platform_id,
- cl_device_type device_type,
- vector<cl_device_id> *device_ids,
- cl_int *error = NULL);
- static vector<cl_device_id> get_platform_devices(cl_platform_id platform_id,
- cl_device_type device_type);
-
- /* Device information. */
- static bool get_device_name(cl_device_id device_id, string *device_name, cl_int *error = NULL);
-
- static string get_device_name(cl_device_id device_id);
-
- static bool get_device_extensions(cl_device_id device_id,
- string *device_extensions,
- cl_int *error = NULL);
-
- static string get_device_extensions(cl_device_id device_id);
-
- static bool get_device_type(cl_device_id device_id,
- cl_device_type *device_type,
- cl_int *error = NULL);
- static cl_device_type get_device_type(cl_device_id device_id);
-
- static bool get_driver_version(cl_device_id device_id,
- int *major,
- int *minor,
- cl_int *error = NULL);
-
- static int mem_sub_ptr_alignment(cl_device_id device_id);
-
- /* Get a somewhat more readable device name.
- * The main difference is AMD OpenCL, which only reports the chip code name
- * as the regular device name. This will produce a saner device
- * name using some extensions.
- */
- static string get_readable_device_name(cl_device_id device_id);
-};
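
The shortcuts declared above wrap the stock OpenCL query API. For orientation, the raw enumeration
they abstract looks roughly like the following (standard OpenCL calls; a generic sketch, not the
implementation behind OpenCLInfo):

    #include <CL/cl.h> /* Cycles uses clew.h, which exposes the same entry points. */
    #include <cstdio>
    #include <vector>

    static void list_opencl_devices()
    {
      cl_uint num_platforms = 0;
      clGetPlatformIDs(0, NULL, &num_platforms); /* First query the count... */
      if (num_platforms == 0) {
        return;
      }
      std::vector<cl_platform_id> platforms(num_platforms);
      clGetPlatformIDs(num_platforms, platforms.data(), NULL); /* ...then fetch the IDs. */

      for (cl_platform_id platform : platforms) {
        cl_uint num_devices = 0;
        clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
        if (num_devices == 0) {
          continue;
        }
        std::vector<cl_device_id> devices(num_devices);
        clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices.data(), NULL);

        for (cl_device_id device : devices) {
          char name[256] = "";
          clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(name), name, NULL);
          printf("%s\n", name);
        }
      }
    }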
-
-/* Thread safe cache for contexts and programs.
- */
-class OpenCLCache {
- struct Slot {
- struct ProgramEntry {
- ProgramEntry();
- ProgramEntry(const ProgramEntry &rhs);
- ~ProgramEntry();
- cl_program program;
- thread_mutex *mutex;
- };
-
- Slot();
- Slot(const Slot &rhs);
- ~Slot();
-
- thread_mutex *context_mutex;
- cl_context context;
- typedef map<ustring, ProgramEntry> EntryMap;
- EntryMap programs;
- };
-
- /* key is combination of platform ID and device ID */
- typedef pair<cl_platform_id, cl_device_id> PlatformDevicePair;
-
- /* map of Slot objects */
- typedef map<PlatformDevicePair, Slot> CacheMap;
- CacheMap cache;
-
- /* MD5 hash of the kernel source. */
- string kernel_md5;
-
- thread_mutex cache_lock;
- thread_mutex kernel_md5_lock;
-
- /* lazy instantiate */
- static OpenCLCache &global_instance();
-
- public:
- enum ProgramName {
- OCL_DEV_BASE_PROGRAM,
- OCL_DEV_MEGAKERNEL_PROGRAM,
- };
-
- /* Lookup context in the cache. If this returns NULL, slot_locker
- * will be holding a lock for the cache. slot_locker should refer to a
- * default constructed thread_scoped_lock. */
- static cl_context get_context(cl_platform_id platform,
- cl_device_id device,
- thread_scoped_lock &slot_locker);
- /* Same as above. */
- static cl_program get_program(cl_platform_id platform,
- cl_device_id device,
- ustring key,
- thread_scoped_lock &slot_locker);
-
- /* Store context in the cache. You MUST have tried to get the item before storing to it. */
- static void store_context(cl_platform_id platform,
- cl_device_id device,
- cl_context context,
- thread_scoped_lock &slot_locker);
- /* Same as above. */
- static void store_program(cl_platform_id platform,
- cl_device_id device,
- cl_program program,
- ustring key,
- thread_scoped_lock &slot_locker);
-
- static string get_kernel_md5();
-};
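
In use, the contract documented on get_context()/store_context() above reads roughly as follows
(sketch only; create_context() is a hypothetical stand-in for the clCreateContext call that the
OpenCLDevice constructor further down in this patch performs):

    /* A failed get_context() leaves slot_locker holding the cache lock,
     * so creating and then storing under that same lock is safe. */
    thread_scoped_lock cache_locker;
    cl_context context = OpenCLCache::get_context(platform, device, cache_locker);
    if (context == NULL) {
      context = create_context(platform, device); /* Hypothetical helper. */
      OpenCLCache::store_context(platform, device, context, cache_locker);
    }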
-
-# define opencl_device_assert(device, stmt) \
- { \
- cl_int err = stmt; \
-\
- if (err != CL_SUCCESS) { \
- string message = string_printf( \
- "OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \
- if ((device)->error_message() == "") { \
- (device)->set_error(message); \
- } \
- fprintf(stderr, "%s\n", message.c_str()); \
- } \
- } \
- (void)0
-
-# define opencl_assert(stmt) \
- { \
- cl_int err = stmt; \
-\
- if (err != CL_SUCCESS) { \
- string message = string_printf( \
- "OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \
- if (error_msg == "") { \
- error_msg = message; \
- } \
- fprintf(stderr, "%s\n", message.c_str()); \
- } \
- } \
- (void)0
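
Typical use of the two macros above, mirroring call sites later in this file (device and queue
stand for whatever handles are in scope): any cl_int-returning call is wrapped so the first
failure is recorded and printed to stderr without aborting the render.

    opencl_assert(clFlush(cqCommandQueue));        /* Inside OpenCLDevice methods: stores into error_msg. */
    opencl_device_assert(device, clFinish(queue)); /* From helper code: reports via device->set_error(). */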
-
-class OpenCLDevice : public Device {
- public:
- DedicatedTaskPool task_pool;
-
- /* Task pool for required kernels (base, AO kernels during foreground rendering) */
- TaskPool load_required_kernel_task_pool;
- /* Task pool for optional kernels (feature kernels during foreground rendering) */
- TaskPool load_kernel_task_pool;
- std::atomic<int> load_kernel_num_compiling;
-
- cl_context cxContext;
- cl_command_queue cqCommandQueue;
- cl_platform_id cpPlatform;
- cl_device_id cdDevice;
- cl_int ciErr;
- int device_num;
-
- class OpenCLProgram {
- public:
- OpenCLProgram() : loaded(false), needs_compiling(true), program(NULL), device(NULL)
- {
- }
- OpenCLProgram(OpenCLDevice *device,
- const string &program_name,
- const string &kernel_name,
- const string &kernel_build_options,
- bool use_stdout = true);
- ~OpenCLProgram();
-
- void add_kernel(ustring name);
-
- /* Try to load the program from device cache or disk */
- bool load();
- /* Compile the kernel (first in a separate process, falling back to a local build). */
- void compile();
- /* Create the OpenCL kernels after loading or compiling */
- void create_kernels();
-
- bool is_loaded() const
- {
- return loaded;
- }
- const string &get_log() const
- {
- return log;
- }
- void report_error();
-
- /* Wait until this kernel is available to be used
- * It will return true when the kernel is available.
- * It will return false when the kernel is not available
- * or could not be loaded. */
- bool wait_for_availability();
-
- cl_kernel operator()();
- cl_kernel operator()(ustring name);
-
- void release();
-
- private:
- bool build_kernel(const string *debug_src);
- /* Build the program by launching a separate process.
- * This is required for multithreaded OpenCL compilation, since most frameworks serialize
- * build calls internally if they come from the same process.
- * If that is not supported, this function just returns false.
- */
- bool compile_separate(const string &clbin);
- /* Build the program by calling OpenCL directly. */
- bool compile_kernel(const string *debug_src);
- /* Loading and saving the program from/to disk. */
- bool load_binary(const string &clbin, const string *debug_src = NULL);
- bool save_binary(const string &clbin);
-
- void add_log(const string &msg, bool is_debug);
- void add_error(const string &msg);
-
- bool loaded;
- bool needs_compiling;
-
- cl_program program;
- OpenCLDevice *device;
-
- /* Used for the OpenCLCache key. */
- string program_name;
-
- string kernel_file, kernel_build_options, device_md5;
-
- bool use_stdout;
- string log, error_msg;
- string compile_output;
-
- map<ustring, cl_kernel> kernels;
- };
-
- /* Container for all types of split programs. */
- class OpenCLSplitPrograms {
- public:
- OpenCLDevice *device;
- OpenCLProgram program_split;
- OpenCLProgram program_lamp_emission;
- OpenCLProgram program_do_volume;
- OpenCLProgram program_indirect_background;
- OpenCLProgram program_shader_eval;
- OpenCLProgram program_holdout_emission_blurring_pathtermination_ao;
- OpenCLProgram program_subsurface_scatter;
- OpenCLProgram program_direct_lighting;
- OpenCLProgram program_shadow_blocked_ao;
- OpenCLProgram program_shadow_blocked_dl;
-
- OpenCLSplitPrograms(OpenCLDevice *device);
- ~OpenCLSplitPrograms();
-
- /* Load the kernels and put the created kernels in the given
- * `programs` parameter. */
- void load_kernels(vector<OpenCLProgram *> &programs,
- const DeviceRequestedFeatures &requested_features);
- };
-
- DeviceSplitKernel *split_kernel;
-
- OpenCLProgram base_program;
- OpenCLProgram bake_program;
- OpenCLProgram displace_program;
- OpenCLProgram background_program;
- OpenCLProgram denoising_program;
-
- OpenCLSplitPrograms kernel_programs;
-
- typedef map<string, device_vector<uchar> *> ConstMemMap;
- typedef map<string, device_ptr> MemMap;
-
- ConstMemMap const_mem_map;
- MemMap mem_map;
-
- bool device_initialized;
- string platform_name;
- string device_name;
-
- bool opencl_error(cl_int err);
- void opencl_error(const string &message);
- void opencl_assert_err(cl_int err, const char *where);
-
- OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
- ~OpenCLDevice();
-
- static void CL_CALLBACK context_notify_callback(const char *err_info,
- const void * /*private_info*/,
- size_t /*cb*/,
- void *user_data);
-
- bool opencl_version_check();
- OpenCLSplitPrograms *get_split_programs();
-
- string device_md5_hash(string kernel_custom_build_options = "");
- bool load_kernels(const DeviceRequestedFeatures &requested_features);
- void load_required_kernels(const DeviceRequestedFeatures &requested_features);
-
- bool wait_for_availability(const DeviceRequestedFeatures &requested_features);
- DeviceKernelStatus get_active_kernel_switch_state();
-
- /* Get the name of the opencl program for the given kernel */
- const string get_opencl_program_name(const string &kernel_name);
- /* Get the program file name to compile (*.cl) for the given kernel */
- const string get_opencl_program_filename(const string &kernel_name);
- string get_build_options(const DeviceRequestedFeatures &requested_features,
- const string &opencl_program_name);
- /* Enable the default features to reduce recompilation events */
- void enable_default_features(DeviceRequestedFeatures &features);
-
- void mem_alloc(device_memory &mem);
- void mem_copy_to(device_memory &mem);
- void mem_copy_from(device_memory &mem, int y, int w, int h, int elem);
- void mem_zero(device_memory &mem);
- void mem_free(device_memory &mem);
-
- int mem_sub_ptr_alignment();
-
- void const_copy_to(const char *name, void *host, size_t size);
- void global_alloc(device_memory &mem);
- void global_free(device_memory &mem);
- void tex_alloc(device_texture &mem);
- void tex_free(device_texture &mem);
-
- size_t global_size_round_up(int group_size, int global_size);
- void enqueue_kernel(cl_kernel kernel,
- size_t w,
- size_t h,
- bool x_workgroups = false,
- size_t max_workgroup_size = -1);
- void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name);
- void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg);
-
- void film_convert(DeviceTask &task,
- device_ptr buffer,
- device_ptr rgba_byte,
- device_ptr rgba_half);
- void shader(DeviceTask &task);
- void update_adaptive(DeviceTask &task, RenderTile &tile, int sample);
- void bake(DeviceTask &task, RenderTile &tile);
-
- void denoise(RenderTile &tile, DenoisingTask &denoising);
-
- int get_split_task_count(DeviceTask & /*task*/)
- {
- return 1;
- }
-
- void task_add(DeviceTask &task)
- {
- task_pool.push([=] {
- DeviceTask task_copy = task;
- thread_run(task_copy);
- });
- }
-
- void task_wait()
- {
- task_pool.wait();
- }
-
- void task_cancel()
- {
- task_pool.cancel();
- }
-
- void thread_run(DeviceTask &task);
-
- virtual BVHLayoutMask get_bvh_layout_mask() const
- {
- return BVH_LAYOUT_BVH2;
- }
-
- virtual bool show_samples() const
- {
- return true;
- }
-
- protected:
- string kernel_build_options(const string *debug_src = NULL);
-
- void mem_zero_kernel(device_ptr ptr, size_t size);
-
- bool denoising_non_local_means(device_ptr image_ptr,
- device_ptr guide_ptr,
- device_ptr variance_ptr,
- device_ptr out_ptr,
- DenoisingTask *task);
- bool denoising_construct_transform(DenoisingTask *task);
- bool denoising_accumulate(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr scale_ptr,
- int frame,
- DenoisingTask *task);
- bool denoising_solve(device_ptr output_ptr, DenoisingTask *task);
- bool denoising_combine_halves(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- int r,
- int4 rect,
- DenoisingTask *task);
- bool denoising_divide_shadow(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr sample_variance_ptr,
- device_ptr sv_variance_ptr,
- device_ptr buffer_variance_ptr,
- DenoisingTask *task);
- bool denoising_get_feature(int mean_offset,
- int variance_offset,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- float scale,
- DenoisingTask *task);
- bool denoising_write_feature(int to_offset,
- device_ptr from_ptr,
- device_ptr buffer_ptr,
- DenoisingTask *task);
- bool denoising_detect_outliers(device_ptr image_ptr,
- device_ptr variance_ptr,
- device_ptr depth_ptr,
- device_ptr output_ptr,
- DenoisingTask *task);
-
- device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int size);
- void mem_free_sub_ptr(device_ptr ptr);
-
- class ArgumentWrapper {
- public:
- ArgumentWrapper() : size(0), pointer(NULL)
- {
- }
-
- ArgumentWrapper(device_memory &argument)
- : size(sizeof(void *)), pointer((void *)(&argument.device_pointer))
- {
- }
-
- template<typename T>
- ArgumentWrapper(device_vector<T> &argument)
- : size(sizeof(void *)), pointer((void *)(&argument.device_pointer))
- {
- }
-
- template<typename T>
- ArgumentWrapper(device_only_memory<T> &argument)
- : size(sizeof(void *)), pointer((void *)(&argument.device_pointer))
- {
- }
- template<typename T> ArgumentWrapper(T &argument) : size(sizeof(argument)), pointer(&argument)
- {
- }
-
- ArgumentWrapper(int argument) : size(sizeof(int)), int_value(argument), pointer(&int_value)
- {
- }
-
- ArgumentWrapper(float argument)
- : size(sizeof(float)), float_value(argument), pointer(&float_value)
- {
- }
-
- size_t size;
- int int_value;
- float float_value;
- void *pointer;
- };
-
- /* TODO(sergey): In the future we can use variadic templates, once
- * C++0x is allowed. That should allow cleaning this up a bit.
- */
- int kernel_set_args(cl_kernel kernel,
- int start_argument_index,
- const ArgumentWrapper &arg1 = ArgumentWrapper(),
- const ArgumentWrapper &arg2 = ArgumentWrapper(),
- const ArgumentWrapper &arg3 = ArgumentWrapper(),
- const ArgumentWrapper &arg4 = ArgumentWrapper(),
- const ArgumentWrapper &arg5 = ArgumentWrapper(),
- const ArgumentWrapper &arg6 = ArgumentWrapper(),
- const ArgumentWrapper &arg7 = ArgumentWrapper(),
- const ArgumentWrapper &arg8 = ArgumentWrapper(),
- const ArgumentWrapper &arg9 = ArgumentWrapper(),
- const ArgumentWrapper &arg10 = ArgumentWrapper(),
- const ArgumentWrapper &arg11 = ArgumentWrapper(),
- const ArgumentWrapper &arg12 = ArgumentWrapper(),
- const ArgumentWrapper &arg13 = ArgumentWrapper(),
- const ArgumentWrapper &arg14 = ArgumentWrapper(),
- const ArgumentWrapper &arg15 = ArgumentWrapper(),
- const ArgumentWrapper &arg16 = ArgumentWrapper(),
- const ArgumentWrapper &arg17 = ArgumentWrapper(),
- const ArgumentWrapper &arg18 = ArgumentWrapper(),
- const ArgumentWrapper &arg19 = ArgumentWrapper(),
- const ArgumentWrapper &arg20 = ArgumentWrapper(),
- const ArgumentWrapper &arg21 = ArgumentWrapper(),
- const ArgumentWrapper &arg22 = ArgumentWrapper(),
- const ArgumentWrapper &arg23 = ArgumentWrapper(),
- const ArgumentWrapper &arg24 = ArgumentWrapper(),
- const ArgumentWrapper &arg25 = ArgumentWrapper(),
- const ArgumentWrapper &arg26 = ArgumentWrapper(),
- const ArgumentWrapper &arg27 = ArgumentWrapper(),
- const ArgumentWrapper &arg28 = ArgumentWrapper(),
- const ArgumentWrapper &arg29 = ArgumentWrapper(),
- const ArgumentWrapper &arg30 = ArgumentWrapper(),
- const ArgumentWrapper &arg31 = ArgumentWrapper(),
- const ArgumentWrapper &arg32 = ArgumentWrapper(),
- const ArgumentWrapper &arg33 = ArgumentWrapper());
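
The TODO above points at variadic templates. As an illustration only (hypothetical names, C++17
fold expression, plain-value arguments; device_memory would still need the pointer-to-
device_pointer treatment that ArgumentWrapper performs), such a replacement could look like:

    #include <CL/cl.h> /* or clew.h, as used above */

    template<typename T> static void set_one_arg(cl_kernel kernel, cl_uint index, const T &value)
    {
      clSetKernelArg(kernel, index, sizeof(T), &value);
    }

    template<typename... Args>
    static int kernel_set_args_sketch(cl_kernel kernel, cl_uint start_index, const Args &...args)
    {
      cl_uint index = start_index;
      (set_one_arg(kernel, index++, args), ...); /* C++17 fold: set each argument in order. */
      return (int)(index - start_index);         /* Number of arguments consumed. */
    }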
-
- void release_kernel_safe(cl_kernel kernel);
- void release_mem_object_safe(cl_mem mem);
- void release_program_safe(cl_program program);
-
- /* ** Those guys are for working around some compiler-specific bugs ** */
-
- cl_program load_cached_kernel(ustring key, thread_scoped_lock &cache_locker);
-
- void store_cached_kernel(cl_program program, ustring key, thread_scoped_lock &cache_locker);
-
- private:
- MemoryManager memory_manager;
- friend class MemoryManager;
-
- static_assert_align(TextureInfo, 16);
- device_vector<TextureInfo> texture_info;
-
- typedef map<string, device_memory *> TexturesMap;
- TexturesMap textures;
-
- bool textures_need_update;
-
- protected:
- void flush_texture_buffers();
-
- friend class OpenCLSplitKernel;
- friend class OpenCLSplitKernelFunction;
-};
-
-Device *opencl_create_split_device(DeviceInfo &info,
- Stats &stats,
- Profiler &profiler,
- bool background);
-
-CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/device/opencl/device_opencl_impl.cpp b/intern/cycles/device/opencl/device_opencl_impl.cpp
deleted file mode 100644
index 31a2265700c..00000000000
--- a/intern/cycles/device/opencl/device_opencl_impl.cpp
+++ /dev/null
@@ -1,2113 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_OPENCL
-
-# include "device/opencl/device_opencl.h"
-
-# include "kernel/kernel_types.h"
-# include "kernel/split/kernel_split_data_types.h"
-
-# include "util/util_algorithm.h"
-# include "util/util_debug.h"
-# include "util/util_foreach.h"
-# include "util/util_logging.h"
-# include "util/util_md5.h"
-# include "util/util_path.h"
-# include "util/util_time.h"
-
-CCL_NAMESPACE_BEGIN
-
-struct texture_slot_t {
- texture_slot_t(const string &name, int slot) : name(name), slot(slot)
- {
- }
- string name;
- int slot;
-};
-
-static const string NON_SPLIT_KERNELS =
- "denoising "
- "base "
- "background "
- "displace ";
-
-static const string SPLIT_BUNDLE_KERNELS =
- "data_init "
- "path_init "
- "state_buffer_size "
- "scene_intersect "
- "queue_enqueue "
- "shader_setup "
- "shader_sort "
- "enqueue_inactive "
- "next_iteration_setup "
- "indirect_subsurface "
- "buffer_update "
- "adaptive_stopping "
- "adaptive_filter_x "
- "adaptive_filter_y "
- "adaptive_adjust_samples";
-
-const string OpenCLDevice::get_opencl_program_name(const string &kernel_name)
-{
- if (NON_SPLIT_KERNELS.find(kernel_name) != std::string::npos) {
- return kernel_name;
- }
- else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) {
- return "split_bundle";
- }
- else {
- return "split_" + kernel_name;
- }
-}
-
-const string OpenCLDevice::get_opencl_program_filename(const string &kernel_name)
-{
- if (kernel_name == "denoising") {
- return "filter.cl";
- }
- else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) {
- return "kernel_split_bundle.cl";
- }
- else {
- return "kernel_" + kernel_name + ".cl";
- }
-}
-
-/* Enable features that we always want to compile to reduce recompilation events */
-void OpenCLDevice::enable_default_features(DeviceRequestedFeatures &features)
-{
- features.use_transparent = true;
- features.use_shadow_tricks = true;
- features.use_principled = true;
- features.use_denoising = true;
-
- if (!background) {
- features.max_nodes_group = NODE_GROUP_LEVEL_MAX;
- features.nodes_features = NODE_FEATURE_ALL;
- features.use_hair = true;
- features.use_subsurface = true;
- features.use_camera_motion = false;
- features.use_object_motion = false;
- }
-}
-
-string OpenCLDevice::get_build_options(const DeviceRequestedFeatures &requested_features,
- const string &opencl_program_name)
-{
- /* first check for non-split kernel programs */
- if (opencl_program_name == "base" || opencl_program_name == "denoising") {
- return "";
- }
- else if (opencl_program_name == "bake") {
- /* Note: get_build_options for bake is only requested when baking is enabled.
- * displace and background are always requested.
- * `__SPLIT_KERNEL__` must not be present in the compile directives for bake */
- DeviceRequestedFeatures features(requested_features);
- enable_default_features(features);
- features.use_denoising = false;
- features.use_object_motion = false;
- features.use_camera_motion = false;
- features.use_hair = true;
- features.use_subsurface = true;
- features.max_nodes_group = NODE_GROUP_LEVEL_MAX;
- features.nodes_features = NODE_FEATURE_ALL;
- features.use_integrator_branched = false;
- return features.get_build_options();
- }
- else if (opencl_program_name == "displace") {
- /* Since displacement does not use any nodes from the Shading group (e.g. BSDF),
- * we disable all features that are related to shading. */
- DeviceRequestedFeatures features(requested_features);
- enable_default_features(features);
- features.use_denoising = false;
- features.use_object_motion = false;
- features.use_camera_motion = false;
- features.use_baking = false;
- features.use_transparent = false;
- features.use_shadow_tricks = false;
- features.use_subsurface = false;
- features.use_volume = false;
- features.nodes_features &= ~NODE_FEATURE_VOLUME;
- features.use_denoising = false;
- features.use_principled = false;
- features.use_integrator_branched = false;
- return features.get_build_options();
- }
- else if (opencl_program_name == "background") {
- /* Background uses Background shading.
- * It is safe to disable shadow features, subsurface and volumetric features. */
- DeviceRequestedFeatures features(requested_features);
- enable_default_features(features);
- features.use_baking = false;
- features.use_object_motion = false;
- features.use_camera_motion = false;
- features.use_transparent = false;
- features.use_shadow_tricks = false;
- features.use_denoising = false;
- /* NOTE: currently possible to use surface nodes like `Hair Info`, `Bump` node.
- * Perhaps we should remove them in UI as it does not make any sense when
- * rendering background. */
- features.nodes_features &= ~NODE_FEATURE_VOLUME;
- features.use_subsurface = false;
- features.use_volume = false;
- features.use_shader_raytrace = false;
- features.use_patch_evaluation = false;
- features.use_integrator_branched = false;
- return features.get_build_options();
- }
-
- string build_options = "-D__SPLIT_KERNEL__ ";
- /* Set compute device build option. */
- cl_device_type device_type;
- OpenCLInfo::get_device_type(this->cdDevice, &device_type, &this->ciErr);
- assert(this->ciErr == CL_SUCCESS);
- if (device_type == CL_DEVICE_TYPE_GPU) {
- build_options += "-D__COMPUTE_DEVICE_GPU__ ";
- }
-
- DeviceRequestedFeatures nofeatures;
- enable_default_features(nofeatures);
-
- /* Add program specific optimized compile directives */
- if (opencl_program_name == "split_do_volume" && !requested_features.use_volume) {
- build_options += nofeatures.get_build_options();
- }
- else {
- DeviceRequestedFeatures features(requested_features);
- enable_default_features(features);
-
- /* Always turn off baking at this point. Baking is only useful when building the bake kernel.
- * This also makes sure that the kernels that are built during baking can be reused
- * when not doing any baking. */
- features.use_baking = false;
-
- /* Do not vary on shaders when program doesn't do any shading.
- * We have bundled them in a single program. */
- if (opencl_program_name == "split_bundle") {
- features.max_nodes_group = 0;
- features.nodes_features = 0;
- features.use_shader_raytrace = false;
- }
-
- /* No specific settings, just add the regular ones */
- build_options += features.get_build_options();
- }
-
- return build_options;
-}
-
-OpenCLDevice::OpenCLSplitPrograms::OpenCLSplitPrograms(OpenCLDevice *device_)
-{
- device = device_;
-}
-
-OpenCLDevice::OpenCLSplitPrograms::~OpenCLSplitPrograms()
-{
- program_split.release();
- program_lamp_emission.release();
- program_do_volume.release();
- program_indirect_background.release();
- program_shader_eval.release();
- program_holdout_emission_blurring_pathtermination_ao.release();
- program_subsurface_scatter.release();
- program_direct_lighting.release();
- program_shadow_blocked_ao.release();
- program_shadow_blocked_dl.release();
-}
-
-void OpenCLDevice::OpenCLSplitPrograms::load_kernels(
- vector<OpenCLProgram *> &programs, const DeviceRequestedFeatures &requested_features)
-{
- if (!requested_features.use_baking) {
-# define ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(kernel_name) \
- program_split.add_kernel(ustring("path_trace_" #kernel_name));
-# define ADD_SPLIT_KERNEL_PROGRAM(kernel_name) \
- const string program_name_##kernel_name = "split_" #kernel_name; \
- program_##kernel_name = OpenCLDevice::OpenCLProgram( \
- device, \
- program_name_##kernel_name, \
- "kernel_" #kernel_name ".cl", \
- device->get_build_options(requested_features, program_name_##kernel_name)); \
- program_##kernel_name.add_kernel(ustring("path_trace_" #kernel_name)); \
- programs.push_back(&program_##kernel_name);
-
- /* Ordered with most complex kernels first, to reduce overall compile time. */
- ADD_SPLIT_KERNEL_PROGRAM(subsurface_scatter);
- ADD_SPLIT_KERNEL_PROGRAM(direct_lighting);
- ADD_SPLIT_KERNEL_PROGRAM(indirect_background);
- if (requested_features.use_volume) {
- ADD_SPLIT_KERNEL_PROGRAM(do_volume);
- }
- ADD_SPLIT_KERNEL_PROGRAM(shader_eval);
- ADD_SPLIT_KERNEL_PROGRAM(lamp_emission);
- ADD_SPLIT_KERNEL_PROGRAM(holdout_emission_blurring_pathtermination_ao);
- ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_dl);
- ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_ao);
-
- /* Quick kernels bundled in a single program to reduce overhead of starting
- * Blender processes. */
- program_split = OpenCLDevice::OpenCLProgram(
- device,
- "split_bundle",
- "kernel_split_bundle.cl",
- device->get_build_options(requested_features, "split_bundle"));
-
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(data_init);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(state_buffer_size);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(path_init);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(scene_intersect);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(queue_enqueue);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_setup);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_sort);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(enqueue_inactive);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(next_iteration_setup);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(indirect_subsurface);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(buffer_update);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_stopping);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_filter_x);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_filter_y);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_adjust_samples);
- programs.push_back(&program_split);
-
-# undef ADD_SPLIT_KERNEL_PROGRAM
-# undef ADD_SPLIT_KERNEL_BUNDLE_PROGRAM
- }
-}
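
For reference, one expansion of the ADD_SPLIT_KERNEL_PROGRAM macro above, taking
subsurface_scatter as the example, produces:

    const string program_name_subsurface_scatter = "split_subsurface_scatter";
    program_subsurface_scatter = OpenCLDevice::OpenCLProgram(
        device,
        program_name_subsurface_scatter,
        "kernel_subsurface_scatter.cl",
        device->get_build_options(requested_features, program_name_subsurface_scatter));
    program_subsurface_scatter.add_kernel(ustring("path_trace_subsurface_scatter"));
    programs.push_back(&program_subsurface_scatter);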
-
-namespace {
-
-/* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to
- * fetch its size.
- */
-typedef struct KernelGlobalsDummy {
- ccl_constant KernelData *data;
- ccl_global char *buffers[8];
-
-# define KERNEL_TEX(type, name) TextureInfo name;
-# include "kernel/kernel_textures.h"
-# undef KERNEL_TEX
- SplitData split_data;
- SplitParams split_param_data;
-} KernelGlobalsDummy;
-
-} // namespace
-
-struct CachedSplitMemory {
- int id;
- device_memory *split_data;
- device_memory *ray_state;
- device_memory *queue_index;
- device_memory *use_queues_flag;
- device_memory *work_pools;
- device_ptr *buffer;
-};
-
-class OpenCLSplitKernelFunction : public SplitKernelFunction {
- public:
- OpenCLDevice *device;
- OpenCLDevice::OpenCLProgram program;
- CachedSplitMemory &cached_memory;
- int cached_id;
-
- OpenCLSplitKernelFunction(OpenCLDevice *device, CachedSplitMemory &cached_memory)
- : device(device), cached_memory(cached_memory), cached_id(cached_memory.id - 1)
- {
- }
-
- ~OpenCLSplitKernelFunction()
- {
- program.release();
- }
-
- virtual bool enqueue(const KernelDimensions &dim, device_memory &kg, device_memory &data)
- {
- if (cached_id != cached_memory.id) {
- cl_uint start_arg_index = device->kernel_set_args(
- program(), 0, kg, data, *cached_memory.split_data, *cached_memory.ray_state);
-
- device->set_kernel_arg_buffers(program(), &start_arg_index);
-
- start_arg_index += device->kernel_set_args(program(),
- start_arg_index,
- *cached_memory.queue_index,
- *cached_memory.use_queues_flag,
- *cached_memory.work_pools,
- *cached_memory.buffer);
-
- cached_id = cached_memory.id;
- }
-
- device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
- program(),
- 2,
- NULL,
- dim.global_size,
- dim.local_size,
- 0,
- NULL,
- NULL);
-
- device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
-
- if (device->ciErr != CL_SUCCESS) {
- string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
- clewErrorString(device->ciErr));
- device->opencl_error(message);
- return false;
- }
-
- return true;
- }
-};
-
-class OpenCLSplitKernel : public DeviceSplitKernel {
- OpenCLDevice *device;
- CachedSplitMemory cached_memory;
-
- public:
- explicit OpenCLSplitKernel(OpenCLDevice *device) : DeviceSplitKernel(device), device(device)
- {
- }
-
- virtual SplitKernelFunction *get_split_kernel_function(
- const string &kernel_name, const DeviceRequestedFeatures &requested_features)
- {
- OpenCLSplitKernelFunction *kernel = new OpenCLSplitKernelFunction(device, cached_memory);
-
- const string program_name = device->get_opencl_program_name(kernel_name);
- kernel->program = OpenCLDevice::OpenCLProgram(
- device,
- program_name,
- device->get_opencl_program_filename(kernel_name),
- device->get_build_options(requested_features, program_name));
-
- kernel->program.add_kernel(ustring("path_trace_" + kernel_name));
- kernel->program.load();
-
- if (!kernel->program.is_loaded()) {
- delete kernel;
- return NULL;
- }
-
- return kernel;
- }
-
- virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads)
- {
- device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE);
- size_buffer.alloc(1);
- size_buffer.zero_to_device();
-
- uint threads = num_threads;
- OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs();
- cl_kernel kernel_state_buffer_size = programs->program_split(
- ustring("path_trace_state_buffer_size"));
- device->kernel_set_args(kernel_state_buffer_size, 0, kg, data, threads, size_buffer);
-
- size_t global_size = 64;
- device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
- kernel_state_buffer_size,
- 1,
- NULL,
- &global_size,
- NULL,
- 0,
- NULL,
- NULL);
-
- device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
-
- size_buffer.copy_from_device(0, 1, 1);
- size_t size = size_buffer[0];
- size_buffer.free();
-
- if (device->ciErr != CL_SUCCESS) {
- string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
- clewErrorString(device->ciErr));
- device->opencl_error(message);
- return 0;
- }
-
- return size;
- }
-
- virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim,
- RenderTile &rtile,
- int num_global_elements,
- device_memory &kernel_globals,
- device_memory &kernel_data,
- device_memory &split_data,
- device_memory &ray_state,
- device_memory &queue_index,
- device_memory &use_queues_flag,
- device_memory &work_pool_wgs)
- {
- cl_int dQueue_size = dim.global_size[0] * dim.global_size[1];
-
- /* Set the range of samples to be processed for every ray in
- * path-regeneration logic.
- */
- cl_int start_sample = rtile.start_sample;
- cl_int end_sample = rtile.start_sample + rtile.num_samples;
-
- OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs();
- cl_kernel kernel_data_init = programs->program_split(ustring("path_trace_data_init"));
-
- cl_uint start_arg_index = device->kernel_set_args(kernel_data_init,
- 0,
- kernel_globals,
- kernel_data,
- split_data,
- num_global_elements,
- ray_state);
-
- device->set_kernel_arg_buffers(kernel_data_init, &start_arg_index);
-
- start_arg_index += device->kernel_set_args(kernel_data_init,
- start_arg_index,
- start_sample,
- end_sample,
- rtile.x,
- rtile.y,
- rtile.w,
- rtile.h,
- rtile.offset,
- rtile.stride,
- queue_index,
- dQueue_size,
- use_queues_flag,
- work_pool_wgs,
- rtile.num_samples,
- rtile.buffer);
-
- /* Enqueue ckPathTraceKernel_data_init kernel. */
- device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
- kernel_data_init,
- 2,
- NULL,
- dim.global_size,
- dim.local_size,
- 0,
- NULL,
- NULL);
-
- device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
-
- if (device->ciErr != CL_SUCCESS) {
- string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
- clewErrorString(device->ciErr));
- device->opencl_error(message);
- return false;
- }
-
- cached_memory.split_data = &split_data;
- cached_memory.ray_state = &ray_state;
- cached_memory.queue_index = &queue_index;
- cached_memory.use_queues_flag = &use_queues_flag;
- cached_memory.work_pools = &work_pool_wgs;
- cached_memory.buffer = &rtile.buffer;
- cached_memory.id++;
-
- return true;
- }
-
- virtual int2 split_kernel_local_size()
- {
- return make_int2(64, 1);
- }
-
- virtual int2 split_kernel_global_size(device_memory &kg,
- device_memory &data,
- DeviceTask & /*task*/)
- {
- cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice);
- /* Use small global size on CPU devices as it seems to be much faster. */
- if (type == CL_DEVICE_TYPE_CPU) {
- VLOG(1) << "Global size: (64, 64).";
- return make_int2(64, 64);
- }
-
- cl_ulong max_buffer_size;
- clGetDeviceInfo(
- device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL);
-
- if (DebugFlags().opencl.mem_limit) {
- max_buffer_size = min(max_buffer_size,
- cl_ulong(DebugFlags().opencl.mem_limit - device->stats.mem_used));
- }
-
- VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(max_buffer_size)
- << " bytes. (" << string_human_readable_size(max_buffer_size) << ").";
-
- /* Limit to 2gb, as we shouldn't need more than that and some devices may support much more. */
- max_buffer_size = min(max_buffer_size / 2, (cl_ulong)2l * 1024 * 1024 * 1024);
-
- size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size);
- int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64),
- (int)sqrt(num_elements));
-
- if (device->info.description.find("Intel") != string::npos) {
- global_size = make_int2(min(512, global_size.x), min(512, global_size.y));
- }
-
- VLOG(1) << "Global size: " << global_size << ".";
- return global_size;
- }
-};
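
To make the sizing in split_kernel_global_size() concrete, a worked example with an assumed
per-path state footprint (the real figure comes from state_buffer_size() and depends on the
enabled features): suppose the 2 GB cap applies and each path needs about 1 KiB of split state.

    num_elements         = 2 GiB / 1 KiB = 2,097,152 paths
    sqrt(num_elements)   ~ 1448
    round_down(1448, 64) = 1408
    global size          = (1408, 1448), i.e. roughly 2 million paths in flight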
-
-bool OpenCLDevice::opencl_error(cl_int err)
-{
- if (err != CL_SUCCESS) {
- string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err));
- if (error_msg == "")
- error_msg = message;
- fprintf(stderr, "%s\n", message.c_str());
- return true;
- }
-
- return false;
-}
-
-void OpenCLDevice::opencl_error(const string &message)
-{
- if (error_msg == "")
- error_msg = message;
- fprintf(stderr, "%s\n", message.c_str());
-}
-
-void OpenCLDevice::opencl_assert_err(cl_int err, const char *where)
-{
- if (err != CL_SUCCESS) {
- string message = string_printf(
- "OpenCL error (%d): %s in %s", err, clewErrorString(err), where);
- if (error_msg == "")
- error_msg = message;
- fprintf(stderr, "%s\n", message.c_str());
-# ifndef NDEBUG
- abort();
-# endif
- }
-}
-
-OpenCLDevice::OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
- : Device(info, stats, profiler, background),
- load_kernel_num_compiling(0),
- kernel_programs(this),
- memory_manager(this),
- texture_info(this, "__texture_info", MEM_GLOBAL)
-{
- cpPlatform = NULL;
- cdDevice = NULL;
- cxContext = NULL;
- cqCommandQueue = NULL;
- device_initialized = false;
- textures_need_update = true;
-
- vector<OpenCLPlatformDevice> usable_devices;
- OpenCLInfo::get_usable_devices(&usable_devices);
- if (usable_devices.size() == 0) {
- opencl_error("OpenCL: no devices found.");
- return;
- }
- assert(info.num < usable_devices.size());
- OpenCLPlatformDevice &platform_device = usable_devices[info.num];
- device_num = info.num;
- cpPlatform = platform_device.platform_id;
- cdDevice = platform_device.device_id;
- platform_name = platform_device.platform_name;
- device_name = platform_device.device_name;
- VLOG(2) << "Creating new Cycles device for OpenCL platform " << platform_name << ", device "
- << device_name << ".";
-
- {
- /* try to use cached context */
- thread_scoped_lock cache_locker;
- cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker);
-
- if (cxContext == NULL) {
- /* create context properties array to specify platform */
- const cl_context_properties context_props[] = {
- CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform, 0, 0};
-
- /* create context */
- cxContext = clCreateContext(
- context_props, 1, &cdDevice, context_notify_callback, cdDevice, &ciErr);
-
- if (opencl_error(ciErr)) {
- opencl_error("OpenCL: clCreateContext failed");
- return;
- }
-
- /* cache it */
- OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker);
- }
- }
-
- cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
- if (opencl_error(ciErr)) {
- opencl_error("OpenCL: Error creating command queue");
- return;
- }
-
- /* Allocate this right away so that texture_info
- * is placed at offset 0 in the device memory buffers. */
- texture_info.resize(1);
- memory_manager.alloc("texture_info", texture_info);
-
- device_initialized = true;
-
- split_kernel = new OpenCLSplitKernel(this);
-}
-
-OpenCLDevice::~OpenCLDevice()
-{
- task_pool.cancel();
- load_required_kernel_task_pool.cancel();
- load_kernel_task_pool.cancel();
-
- memory_manager.free();
-
- ConstMemMap::iterator mt;
- for (mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) {
- delete mt->second;
- }
-
- base_program.release();
- bake_program.release();
- displace_program.release();
- background_program.release();
- denoising_program.release();
-
- if (cqCommandQueue)
- clReleaseCommandQueue(cqCommandQueue);
- if (cxContext)
- clReleaseContext(cxContext);
-
- delete split_kernel;
-}
-
-void CL_CALLBACK OpenCLDevice::context_notify_callback(const char *err_info,
- const void * /*private_info*/,
- size_t /*cb*/,
- void *user_data)
-{
- string device_name = OpenCLInfo::get_device_name((cl_device_id)user_data);
- fprintf(stderr, "OpenCL error (%s): %s\n", device_name.c_str(), err_info);
-}
-
-bool OpenCLDevice::opencl_version_check()
-{
- string error;
- if (!OpenCLInfo::platform_version_check(cpPlatform, &error)) {
- opencl_error(error);
- return false;
- }
- if (!OpenCLInfo::device_version_check(cdDevice, &error)) {
- opencl_error(error);
- return false;
- }
- return true;
-}
-
-string OpenCLDevice::device_md5_hash(string kernel_custom_build_options)
-{
- MD5Hash md5;
- char version[256], driver[256], name[256], vendor[256];
-
- clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL);
- clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
- clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL);
- clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL);
-
- md5.append((uint8_t *)vendor, strlen(vendor));
- md5.append((uint8_t *)version, strlen(version));
- md5.append((uint8_t *)name, strlen(name));
- md5.append((uint8_t *)driver, strlen(driver));
-
- string options = kernel_build_options();
- options += kernel_custom_build_options;
- md5.append((uint8_t *)options.c_str(), options.size());
-
- return md5.get_hex();
-}
-
-bool OpenCLDevice::load_kernels(const DeviceRequestedFeatures &requested_features)
-{
- VLOG(2) << "Loading kernels for platform " << platform_name << ", device " << device_name << ".";
- /* Verify if device was initialized. */
- if (!device_initialized) {
- fprintf(stderr, "OpenCL: failed to initialize device.\n");
- return false;
- }
-
- /* Verify we have right opencl version. */
- if (!opencl_version_check())
- return false;
-
- load_required_kernels(requested_features);
-
- vector<OpenCLProgram *> programs;
- kernel_programs.load_kernels(programs, requested_features);
-
- if (!requested_features.use_baking && requested_features.use_denoising) {
- denoising_program = OpenCLProgram(
- this, "denoising", "filter.cl", get_build_options(requested_features, "denoising"));
- denoising_program.add_kernel(ustring("filter_divide_shadow"));
- denoising_program.add_kernel(ustring("filter_get_feature"));
- denoising_program.add_kernel(ustring("filter_write_feature"));
- denoising_program.add_kernel(ustring("filter_detect_outliers"));
- denoising_program.add_kernel(ustring("filter_combine_halves"));
- denoising_program.add_kernel(ustring("filter_construct_transform"));
- denoising_program.add_kernel(ustring("filter_nlm_calc_difference"));
- denoising_program.add_kernel(ustring("filter_nlm_blur"));
- denoising_program.add_kernel(ustring("filter_nlm_calc_weight"));
- denoising_program.add_kernel(ustring("filter_nlm_update_output"));
- denoising_program.add_kernel(ustring("filter_nlm_normalize"));
- denoising_program.add_kernel(ustring("filter_nlm_construct_gramian"));
- denoising_program.add_kernel(ustring("filter_finalize"));
- programs.push_back(&denoising_program);
- }
-
- load_required_kernel_task_pool.wait_work();
-
- /* Parallel compilation of Cycles kernels; this launches multiple
- * processes to work around OpenCL frameworks serializing the calls
- * internally within a single process. */
- foreach (OpenCLProgram *program, programs) {
- if (!program->load()) {
- load_kernel_num_compiling++;
- load_kernel_task_pool.push([=] {
- program->compile();
- load_kernel_num_compiling--;
- });
- }
- }
- return true;
-}
-
-void OpenCLDevice::load_required_kernels(const DeviceRequestedFeatures &requested_features)
-{
- vector<OpenCLProgram *> programs;
- base_program = OpenCLProgram(
- this, "base", "kernel_base.cl", get_build_options(requested_features, "base"));
- base_program.add_kernel(ustring("convert_to_byte"));
- base_program.add_kernel(ustring("convert_to_half_float"));
- base_program.add_kernel(ustring("zero_buffer"));
- programs.push_back(&base_program);
-
- if (requested_features.use_true_displacement) {
- displace_program = OpenCLProgram(
- this, "displace", "kernel_displace.cl", get_build_options(requested_features, "displace"));
- displace_program.add_kernel(ustring("displace"));
- programs.push_back(&displace_program);
- }
-
- if (requested_features.use_background_light) {
- background_program = OpenCLProgram(this,
- "background",
- "kernel_background.cl",
- get_build_options(requested_features, "background"));
- background_program.add_kernel(ustring("background"));
- programs.push_back(&background_program);
- }
-
- if (requested_features.use_baking) {
- bake_program = OpenCLProgram(
- this, "bake", "kernel_bake.cl", get_build_options(requested_features, "bake"));
- bake_program.add_kernel(ustring("bake"));
- programs.push_back(&bake_program);
- }
-
- foreach (OpenCLProgram *program, programs) {
- if (!program->load()) {
- load_required_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program));
- }
- }
-}
-
-bool OpenCLDevice::wait_for_availability(const DeviceRequestedFeatures &requested_features)
-{
- if (requested_features.use_baking) {
- /* For baking, kernels have already been loaded in load_required_kernels(). */
- return true;
- }
-
- load_kernel_task_pool.wait_work();
- return split_kernel->load_kernels(requested_features);
-}
-
-OpenCLDevice::OpenCLSplitPrograms *OpenCLDevice::get_split_programs()
-{
- return &kernel_programs;
-}
-
-DeviceKernelStatus OpenCLDevice::get_active_kernel_switch_state()
-{
- return DEVICE_KERNEL_USING_FEATURE_KERNEL;
-}
-
-void OpenCLDevice::mem_alloc(device_memory &mem)
-{
- if (mem.name) {
- VLOG(1) << "Buffer allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
- }
-
- size_t size = mem.memory_size();
-
- /* check there is enough memory available for the allocation */
- cl_ulong max_alloc_size = 0;
- clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL);
-
- if (DebugFlags().opencl.mem_limit) {
- max_alloc_size = min(max_alloc_size, cl_ulong(DebugFlags().opencl.mem_limit - stats.mem_used));
- }
-
- if (size > max_alloc_size) {
- string error = "Scene too complex to fit in available memory.";
- if (mem.name != NULL) {
- error += string_printf(" (allocating buffer %s failed.)", mem.name);
- }
- set_error(error);
-
- return;
- }
-
- cl_mem_flags mem_flag;
- void *mem_ptr = NULL;
-
- if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL)
- mem_flag = CL_MEM_READ_ONLY;
- else
- mem_flag = CL_MEM_READ_WRITE;
-
- /* Zero-size allocation might be invoked by the render, but is not really
- * supported by OpenCL. Using NULL as the device pointer also doesn't really
- * work for some reason, so for the time being we use a special case
- * with the null_mem buffer.
- */
- if (size != 0) {
- mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, mem_flag, size, mem_ptr, &ciErr);
- opencl_assert_err(ciErr, "clCreateBuffer");
- }
- else {
- mem.device_pointer = 0;
- }
-
- stats.mem_alloc(size);
- mem.device_size = size;
-}
-
-void OpenCLDevice::mem_copy_to(device_memory &mem)
-{
- if (mem.type == MEM_GLOBAL) {
- global_free(mem);
- global_alloc(mem);
- }
- else if (mem.type == MEM_TEXTURE) {
- tex_free((device_texture &)mem);
- tex_alloc((device_texture &)mem);
- }
- else {
- if (!mem.device_pointer) {
- mem_alloc(mem);
- }
-
- /* this is blocking */
- size_t size = mem.memory_size();
- if (size != 0) {
- opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
- CL_MEM_PTR(mem.device_pointer),
- CL_TRUE,
- 0,
- size,
- mem.host_pointer,
- 0,
- NULL,
- NULL));
- }
- }
-}
-
-void OpenCLDevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
-{
- size_t offset = elem * y * w;
- size_t size = elem * w * h;
- assert(size != 0);
- opencl_assert(clEnqueueReadBuffer(cqCommandQueue,
- CL_MEM_PTR(mem.device_pointer),
- CL_TRUE,
- offset,
- size,
- (uchar *)mem.host_pointer + offset,
- 0,
- NULL,
- NULL));
-}
-
-void OpenCLDevice::mem_zero_kernel(device_ptr mem, size_t size)
-{
- base_program.wait_for_availability();
- cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
-
- size_t global_size[] = {1024, 1024};
- size_t num_threads = global_size[0] * global_size[1];
-
- cl_mem d_buffer = CL_MEM_PTR(mem);
- cl_ulong d_offset = 0;
- cl_ulong d_size = 0;
-
- while (d_offset < size) {
- d_size = std::min<cl_ulong>(num_threads * sizeof(float4), size - d_offset);
-
- kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
-
- ciErr = clEnqueueNDRangeKernel(
- cqCommandQueue, ckZeroBuffer, 2, NULL, global_size, NULL, 0, NULL, NULL);
- opencl_assert_err(ciErr, "clEnqueueNDRangeKernel");
-
- d_offset += d_size;
- }
-}
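
The loop above clears at most num_threads * sizeof(float4) bytes per kernel launch. With the
1024 x 1024 global size used here:

    num_threads    = 1024 * 1024              = 1,048,576
    bytes per pass = 1,048,576 * 16 (float4)  = 16 MiB
    a 100 MiB buffer therefore takes 6 full 16 MiB passes plus one final 4 MiB pass = 7 launches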
-
-void OpenCLDevice::mem_zero(device_memory &mem)
-{
- if (!mem.device_pointer) {
- mem_alloc(mem);
- }
-
- if (mem.device_pointer) {
- if (base_program.is_loaded()) {
- mem_zero_kernel(mem.device_pointer, mem.memory_size());
- }
-
- if (mem.host_pointer) {
- memset(mem.host_pointer, 0, mem.memory_size());
- }
-
- if (!base_program.is_loaded()) {
- void *zero = mem.host_pointer;
-
- if (!mem.host_pointer) {
- zero = util_aligned_malloc(mem.memory_size(), 16);
- memset(zero, 0, mem.memory_size());
- }
-
- opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
- CL_MEM_PTR(mem.device_pointer),
- CL_TRUE,
- 0,
- mem.memory_size(),
- zero,
- 0,
- NULL,
- NULL));
-
- if (!mem.host_pointer) {
- util_aligned_free(zero);
- }
- }
- }
-}
-
-void OpenCLDevice::mem_free(device_memory &mem)
-{
- if (mem.type == MEM_GLOBAL) {
- global_free(mem);
- }
- else if (mem.type == MEM_TEXTURE) {
- tex_free((device_texture &)mem);
- }
- else {
- if (mem.device_pointer) {
- if (mem.device_pointer != 0) {
- opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer)));
- }
- mem.device_pointer = 0;
-
- stats.mem_free(mem.device_size);
- mem.device_size = 0;
- }
- }
-}
-
-int OpenCLDevice::mem_sub_ptr_alignment()
-{
- return OpenCLInfo::mem_sub_ptr_alignment(cdDevice);
-}
-
-device_ptr OpenCLDevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int size)
-{
- cl_mem_flags mem_flag;
- if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL)
- mem_flag = CL_MEM_READ_ONLY;
- else
- mem_flag = CL_MEM_READ_WRITE;
-
- cl_buffer_region info;
- info.origin = mem.memory_elements_size(offset);
- info.size = mem.memory_elements_size(size);
-
- device_ptr sub_buf = (device_ptr)clCreateSubBuffer(
- CL_MEM_PTR(mem.device_pointer), mem_flag, CL_BUFFER_CREATE_TYPE_REGION, &info, &ciErr);
- opencl_assert_err(ciErr, "clCreateSubBuffer");
- return sub_buf;
-}
-
-void OpenCLDevice::mem_free_sub_ptr(device_ptr device_pointer)
-{
- if (device_pointer != 0) {
- opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer)));
- }
-}
-
-void OpenCLDevice::const_copy_to(const char *name, void *host, size_t size)
-{
- ConstMemMap::iterator i = const_mem_map.find(name);
- device_vector<uchar> *data;
-
- if (i == const_mem_map.end()) {
- data = new device_vector<uchar>(this, name, MEM_READ_ONLY);
- data->alloc(size);
- const_mem_map.insert(ConstMemMap::value_type(name, data));
- }
- else {
- data = i->second;
- }
-
- memcpy(data->data(), host, size);
- data->copy_to_device();
-}
-
-void OpenCLDevice::global_alloc(device_memory &mem)
-{
- VLOG(1) << "Global memory allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
-
- memory_manager.alloc(mem.name, mem);
- /* Set the pointer to non-null to keep code that inspects its value from thinking it's
- * unallocated. */
- mem.device_pointer = 1;
- textures[mem.name] = &mem;
- textures_need_update = true;
-}
-
-void OpenCLDevice::global_free(device_memory &mem)
-{
- if (mem.device_pointer) {
- mem.device_pointer = 0;
-
- if (memory_manager.free(mem)) {
- textures_need_update = true;
- }
-
- foreach (TexturesMap::value_type &value, textures) {
- if (value.second == &mem) {
- textures.erase(value.first);
- break;
- }
- }
- }
-}
-
-void OpenCLDevice::tex_alloc(device_texture &mem)
-{
- VLOG(1) << "Texture allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
-
- memory_manager.alloc(mem.name, mem);
- /* Set the pointer to non-null to keep code that inspects its value from thinking it's
- * unallocated. */
- mem.device_pointer = 1;
- textures[mem.name] = &mem;
- textures_need_update = true;
-}
-
-void OpenCLDevice::tex_free(device_texture &mem)
-{
- global_free(mem);
-}
-
-size_t OpenCLDevice::global_size_round_up(int group_size, int global_size)
-{
- int r = global_size % group_size;
- return global_size + ((r == 0) ? 0 : group_size - r);
-}
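
For example, with a group size of 64:

    global_size_round_up(64, 1000)  ->  r = 1000 % 64 = 40  ->  1000 + (64 - 40) = 1024
    global_size_round_up(64, 1024)  ->  r = 0               ->  1024 (already a multiple)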
-
-void OpenCLDevice::enqueue_kernel(
- cl_kernel kernel, size_t w, size_t h, bool x_workgroups, size_t max_workgroup_size)
-{
- size_t workgroup_size, max_work_items[3];
-
- clGetKernelWorkGroupInfo(
- kernel, cdDevice, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
- clGetDeviceInfo(
- cdDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, max_work_items, NULL);
-
- if (max_workgroup_size > 0 && workgroup_size > max_workgroup_size) {
- workgroup_size = max_workgroup_size;
- }
-
- /* Try to divide evenly over 2 dimensions. */
- size_t local_size[2];
- if (x_workgroups) {
- local_size[0] = workgroup_size;
- local_size[1] = 1;
- }
- else {
- size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
- local_size[0] = local_size[1] = sqrt_workgroup_size;
- }
-
- /* Some implementations have max size 1 on 2nd dimension. */
- if (local_size[1] > max_work_items[1]) {
- local_size[0] = workgroup_size / max_work_items[1];
- local_size[1] = max_work_items[1];
- }
-
- size_t global_size[2] = {global_size_round_up(local_size[0], w),
- global_size_round_up(local_size[1], h)};
-
- /* Vertical size of 1 is coming from bake/shade kernels where we should
- * not round anything up because otherwise we'll either be doing too
- * much work per pixel (if we don't check global ID on Y axis) or will
- * be checking for global ID to always have Y of 0.
- */
- if (h == 1) {
- global_size[h] = 1;
- }
-
- /* run kernel */
- opencl_assert(
- clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
- opencl_assert(clFlush(cqCommandQueue));
-}
-
-void OpenCLDevice::set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name)
-{
- cl_mem ptr;
-
- MemMap::iterator i = mem_map.find(name);
- if (i != mem_map.end()) {
- ptr = CL_MEM_PTR(i->second);
- }
- else {
- ptr = 0;
- }
-
- opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void *)&ptr));
-}
-
-void OpenCLDevice::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg)
-{
- flush_texture_buffers();
-
- memory_manager.set_kernel_arg_buffers(kernel, narg);
-}
-
-void OpenCLDevice::flush_texture_buffers()
-{
- if (!textures_need_update) {
- return;
- }
- textures_need_update = false;
-
- /* Setup slots for textures. */
- int num_slots = 0;
-
- vector<texture_slot_t> texture_slots;
-
-# define KERNEL_TEX(type, name) \
- if (textures.find(#name) != textures.end()) { \
- texture_slots.push_back(texture_slot_t(#name, num_slots)); \
- } \
- num_slots++;
-# include "kernel/kernel_textures.h"
-
- int num_data_slots = num_slots;
-
- foreach (TexturesMap::value_type &tex, textures) {
- string name = tex.first;
- device_memory *mem = tex.second;
-
- if (mem->type == MEM_TEXTURE) {
- const uint id = ((device_texture *)mem)->slot;
- texture_slots.push_back(texture_slot_t(name, num_data_slots + id));
- num_slots = max(num_slots, num_data_slots + id + 1);
- }
- }
-
- /* Realloc texture descriptors buffer. */
- memory_manager.free(texture_info);
- texture_info.resize(num_slots);
- memory_manager.alloc("texture_info", texture_info);
-
- /* Fill in descriptors */
- foreach (texture_slot_t &slot, texture_slots) {
- device_memory *mem = textures[slot.name];
- TextureInfo &info = texture_info[slot.slot];
-
- MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name);
-
- if (mem->type == MEM_TEXTURE) {
- info = ((device_texture *)mem)->info;
- }
- else {
- memset(&info, 0, sizeof(TextureInfo));
- }
-
- info.data = desc.offset;
- info.cl_buffer = desc.device_buffer;
- }
-
- /* Force write of descriptors. */
- memory_manager.free(texture_info);
- memory_manager.alloc("texture_info", texture_info);
-}
-
-void OpenCLDevice::thread_run(DeviceTask &task)
-{
- flush_texture_buffers();
-
- if (task.type == DeviceTask::RENDER) {
- RenderTile tile;
- DenoisingTask denoising(this, task);
-
- /* Allocate buffer for kernel globals */
- device_only_memory<KernelGlobalsDummy> kgbuffer(this, "kernel_globals");
- kgbuffer.alloc_to_device(1);
-
- /* Keep rendering tiles until done. */
- while (task.acquire_tile(this, tile, task.tile_types)) {
- if (tile.task == RenderTile::PATH_TRACE) {
- assert(tile.task == RenderTile::PATH_TRACE);
- scoped_timer timer(&tile.buffers->render_time);
-
- split_kernel->path_trace(task, tile, kgbuffer, *const_mem_map["__data"]);
-
- /* Complete kernel execution before releasing the tile. */
- /* This helps in multi-device renders:
- * the device that reaches the critical-section function
- * release_tile waits for all of its kernels to complete (stalling
- * other devices from entering release_tile). If device1 (a
- * slow-render device) reached release_tile first, it would
- * stall device2 (a fast-render device) from proceeding to render
- * the next tile.
- */
- clFinish(cqCommandQueue);
- }
- else if (tile.task == RenderTile::BAKE) {
- bake(task, tile);
- }
- else if (tile.task == RenderTile::DENOISE) {
- tile.sample = tile.start_sample + tile.num_samples;
- denoise(tile, denoising);
- task.update_progress(&tile, tile.w * tile.h);
- }
-
- task.release_tile(tile);
- }
-
- kgbuffer.free();
- }
- else if (task.type == DeviceTask::SHADER) {
- shader(task);
- }
- else if (task.type == DeviceTask::FILM_CONVERT) {
- film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
- }
- else if (task.type == DeviceTask::DENOISE_BUFFER) {
- RenderTile tile;
- tile.x = task.x;
- tile.y = task.y;
- tile.w = task.w;
- tile.h = task.h;
- tile.buffer = task.buffer;
- tile.sample = task.sample + task.num_samples;
- tile.num_samples = task.num_samples;
- tile.start_sample = task.sample;
- tile.offset = task.offset;
- tile.stride = task.stride;
- tile.buffers = task.buffers;
-
- DenoisingTask denoising(this, task);
- denoise(tile, denoising);
- task.update_progress(&tile, tile.w * tile.h);
- }
-}
-
-void OpenCLDevice::film_convert(DeviceTask &task,
- device_ptr buffer,
- device_ptr rgba_byte,
- device_ptr rgba_half)
-{
- /* cast arguments to cl types */
- cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
- cl_mem d_rgba = (rgba_byte) ? CL_MEM_PTR(rgba_byte) : CL_MEM_PTR(rgba_half);
- cl_mem d_buffer = CL_MEM_PTR(buffer);
- cl_int d_x = task.x;
- cl_int d_y = task.y;
- cl_int d_w = task.w;
- cl_int d_h = task.h;
- cl_float d_sample_scale = 1.0f / (task.sample + 1);
- cl_int d_offset = task.offset;
- cl_int d_stride = task.stride;
-
- cl_kernel ckFilmConvertKernel = (rgba_byte) ? base_program(ustring("convert_to_byte")) :
- base_program(ustring("convert_to_half_float"));
-
- cl_uint start_arg_index = kernel_set_args(ckFilmConvertKernel, 0, d_data, d_rgba, d_buffer);
-
- set_kernel_arg_buffers(ckFilmConvertKernel, &start_arg_index);
-
- start_arg_index += kernel_set_args(ckFilmConvertKernel,
- start_arg_index,
- d_sample_scale,
- d_x,
- d_y,
- d_w,
- d_h,
- d_offset,
- d_stride);
-
- enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
-}
-
-bool OpenCLDevice::denoising_non_local_means(device_ptr image_ptr,
- device_ptr guide_ptr,
- device_ptr variance_ptr,
- device_ptr out_ptr,
- DenoisingTask *task)
-{
- int stride = task->buffer.stride;
- int w = task->buffer.width;
- int h = task->buffer.h;
- int r = task->nlm_state.r;
- int f = task->nlm_state.f;
- float a = task->nlm_state.a;
- float k_2 = task->nlm_state.k_2;
-
- int pass_stride = task->buffer.pass_stride;
- int num_shifts = (2 * r + 1) * (2 * r + 1);
- int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;
-
- device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride * num_shifts);
- device_sub_ptr blurDifference(
- task->buffer.temporary_mem, pass_stride * num_shifts, pass_stride * num_shifts);
- device_sub_ptr weightAccum(
- task->buffer.temporary_mem, 2 * pass_stride * num_shifts, pass_stride);
- cl_mem weightAccum_mem = CL_MEM_PTR(*weightAccum);
- cl_mem difference_mem = CL_MEM_PTR(*difference);
- cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference);
-
- cl_mem image_mem = CL_MEM_PTR(image_ptr);
- cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
- cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
- cl_mem out_mem = CL_MEM_PTR(out_ptr);
- cl_mem scale_mem = NULL;
-
- mem_zero_kernel(*weightAccum, sizeof(float) * pass_stride);
- mem_zero_kernel(out_ptr, sizeof(float) * pass_stride);
-
- cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
- cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur"));
- cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight"));
- cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output"));
- cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize"));
-
- kernel_set_args(ckNLMCalcDifference,
- 0,
- guide_mem,
- variance_mem,
- scale_mem,
- difference_mem,
- w,
- h,
- stride,
- pass_stride,
- r,
- channel_offset,
- 0,
- a,
- k_2);
- kernel_set_args(
- ckNLMBlur, 0, difference_mem, blurDifference_mem, w, h, stride, pass_stride, r, f);
- kernel_set_args(
- ckNLMCalcWeight, 0, blurDifference_mem, difference_mem, w, h, stride, pass_stride, r, f);
- kernel_set_args(ckNLMUpdateOutput,
- 0,
- blurDifference_mem,
- image_mem,
- out_mem,
- weightAccum_mem,
- w,
- h,
- stride,
- pass_stride,
- channel_offset,
- r,
- f);
-
- enqueue_kernel(ckNLMCalcDifference, w * h, num_shifts, true);
- enqueue_kernel(ckNLMBlur, w * h, num_shifts, true);
- enqueue_kernel(ckNLMCalcWeight, w * h, num_shifts, true);
- enqueue_kernel(ckNLMBlur, w * h, num_shifts, true);
- enqueue_kernel(ckNLMUpdateOutput, w * h, num_shifts, true);
-
- kernel_set_args(ckNLMNormalize, 0, out_mem, weightAccum_mem, w, h, stride);
- enqueue_kernel(ckNLMNormalize, w, h);
-
- return true;
-}
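
The non-local-means kernels above evaluate one offset per pixel for every shift in a (2r+1) x (2r+1) search window, which is why the temporary memory is carved into pass_stride * num_shifts sized slices. A small standalone sketch of that sizing, using hypothetical numbers rather than values from a real DenoisingTask:

#include <cstdio>

int main()
{
  /* Hypothetical values; in Cycles they come from the DenoisingTask. */
  const int r = 8;                        /* NLM search radius. */
  const long long pass_stride = 1 << 20;  /* Pixels per feature pass. */

  const int num_shifts = (2 * r + 1) * (2 * r + 1); /* 17 * 17 = 289 shifts. */

  /* Matches the device_sub_ptr layout above: difference and blurDifference
   * each take pass_stride * num_shifts floats, weightAccum takes pass_stride. */
  const long long temp_floats = 2LL * pass_stride * num_shifts + pass_stride;
  printf("num_shifts = %d, temporary floats = %lld\n", num_shifts, temp_floats);
  return 0;
}
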
-
-bool OpenCLDevice::denoising_construct_transform(DenoisingTask *task)
-{
- cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
- cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
- cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
- cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
-
- char use_time = task->buffer.use_time ? 1 : 0;
-
- cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform"));
-
- int arg_ofs = kernel_set_args(ckFilterConstructTransform, 0, buffer_mem, tile_info_mem);
- cl_mem buffers[9];
- for (int i = 0; i < 9; i++) {
- buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
- arg_ofs += kernel_set_args(ckFilterConstructTransform, arg_ofs, buffers[i]);
- }
- kernel_set_args(ckFilterConstructTransform,
- arg_ofs,
- transform_mem,
- rank_mem,
- task->filter_area,
- task->rect,
- task->buffer.pass_stride,
- task->buffer.frame_stride,
- use_time,
- task->radius,
- task->pca_threshold);
-
- enqueue_kernel(ckFilterConstructTransform, task->storage.w, task->storage.h, 256);
-
- return true;
-}
-
-bool OpenCLDevice::denoising_accumulate(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr scale_ptr,
- int frame,
- DenoisingTask *task)
-{
- cl_mem color_mem = CL_MEM_PTR(color_ptr);
- cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr);
- cl_mem scale_mem = CL_MEM_PTR(scale_ptr);
-
- cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
- cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
- cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
- cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
- cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);
-
- cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
- cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur"));
- cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight"));
- cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian"));
-
- int w = task->reconstruction_state.source_w;
- int h = task->reconstruction_state.source_h;
- int stride = task->buffer.stride;
- int frame_offset = frame * task->buffer.frame_stride;
- int t = task->tile_info->frames[frame];
- char use_time = task->buffer.use_time ? 1 : 0;
-
- int r = task->radius;
- int pass_stride = task->buffer.pass_stride;
- int num_shifts = (2 * r + 1) * (2 * r + 1);
-
- device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride * num_shifts);
- device_sub_ptr blurDifference(
- task->buffer.temporary_mem, pass_stride * num_shifts, pass_stride * num_shifts);
- cl_mem difference_mem = CL_MEM_PTR(*difference);
- cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference);
-
- kernel_set_args(ckNLMCalcDifference,
- 0,
- color_mem,
- color_variance_mem,
- scale_mem,
- difference_mem,
- w,
- h,
- stride,
- pass_stride,
- r,
- pass_stride,
- frame_offset,
- 1.0f,
- task->nlm_k_2);
- kernel_set_args(
- ckNLMBlur, 0, difference_mem, blurDifference_mem, w, h, stride, pass_stride, r, 4);
- kernel_set_args(
- ckNLMCalcWeight, 0, blurDifference_mem, difference_mem, w, h, stride, pass_stride, r, 4);
- kernel_set_args(ckNLMConstructGramian,
- 0,
- t,
- blurDifference_mem,
- buffer_mem,
- transform_mem,
- rank_mem,
- XtWX_mem,
- XtWY_mem,
- task->reconstruction_state.filter_window,
- w,
- h,
- stride,
- pass_stride,
- r,
- 4,
- frame_offset,
- use_time);
-
- enqueue_kernel(ckNLMCalcDifference, w * h, num_shifts, true);
- enqueue_kernel(ckNLMBlur, w * h, num_shifts, true);
- enqueue_kernel(ckNLMCalcWeight, w * h, num_shifts, true);
- enqueue_kernel(ckNLMBlur, w * h, num_shifts, true);
- enqueue_kernel(ckNLMConstructGramian, w * h, num_shifts, true, 256);
-
- return true;
-}
-
-bool OpenCLDevice::denoising_solve(device_ptr output_ptr, DenoisingTask *task)
-{
- cl_kernel ckFinalize = denoising_program(ustring("filter_finalize"));
-
- cl_mem output_mem = CL_MEM_PTR(output_ptr);
- cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
- cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
- cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);
-
- int w = task->reconstruction_state.source_w;
- int h = task->reconstruction_state.source_h;
-
- kernel_set_args(ckFinalize,
- 0,
- output_mem,
- rank_mem,
- XtWX_mem,
- XtWY_mem,
- task->filter_area,
- task->reconstruction_state.buffer_params,
- task->render_buffer.samples);
- enqueue_kernel(ckFinalize, w, h);
-
- return true;
-}
-
-bool OpenCLDevice::denoising_combine_halves(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- int r,
- int4 rect,
- DenoisingTask *task)
-{
- cl_mem a_mem = CL_MEM_PTR(a_ptr);
- cl_mem b_mem = CL_MEM_PTR(b_ptr);
- cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
- cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
-
- cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves"));
-
- kernel_set_args(ckFilterCombineHalves, 0, mean_mem, variance_mem, a_mem, b_mem, rect, r);
- enqueue_kernel(ckFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- return true;
-}
-
-bool OpenCLDevice::denoising_divide_shadow(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr sample_variance_ptr,
- device_ptr sv_variance_ptr,
- device_ptr buffer_variance_ptr,
- DenoisingTask *task)
-{
- cl_mem a_mem = CL_MEM_PTR(a_ptr);
- cl_mem b_mem = CL_MEM_PTR(b_ptr);
- cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr);
- cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr);
- cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr);
-
- cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
-
- cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow"));
-
- int arg_ofs = kernel_set_args(
- ckFilterDivideShadow, 0, task->render_buffer.samples, tile_info_mem);
- cl_mem buffers[9];
- for (int i = 0; i < 9; i++) {
- buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
- arg_ofs += kernel_set_args(ckFilterDivideShadow, arg_ofs, buffers[i]);
- }
- kernel_set_args(ckFilterDivideShadow,
- arg_ofs,
- a_mem,
- b_mem,
- sample_variance_mem,
- sv_variance_mem,
- buffer_variance_mem,
- task->rect,
- task->render_buffer.pass_stride,
- task->render_buffer.offset);
- enqueue_kernel(ckFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- return true;
-}
-
-bool OpenCLDevice::denoising_get_feature(int mean_offset,
- int variance_offset,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- float scale,
- DenoisingTask *task)
-{
- cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
- cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
-
- cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
-
- cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature"));
-
- int arg_ofs = kernel_set_args(ckFilterGetFeature, 0, task->render_buffer.samples, tile_info_mem);
- cl_mem buffers[9];
- for (int i = 0; i < 9; i++) {
- buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
- arg_ofs += kernel_set_args(ckFilterGetFeature, arg_ofs, buffers[i]);
- }
- kernel_set_args(ckFilterGetFeature,
- arg_ofs,
- mean_offset,
- variance_offset,
- mean_mem,
- variance_mem,
- scale,
- task->rect,
- task->render_buffer.pass_stride,
- task->render_buffer.offset);
- enqueue_kernel(ckFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- return true;
-}
-
-bool OpenCLDevice::denoising_write_feature(int out_offset,
- device_ptr from_ptr,
- device_ptr buffer_ptr,
- DenoisingTask *task)
-{
- cl_mem from_mem = CL_MEM_PTR(from_ptr);
- cl_mem buffer_mem = CL_MEM_PTR(buffer_ptr);
-
- cl_kernel ckFilterWriteFeature = denoising_program(ustring("filter_write_feature"));
-
- kernel_set_args(ckFilterWriteFeature,
- 0,
- task->render_buffer.samples,
- task->reconstruction_state.buffer_params,
- task->filter_area,
- from_mem,
- buffer_mem,
- out_offset,
- task->rect);
- enqueue_kernel(ckFilterWriteFeature, task->filter_area.z, task->filter_area.w);
-
- return true;
-}
-
-bool OpenCLDevice::denoising_detect_outliers(device_ptr image_ptr,
- device_ptr variance_ptr,
- device_ptr depth_ptr,
- device_ptr output_ptr,
- DenoisingTask *task)
-{
- cl_mem image_mem = CL_MEM_PTR(image_ptr);
- cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
- cl_mem depth_mem = CL_MEM_PTR(depth_ptr);
- cl_mem output_mem = CL_MEM_PTR(output_ptr);
-
- cl_kernel ckFilterDetectOutliers = denoising_program(ustring("filter_detect_outliers"));
-
- kernel_set_args(ckFilterDetectOutliers,
- 0,
- image_mem,
- variance_mem,
- depth_mem,
- output_mem,
- task->rect,
- task->buffer.pass_stride);
- enqueue_kernel(ckFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- return true;
-}
-
-void OpenCLDevice::denoise(RenderTile &rtile, DenoisingTask &denoising)
-{
- denoising.functions.construct_transform = function_bind(
- &OpenCLDevice::denoising_construct_transform, this, &denoising);
- denoising.functions.accumulate = function_bind(
- &OpenCLDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
- denoising.functions.solve = function_bind(&OpenCLDevice::denoising_solve, this, _1, &denoising);
- denoising.functions.divide_shadow = function_bind(
- &OpenCLDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.non_local_means = function_bind(
- &OpenCLDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
- denoising.functions.combine_halves = function_bind(
- &OpenCLDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
- denoising.functions.get_feature = function_bind(
- &OpenCLDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.write_feature = function_bind(
- &OpenCLDevice::denoising_write_feature, this, _1, _2, _3, &denoising);
- denoising.functions.detect_outliers = function_bind(
- &OpenCLDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
-
- denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
- denoising.render_buffer.samples = rtile.sample;
- denoising.buffer.gpu_temporary_mem = true;
-
- denoising.run_denoising(rtile);
-}
-
-void OpenCLDevice::shader(DeviceTask &task)
-{
- /* cast arguments to cl types */
- cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
- cl_mem d_input = CL_MEM_PTR(task.shader_input);
- cl_mem d_output = CL_MEM_PTR(task.shader_output);
- cl_int d_shader_eval_type = task.shader_eval_type;
- cl_int d_shader_filter = task.shader_filter;
- cl_int d_shader_x = task.shader_x;
- cl_int d_shader_w = task.shader_w;
- cl_int d_offset = task.offset;
-
- OpenCLDevice::OpenCLProgram *program = &background_program;
- if (task.shader_eval_type == SHADER_EVAL_DISPLACE) {
- program = &displace_program;
- }
- program->wait_for_availability();
- cl_kernel kernel = (*program)();
-
- cl_uint start_arg_index = kernel_set_args(kernel, 0, d_data, d_input, d_output);
-
- set_kernel_arg_buffers(kernel, &start_arg_index);
-
- start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_eval_type);
- if (task.shader_eval_type >= SHADER_EVAL_BAKE) {
- start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_filter);
- }
- start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_x, d_shader_w, d_offset);
-
- for (int sample = 0; sample < task.num_samples; sample++) {
-
- if (task.get_cancel())
- break;
-
- kernel_set_args(kernel, start_arg_index, sample);
-
- enqueue_kernel(kernel, task.shader_w, 1);
-
- clFinish(cqCommandQueue);
-
- task.update_progress(NULL);
- }
-}
-
-void OpenCLDevice::bake(DeviceTask &task, RenderTile &rtile)
-{
- scoped_timer timer(&rtile.buffers->render_time);
-
- /* Cast arguments to cl types. */
- cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
- cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
- cl_int d_x = rtile.x;
- cl_int d_y = rtile.y;
- cl_int d_w = rtile.w;
- cl_int d_h = rtile.h;
- cl_int d_offset = rtile.offset;
- cl_int d_stride = rtile.stride;
-
- bake_program.wait_for_availability();
- cl_kernel kernel = bake_program();
-
- cl_uint start_arg_index = kernel_set_args(kernel, 0, d_data, d_buffer);
-
- set_kernel_arg_buffers(kernel, &start_arg_index);
-
- start_arg_index += kernel_set_args(
- kernel, start_arg_index, d_x, d_y, d_w, d_h, d_offset, d_stride);
-
- int start_sample = rtile.start_sample;
- int end_sample = rtile.start_sample + rtile.num_samples;
-
- for (int sample = start_sample; sample < end_sample; sample++) {
- if (task.get_cancel()) {
- if (task.need_finish_queue == false)
- break;
- }
-
- kernel_set_args(kernel, start_arg_index, sample);
-
- enqueue_kernel(kernel, d_w, d_h);
- clFinish(cqCommandQueue);
-
- rtile.sample = sample + 1;
-
- task.update_progress(&rtile, rtile.w * rtile.h);
- }
-}
-
-static bool kernel_build_opencl_2(cl_device_id cdDevice)
-{
- /* Build with OpenCL 2.0 if available; this improves performance
- * with AMD OpenCL drivers on Windows and Linux (legacy drivers).
- * Note that OpenCL selects the highest 1.x version by default;
- * only for 2.0 do we need the explicit compiler flag. */
- int version_major, version_minor;
- if (OpenCLInfo::get_device_version(cdDevice, &version_major, &version_minor)) {
- if (version_major >= 2) {
- /* This appears to trigger a driver bug on Radeon RX cards with certain
- * driver versions, so don't use OpenCL 2.0 for those. */
- string device_name = OpenCLInfo::get_readable_device_name(cdDevice);
- if (string_startswith(device_name, "Radeon RX 4") ||
- string_startswith(device_name, "Radeon (TM) RX 4") ||
- string_startswith(device_name, "Radeon RX 5") ||
- string_startswith(device_name, "Radeon (TM) RX 5")) {
- char version[256] = "";
- int driver_major, driver_minor;
- clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
- if (sscanf(version, "OpenCL 2.0 AMD-APP (%d.%d)", &driver_major, &driver_minor) == 2) {
- return !(driver_major == 3075 && driver_minor <= 12);
- }
- }
-
- return true;
- }
- }
-
- return false;
-}
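
For reference, the driver gate above reduces to one sscanf of the CL_DEVICE_VERSION string plus a single known-bad release. A self-contained sketch of just that check, with made-up version strings:

#include <cstdio>

/* Mirrors the AMD-APP driver check in kernel_build_opencl_2() above: reject
 * only the known-bad 3075.12 (and older 3075.x point releases). */
static bool amd_driver_ok_for_cl20(const char *version)
{
  int driver_major, driver_minor;
  if (sscanf(version, "OpenCL 2.0 AMD-APP (%d.%d)", &driver_major, &driver_minor) == 2) {
    return !(driver_major == 3075 && driver_minor <= 12);
  }
  return true; /* Unparseable string: fall back to allowing OpenCL 2.0. */
}

int main()
{
  printf("%d\n", amd_driver_ok_for_cl20("OpenCL 2.0 AMD-APP (3075.12)")); /* 0 */
  printf("%d\n", amd_driver_ok_for_cl20("OpenCL 2.0 AMD-APP (3110.7)"));  /* 1 */
  return 0;
}
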
-
-string OpenCLDevice::kernel_build_options(const string *debug_src)
-{
- string build_options = "-cl-no-signed-zeros -cl-mad-enable ";
-
- if (kernel_build_opencl_2(cdDevice)) {
- build_options += "-cl-std=CL2.0 ";
- }
-
- if (platform_name == "NVIDIA CUDA") {
- build_options +=
- "-D__KERNEL_OPENCL_NVIDIA__ "
- "-cl-nv-maxrregcount=32 "
- "-cl-nv-verbose ";
-
- uint compute_capability_major, compute_capability_minor;
- clGetDeviceInfo(cdDevice,
- CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
- sizeof(cl_uint),
- &compute_capability_major,
- NULL);
- clGetDeviceInfo(cdDevice,
- CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
- sizeof(cl_uint),
- &compute_capability_minor,
- NULL);
-
- build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ",
- compute_capability_major * 100 + compute_capability_minor * 10);
- }
-
- else if (platform_name == "Apple")
- build_options += "-D__KERNEL_OPENCL_APPLE__ ";
-
- else if (platform_name == "AMD Accelerated Parallel Processing")
- build_options += "-D__KERNEL_OPENCL_AMD__ ";
-
- else if (platform_name == "Intel(R) OpenCL") {
- build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ ";
-
- /* Options for gdb source-level kernel debugging.
- * This currently segfaults on Linux.
- */
- if (OpenCLInfo::use_debug() && debug_src)
- build_options += "-g -s \"" + *debug_src + "\" ";
- }
-
- if (info.has_half_images) {
- build_options += "-D__KERNEL_CL_KHR_FP16__ ";
- }
-
- if (OpenCLInfo::use_debug()) {
- build_options += "-D__KERNEL_OPENCL_DEBUG__ ";
- }
-
-# ifdef WITH_NANOVDB
- if (info.has_nanovdb) {
- build_options += "-DWITH_NANOVDB ";
- }
-# endif
-
- return build_options;
-}
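
The __COMPUTE_CAPABILITY__ define above packs the NVIDIA compute capability into a single integer as major * 100 + minor * 10; for a hypothetical 6.1 device that gives 610:

#include <cstdio>

int main()
{
  /* Hypothetical device: compute capability 6.1. */
  const unsigned major = 6, minor = 1;
  printf("-D__COMPUTE_CAPABILITY__=%u\n", major * 100 + minor * 10); /* 610 */
  return 0;
}
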
-
-/* TODO(sergey): In the future we can use variadic templates, once
- * C++0x is allowed. That should allow us to clean this up a bit.
- */
-int OpenCLDevice::kernel_set_args(cl_kernel kernel,
- int start_argument_index,
- const ArgumentWrapper &arg1,
- const ArgumentWrapper &arg2,
- const ArgumentWrapper &arg3,
- const ArgumentWrapper &arg4,
- const ArgumentWrapper &arg5,
- const ArgumentWrapper &arg6,
- const ArgumentWrapper &arg7,
- const ArgumentWrapper &arg8,
- const ArgumentWrapper &arg9,
- const ArgumentWrapper &arg10,
- const ArgumentWrapper &arg11,
- const ArgumentWrapper &arg12,
- const ArgumentWrapper &arg13,
- const ArgumentWrapper &arg14,
- const ArgumentWrapper &arg15,
- const ArgumentWrapper &arg16,
- const ArgumentWrapper &arg17,
- const ArgumentWrapper &arg18,
- const ArgumentWrapper &arg19,
- const ArgumentWrapper &arg20,
- const ArgumentWrapper &arg21,
- const ArgumentWrapper &arg22,
- const ArgumentWrapper &arg23,
- const ArgumentWrapper &arg24,
- const ArgumentWrapper &arg25,
- const ArgumentWrapper &arg26,
- const ArgumentWrapper &arg27,
- const ArgumentWrapper &arg28,
- const ArgumentWrapper &arg29,
- const ArgumentWrapper &arg30,
- const ArgumentWrapper &arg31,
- const ArgumentWrapper &arg32,
- const ArgumentWrapper &arg33)
-{
- int current_arg_index = 0;
-# define FAKE_VARARG_HANDLE_ARG(arg) \
- do { \
- if (arg.pointer != NULL) { \
- opencl_assert(clSetKernelArg( \
- kernel, start_argument_index + current_arg_index, arg.size, arg.pointer)); \
- ++current_arg_index; \
- } \
- else { \
- return current_arg_index; \
- } \
- } while (false)
- FAKE_VARARG_HANDLE_ARG(arg1);
- FAKE_VARARG_HANDLE_ARG(arg2);
- FAKE_VARARG_HANDLE_ARG(arg3);
- FAKE_VARARG_HANDLE_ARG(arg4);
- FAKE_VARARG_HANDLE_ARG(arg5);
- FAKE_VARARG_HANDLE_ARG(arg6);
- FAKE_VARARG_HANDLE_ARG(arg7);
- FAKE_VARARG_HANDLE_ARG(arg8);
- FAKE_VARARG_HANDLE_ARG(arg9);
- FAKE_VARARG_HANDLE_ARG(arg10);
- FAKE_VARARG_HANDLE_ARG(arg11);
- FAKE_VARARG_HANDLE_ARG(arg12);
- FAKE_VARARG_HANDLE_ARG(arg13);
- FAKE_VARARG_HANDLE_ARG(arg14);
- FAKE_VARARG_HANDLE_ARG(arg15);
- FAKE_VARARG_HANDLE_ARG(arg16);
- FAKE_VARARG_HANDLE_ARG(arg17);
- FAKE_VARARG_HANDLE_ARG(arg18);
- FAKE_VARARG_HANDLE_ARG(arg19);
- FAKE_VARARG_HANDLE_ARG(arg20);
- FAKE_VARARG_HANDLE_ARG(arg21);
- FAKE_VARARG_HANDLE_ARG(arg22);
- FAKE_VARARG_HANDLE_ARG(arg23);
- FAKE_VARARG_HANDLE_ARG(arg24);
- FAKE_VARARG_HANDLE_ARG(arg25);
- FAKE_VARARG_HANDLE_ARG(arg26);
- FAKE_VARARG_HANDLE_ARG(arg27);
- FAKE_VARARG_HANDLE_ARG(arg28);
- FAKE_VARARG_HANDLE_ARG(arg29);
- FAKE_VARARG_HANDLE_ARG(arg30);
- FAKE_VARARG_HANDLE_ARG(arg31);
- FAKE_VARARG_HANDLE_ARG(arg32);
- FAKE_VARARG_HANDLE_ARG(arg33);
-# undef FAKE_VARARG_HANDLE_ARG
- return current_arg_index;
-}
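
The TODO above points at variadic templates as the eventual replacement for the 33 fake-vararg parameters. A rough, self-contained sketch of what that could look like; it never shipped in Cycles, and the OpenCL types are replaced with stand-ins so the snippet compiles on its own:

#include <cstddef>
#include <cstdio>

/* Stand-ins for cl_kernel and ArgumentWrapper, so the sketch is self-contained. */
typedef void *fake_kernel;
struct Arg {
  size_t size;
  const void *pointer;
};

/* Recursion ends when the parameter pack is empty, so no sentinel "null
 * pointer" argument is needed; the return value is the number of arguments
 * set, just like kernel_set_args() above. */
static int set_args(fake_kernel /*kernel*/, int /*start*/)
{
  return 0;
}

template<typename... Rest>
static int set_args(fake_kernel kernel, int start, const Arg &arg, const Rest &... rest)
{
  /* A real implementation would call clSetKernelArg(kernel, start, arg.size, arg.pointer). */
  printf("clSetKernelArg(kernel, %d, %zu, ...)\n", start, arg.size);
  return 1 + set_args(kernel, start + 1, rest...);
}

int main()
{
  int x = 7;
  float y = 0.5f;
  const Arg ax = {sizeof(x), &x}, ay = {sizeof(y), &y};
  return (set_args(nullptr, 0, ax, ay) == 2) ? 0 : 1;
}
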
-
-void OpenCLDevice::release_kernel_safe(cl_kernel kernel)
-{
- if (kernel) {
- clReleaseKernel(kernel);
- }
-}
-
-void OpenCLDevice::release_mem_object_safe(cl_mem mem)
-{
- if (mem != NULL) {
- clReleaseMemObject(mem);
- }
-}
-
-void OpenCLDevice::release_program_safe(cl_program program)
-{
- if (program) {
- clReleaseProgram(program);
- }
-}
-
-/* ** These helpers work around some compiler-specific bugs. ** */
-
-cl_program OpenCLDevice::load_cached_kernel(ustring key, thread_scoped_lock &cache_locker)
-{
- return OpenCLCache::get_program(cpPlatform, cdDevice, key, cache_locker);
-}
-
-void OpenCLDevice::store_cached_kernel(cl_program program,
- ustring key,
- thread_scoped_lock &cache_locker)
-{
- OpenCLCache::store_program(cpPlatform, cdDevice, program, key, cache_locker);
-}
-
-Device *opencl_create_split_device(DeviceInfo &info,
- Stats &stats,
- Profiler &profiler,
- bool background)
-{
- return new OpenCLDevice(info, stats, profiler, background);
-}
-
-CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/device/opencl/memory_manager.cpp b/intern/cycles/device/opencl/memory_manager.cpp
deleted file mode 100644
index 4330e07cb37..00000000000
--- a/intern/cycles/device/opencl/memory_manager.cpp
+++ /dev/null
@@ -1,264 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_OPENCL
-
-# include "util/util_foreach.h"
-
-# include "device/opencl/device_opencl.h"
-# include "device/opencl/memory_manager.h"
-
-CCL_NAMESPACE_BEGIN
-
-void MemoryManager::DeviceBuffer::add_allocation(Allocation &allocation)
-{
- allocations.push_back(&allocation);
-}
-
-void MemoryManager::DeviceBuffer::update_device_memory(OpenCLDevice *device)
-{
- bool need_realloc = false;
-
- /* Calculate total size and remove any freed. */
- size_t total_size = 0;
-
- for (int i = allocations.size() - 1; i >= 0; i--) {
- Allocation *allocation = allocations[i];
-
- /* Remove allocations that have been freed. */
- if (!allocation->mem || allocation->mem->memory_size() == 0) {
- allocation->device_buffer = NULL;
- allocation->size = 0;
-
- allocations.erase(allocations.begin() + i);
-
- need_realloc = true;
-
- continue;
- }
-
- /* Get actual size for allocation. */
- size_t alloc_size = align_up(allocation->mem->memory_size(), 16);
-
- if (allocation->size != alloc_size) {
- /* Allocation is either new or resized. */
- allocation->size = alloc_size;
- allocation->needs_copy_to_device = true;
-
- need_realloc = true;
- }
-
- total_size += alloc_size;
- }
-
- /* Always allocate a non-empty buffer; NULL pointers cause problems with some drivers. */
- total_size = std::max(total_size, (size_t)16);
-
- if (need_realloc) {
- cl_ulong max_buffer_size;
- clGetDeviceInfo(
- device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL);
-
- if (total_size > max_buffer_size) {
- device->set_error("Scene too complex to fit in available memory.");
- return;
- }
-
- device_only_memory<uchar> *new_buffer = new device_only_memory<uchar>(device,
- "memory manager buffer");
-
- new_buffer->alloc_to_device(total_size);
-
- size_t offset = 0;
-
- foreach (Allocation *allocation, allocations) {
- if (allocation->needs_copy_to_device) {
- /* Copy from host to device. */
- opencl_device_assert(device,
- clEnqueueWriteBuffer(device->cqCommandQueue,
- CL_MEM_PTR(new_buffer->device_pointer),
- CL_FALSE,
- offset,
- allocation->mem->memory_size(),
- allocation->mem->host_pointer,
- 0,
- NULL,
- NULL));
-
- allocation->needs_copy_to_device = false;
- }
- else {
- /* Fast copy from memory already on device. */
- opencl_device_assert(device,
- clEnqueueCopyBuffer(device->cqCommandQueue,
- CL_MEM_PTR(buffer->device_pointer),
- CL_MEM_PTR(new_buffer->device_pointer),
- allocation->desc.offset,
- offset,
- allocation->mem->memory_size(),
- 0,
- NULL,
- NULL));
- }
-
- allocation->desc.offset = offset;
- offset += allocation->size;
- }
-
- delete buffer;
-
- buffer = new_buffer;
- }
- else {
- assert(total_size == buffer->data_size);
-
- size_t offset = 0;
-
- foreach (Allocation *allocation, allocations) {
- if (allocation->needs_copy_to_device) {
- /* Copy from host to device. */
- opencl_device_assert(device,
- clEnqueueWriteBuffer(device->cqCommandQueue,
- CL_MEM_PTR(buffer->device_pointer),
- CL_FALSE,
- offset,
- allocation->mem->memory_size(),
- allocation->mem->host_pointer,
- 0,
- NULL,
- NULL));
-
- allocation->needs_copy_to_device = false;
- }
-
- offset += allocation->size;
- }
- }
-
- /* Not really necessary, but seems to improve responsiveness for some reason. */
- clFinish(device->cqCommandQueue);
-}
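
Each allocation is padded to a 16-byte boundary above before being packed into the shared device buffer. A standalone sketch of that padding, assuming align_up() behaves like the usual power-of-two rounding helper in the Cycles utilities:

#include <cassert>
#include <cstddef>

/* Round size up to a multiple of alignment (alignment must be a power of two). */
static size_t align_up(size_t size, size_t alignment)
{
  return (size + alignment - 1) & ~(alignment - 1);
}

int main()
{
  assert(align_up(100, 16) == 112); /* A 100-byte allocation occupies 112 bytes. */
  assert(align_up(128, 16) == 128); /* Already-aligned sizes are unchanged. */
  return 0;
}
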
-
-void MemoryManager::DeviceBuffer::free(OpenCLDevice *)
-{
- buffer->free();
-}
-
-MemoryManager::DeviceBuffer *MemoryManager::smallest_device_buffer()
-{
- DeviceBuffer *smallest = device_buffers;
-
- foreach (DeviceBuffer &device_buffer, device_buffers) {
- if (device_buffer.size < smallest->size) {
- smallest = &device_buffer;
- }
- }
-
- return smallest;
-}
-
-MemoryManager::MemoryManager(OpenCLDevice *device) : device(device), need_update(false)
-{
- foreach (DeviceBuffer &device_buffer, device_buffers) {
- device_buffer.buffer = new device_only_memory<uchar>(device, "memory manager buffer");
- }
-}
-
-void MemoryManager::free()
-{
- foreach (DeviceBuffer &device_buffer, device_buffers) {
- device_buffer.free(device);
- }
-}
-
-void MemoryManager::alloc(const char *name, device_memory &mem)
-{
- Allocation &allocation = allocations[name];
-
- allocation.mem = &mem;
- allocation.needs_copy_to_device = true;
-
- if (!allocation.device_buffer) {
- DeviceBuffer *device_buffer = smallest_device_buffer();
- allocation.device_buffer = device_buffer;
-
- allocation.desc.device_buffer = device_buffer - device_buffers;
-
- device_buffer->add_allocation(allocation);
-
- device_buffer->size += mem.memory_size();
- }
-
- need_update = true;
-}
-
-bool MemoryManager::free(device_memory &mem)
-{
- foreach (AllocationsMap::value_type &value, allocations) {
- Allocation &allocation = value.second;
- if (allocation.mem == &mem) {
-
- allocation.device_buffer->size -= mem.memory_size();
-
- allocation.mem = NULL;
- allocation.needs_copy_to_device = false;
-
- need_update = true;
- return true;
- }
- }
-
- return false;
-}
-
-MemoryManager::BufferDescriptor MemoryManager::get_descriptor(string name)
-{
- update_device_memory();
-
- Allocation &allocation = allocations[name];
- return allocation.desc;
-}
-
-void MemoryManager::update_device_memory()
-{
- if (!need_update) {
- return;
- }
-
- need_update = false;
-
- foreach (DeviceBuffer &device_buffer, device_buffers) {
- device_buffer.update_device_memory(device);
- }
-}
-
-void MemoryManager::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg)
-{
- update_device_memory();
-
- foreach (DeviceBuffer &device_buffer, device_buffers) {
- if (device_buffer.buffer->device_pointer) {
- device->kernel_set_args(kernel, (*narg)++, *device_buffer.buffer);
- }
- else {
- device->kernel_set_args(kernel, (*narg)++);
- }
- }
-}
-
-CCL_NAMESPACE_END
-
-#endif /* WITH_OPENCL */
diff --git a/intern/cycles/device/opencl/memory_manager.h b/intern/cycles/device/opencl/memory_manager.h
deleted file mode 100644
index 23624f837a6..00000000000
--- a/intern/cycles/device/opencl/memory_manager.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "device/device.h"
-
-#include "util/util_map.h"
-#include "util/util_string.h"
-#include "util/util_vector.h"
-
-#include "clew.h"
-
-CCL_NAMESPACE_BEGIN
-
-class OpenCLDevice;
-
-class MemoryManager {
- public:
- static const int NUM_DEVICE_BUFFERS = 8;
-
- struct BufferDescriptor {
- uint device_buffer;
- cl_ulong offset;
- };
-
- private:
- struct DeviceBuffer;
-
- struct Allocation {
- device_memory *mem;
-
- DeviceBuffer *device_buffer;
- size_t size; /* Size of actual allocation, may be larger than requested. */
-
- BufferDescriptor desc;
-
- bool needs_copy_to_device;
-
- Allocation() : mem(NULL), device_buffer(NULL), size(0), needs_copy_to_device(false)
- {
- }
- };
-
- struct DeviceBuffer {
- device_only_memory<uchar> *buffer;
- vector<Allocation *> allocations;
- size_t size; /* Size of all allocations. */
-
- DeviceBuffer() : buffer(NULL), size(0)
- {
- }
-
- ~DeviceBuffer()
- {
- delete buffer;
- buffer = NULL;
- }
-
- void add_allocation(Allocation &allocation);
-
- void update_device_memory(OpenCLDevice *device);
-
- void free(OpenCLDevice *device);
- };
-
- OpenCLDevice *device;
-
- DeviceBuffer device_buffers[NUM_DEVICE_BUFFERS];
-
- typedef unordered_map<string, Allocation> AllocationsMap;
- AllocationsMap allocations;
-
- bool need_update;
-
- DeviceBuffer *smallest_device_buffer();
-
- public:
- MemoryManager(OpenCLDevice *device);
-
- void free(); /* Free all memory. */
-
- void alloc(const char *name, device_memory &mem);
- bool free(device_memory &mem);
-
- BufferDescriptor get_descriptor(string name);
-
- void update_device_memory();
- void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg);
-};
-
-CCL_NAMESPACE_END
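
The header above is the whole contract: named allocations are packed into a small pool of device buffers and addressed by (device_buffer, offset) descriptors that the kernels later read. A toy, host-only sketch of that packing scheme; it uses plain std containers instead of device_only_memory, and "some_global_buffer" is a made-up name (texture_info is the descriptor array allocated in flush_texture_buffers() above):

#include <cstddef>
#include <cstdio>
#include <map>
#include <string>
#include <vector>

struct Descriptor {
  unsigned device_buffer; /* Index into the pooled buffers. */
  size_t offset;          /* Byte offset of the allocation inside that buffer. */
};

int main()
{
  std::vector<unsigned char> backing;            /* Stands in for one pooled device buffer. */
  std::map<std::string, Descriptor> descriptors; /* Equivalent of the allocations map + desc. */

  const std::pair<std::string, size_t> allocs[] = {{"some_global_buffer", 100},
                                                   {"texture_info", 36}};
  for (const auto &a : allocs) {
    const size_t aligned = (a.second + 15) & ~size_t(15); /* 16-byte alignment, as above. */
    descriptors[a.first] = {0, backing.size()};
    backing.resize(backing.size() + aligned);
  }

  for (const auto &d : descriptors) {
    printf("%s -> buffer %u, offset %zu\n",
           d.first.c_str(),
           d.second.device_buffer,
           d.second.offset);
  }
  return 0;
}
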
diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp
deleted file mode 100644
index 3929cf77f15..00000000000
--- a/intern/cycles/device/opencl/opencl_util.cpp
+++ /dev/null
@@ -1,1326 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_OPENCL
-
-# include "device/device_intern.h"
-# include "device/opencl/device_opencl.h"
-
-# include "util/util_debug.h"
-# include "util/util_logging.h"
-# include "util/util_md5.h"
-# include "util/util_path.h"
-# include "util/util_semaphore.h"
-# include "util/util_system.h"
-# include "util/util_time.h"
-
-using std::cerr;
-using std::endl;
-
-CCL_NAMESPACE_BEGIN
-
-OpenCLCache::Slot::ProgramEntry::ProgramEntry() : program(NULL), mutex(NULL)
-{
-}
-
-OpenCLCache::Slot::ProgramEntry::ProgramEntry(const ProgramEntry &rhs)
- : program(rhs.program), mutex(NULL)
-{
-}
-
-OpenCLCache::Slot::ProgramEntry::~ProgramEntry()
-{
- delete mutex;
-}
-
-OpenCLCache::Slot::Slot() : context_mutex(NULL), context(NULL)
-{
-}
-
-OpenCLCache::Slot::Slot(const Slot &rhs)
- : context_mutex(NULL), context(NULL), programs(rhs.programs)
-{
-}
-
-OpenCLCache::Slot::~Slot()
-{
- delete context_mutex;
-}
-
-OpenCLCache &OpenCLCache::global_instance()
-{
- static OpenCLCache instance;
- return instance;
-}
-
-cl_context OpenCLCache::get_context(cl_platform_id platform,
- cl_device_id device,
- thread_scoped_lock &slot_locker)
-{
- assert(platform != NULL);
-
- OpenCLCache &self = global_instance();
-
- thread_scoped_lock cache_lock(self.cache_lock);
-
- pair<CacheMap::iterator, bool> ins = self.cache.insert(
- CacheMap::value_type(PlatformDevicePair(platform, device), Slot()));
-
- Slot &slot = ins.first->second;
-
- /* create slot lock only while holding cache lock */
- if (!slot.context_mutex)
- slot.context_mutex = new thread_mutex;
-
- /* need to unlock cache before locking slot, to allow store to complete */
- cache_lock.unlock();
-
- /* lock the slot */
- slot_locker = thread_scoped_lock(*slot.context_mutex);
-
- /* If the thing isn't cached */
- if (slot.context == NULL) {
- /* return with the caller's lock holder holding the slot lock */
- return NULL;
- }
-
- /* the item was already cached, release the slot lock */
- slot_locker.unlock();
-
- cl_int ciErr = clRetainContext(slot.context);
- assert(ciErr == CL_SUCCESS);
- (void)ciErr;
-
- return slot.context;
-}
-
-cl_program OpenCLCache::get_program(cl_platform_id platform,
- cl_device_id device,
- ustring key,
- thread_scoped_lock &slot_locker)
-{
- assert(platform != NULL);
-
- OpenCLCache &self = global_instance();
-
- thread_scoped_lock cache_lock(self.cache_lock);
-
- pair<CacheMap::iterator, bool> ins = self.cache.insert(
- CacheMap::value_type(PlatformDevicePair(platform, device), Slot()));
-
- Slot &slot = ins.first->second;
-
- pair<Slot::EntryMap::iterator, bool> ins2 = slot.programs.insert(
- Slot::EntryMap::value_type(key, Slot::ProgramEntry()));
-
- Slot::ProgramEntry &entry = ins2.first->second;
-
- /* create slot lock only while holding cache lock */
- if (!entry.mutex)
- entry.mutex = new thread_mutex;
-
- /* need to unlock cache before locking slot, to allow store to complete */
- cache_lock.unlock();
-
- /* lock the slot */
- slot_locker = thread_scoped_lock(*entry.mutex);
-
- /* If the thing isn't cached */
- if (entry.program == NULL) {
- /* return with the caller's lock holder holding the slot lock */
- return NULL;
- }
-
- /* the item was already cached, release the slot lock */
- slot_locker.unlock();
-
- cl_int ciErr = clRetainProgram(entry.program);
- assert(ciErr == CL_SUCCESS);
- (void)ciErr;
-
- return entry.program;
-}
-
-void OpenCLCache::store_context(cl_platform_id platform,
- cl_device_id device,
- cl_context context,
- thread_scoped_lock &slot_locker)
-{
- assert(platform != NULL);
- assert(device != NULL);
- assert(context != NULL);
-
- OpenCLCache &self = global_instance();
-
- thread_scoped_lock cache_lock(self.cache_lock);
- CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device));
- cache_lock.unlock();
-
- Slot &slot = i->second;
-
- /* sanity check */
- assert(i != self.cache.end());
- assert(slot.context == NULL);
-
- slot.context = context;
-
- /* unlock the slot */
- slot_locker.unlock();
-
- /* increment reference count in OpenCL.
- * The caller is going to release the object when done with it. */
- cl_int ciErr = clRetainContext(context);
- assert(ciErr == CL_SUCCESS);
- (void)ciErr;
-}
-
-void OpenCLCache::store_program(cl_platform_id platform,
- cl_device_id device,
- cl_program program,
- ustring key,
- thread_scoped_lock &slot_locker)
-{
- assert(platform != NULL);
- assert(device != NULL);
- assert(program != NULL);
-
- OpenCLCache &self = global_instance();
-
- thread_scoped_lock cache_lock(self.cache_lock);
-
- CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device));
- assert(i != self.cache.end());
- Slot &slot = i->second;
-
- Slot::EntryMap::iterator i2 = slot.programs.find(key);
- assert(i2 != slot.programs.end());
- Slot::ProgramEntry &entry = i2->second;
-
- assert(entry.program == NULL);
-
- cache_lock.unlock();
-
- entry.program = program;
-
- /* unlock the slot */
- slot_locker.unlock();
-
- /* Increment reference count in OpenCL.
- * The caller is going to release the object when done with it.
- */
- cl_int ciErr = clRetainProgram(program);
- assert(ciErr == CL_SUCCESS);
- (void)ciErr;
-}
-
-string OpenCLCache::get_kernel_md5()
-{
- OpenCLCache &self = global_instance();
- thread_scoped_lock lock(self.kernel_md5_lock);
-
- if (self.kernel_md5.empty()) {
- self.kernel_md5 = path_files_md5_hash(path_get("source"));
- }
- return self.kernel_md5;
-}
-
-static string get_program_source(const string &kernel_file)
-{
- string source = "#include \"kernel/kernels/opencl/" + kernel_file + "\"\n";
- /* We compile kernels consisting of many files. Unfortunately, OpenCL
- * kernel caches do not seem to recognize changes in included files,
- * so we force a recompile on changes by adding the MD5 hash of all files.
- */
- source = path_source_replace_includes(source, path_get("source"));
- source += "\n// " + util_md5_string(source) + "\n";
- return source;
-}
-
-OpenCLDevice::OpenCLProgram::OpenCLProgram(OpenCLDevice *device,
- const string &program_name,
- const string &kernel_file,
- const string &kernel_build_options,
- bool use_stdout)
- : device(device),
- program_name(program_name),
- kernel_file(kernel_file),
- kernel_build_options(kernel_build_options),
- use_stdout(use_stdout)
-{
- loaded = false;
- needs_compiling = true;
- program = NULL;
-}
-
-OpenCLDevice::OpenCLProgram::~OpenCLProgram()
-{
- release();
-}
-
-void OpenCLDevice::OpenCLProgram::release()
-{
- for (map<ustring, cl_kernel>::iterator kernel = kernels.begin(); kernel != kernels.end();
- ++kernel) {
- if (kernel->second) {
- clReleaseKernel(kernel->second);
- kernel->second = NULL;
- }
- }
- if (program) {
- clReleaseProgram(program);
- program = NULL;
- }
-}
-
-void OpenCLDevice::OpenCLProgram::add_log(const string &msg, bool debug)
-{
- if (!use_stdout) {
- log += msg + "\n";
- }
- else if (!debug) {
- printf("%s\n", msg.c_str());
- fflush(stdout);
- }
- else {
- VLOG(2) << msg;
- }
-}
-
-void OpenCLDevice::OpenCLProgram::add_error(const string &msg)
-{
- if (use_stdout) {
- fprintf(stderr, "%s\n", msg.c_str());
- }
- if (error_msg == "") {
- error_msg += "\n";
- }
- error_msg += msg;
-}
-
-void OpenCLDevice::OpenCLProgram::add_kernel(ustring name)
-{
- if (!kernels.count(name)) {
- kernels[name] = NULL;
- }
-}
-
-bool OpenCLDevice::OpenCLProgram::build_kernel(const string *debug_src)
-{
- string build_options;
- build_options = device->kernel_build_options(debug_src) + kernel_build_options;
-
- VLOG(1) << "Build options passed to clBuildProgram: '" << build_options << "'.";
- cl_int ciErr = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL);
-
- /* show warnings even if build is successful */
- size_t ret_val_size = 0;
-
- clGetProgramBuildInfo(program, device->cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
-
- if (ciErr != CL_SUCCESS) {
- add_error(string("OpenCL build failed with error ") + clewErrorString(ciErr) +
- ", errors in console.");
- }
-
- if (ret_val_size > 1) {
- vector<char> build_log(ret_val_size + 1);
- clGetProgramBuildInfo(
- program, device->cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], NULL);
-
- build_log[ret_val_size] = '\0';
- /* Skip meaningless empty output from the NVidia compiler. */
- if (!(ret_val_size == 2 && build_log[0] == '\n')) {
- add_log(string("OpenCL program ") + program_name + " build output: " + string(&build_log[0]),
- ciErr == CL_SUCCESS);
- }
- }
-
- return (ciErr == CL_SUCCESS);
-}
-
-bool OpenCLDevice::OpenCLProgram::compile_kernel(const string *debug_src)
-{
- string source = get_program_source(kernel_file);
-
- if (debug_src) {
- path_write_text(*debug_src, source);
- }
-
- size_t source_len = source.size();
- const char *source_str = source.c_str();
- cl_int ciErr;
-
- program = clCreateProgramWithSource(device->cxContext, 1, &source_str, &source_len, &ciErr);
-
- if (ciErr != CL_SUCCESS) {
- add_error(string("OpenCL program creation failed: ") + clewErrorString(ciErr));
- return false;
- }
-
- double starttime = time_dt();
- add_log(string("Cycles: compiling OpenCL program ") + program_name + "...", false);
- add_log(string("Build flags: ") + kernel_build_options, true);
-
- if (!build_kernel(debug_src))
- return false;
-
- double elapsed = time_dt() - starttime;
- add_log(
- string_printf("Kernel compilation of %s finished in %.2lfs.", program_name.c_str(), elapsed),
- false);
-
- return true;
-}
-
-static void escape_python_string(string &str)
-{
- /* Escape the string so it can be passed as a Python raw string with '' quotes. */
- string_replace(str, "'", "\'");
-}
-
-static int opencl_compile_process_limit()
-{
- /* Limit the number of concurrent compile processes, with a heuristic based
- * on total physical RAM and an estimate of the memory needed when compiling
- * with all Cycles features enabled.
- *
- * This is somewhat arbitrary, as we don't know the actual available RAM or
- * how much memory the kernel compilation will need depending on the features,
- * but it is better than not limiting at all. */
- static const int64_t GB = 1024LL * 1024LL * 1024LL;
- static const int64_t process_memory = 2 * GB;
- static const int64_t base_memory = 2 * GB;
- static const int64_t system_memory = system_physical_ram();
- static const int64_t process_limit = (system_memory - base_memory) / process_memory;
-
- return max((int)process_limit, 1);
-}
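
With the 2 GB base and 2 GB per-process figures above, a hypothetical 16 GB machine gets (16 - 2) / 2 = 7 concurrent compile processes, while low-memory machines are clamped to a single process:

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main()
{
  const int64_t GB = 1024LL * 1024LL * 1024LL;
  const int64_t base_memory = 2 * GB, process_memory = 2 * GB;

  /* Hypothetical machines; system_physical_ram() provides this value in Cycles. */
  const int64_t machines[] = {4 * GB, 16 * GB, 64 * GB};
  for (const int64_t system_memory : machines) {
    const int64_t limit = (system_memory - base_memory) / process_memory;
    printf("%lld GB RAM -> %d compile processes\n",
           (long long)(system_memory / GB),
           std::max((int)limit, 1)); /* 1, 7 and 31 respectively. */
  }
  return 0;
}
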
-
-bool OpenCLDevice::OpenCLProgram::compile_separate(const string &clbin)
-{
- /* Construct arguments. */
- vector<string> args;
- args.push_back("--background");
- args.push_back("--factory-startup");
- args.push_back("--python-expr");
-
- int device_platform_id = device->device_num;
- string device_name = device->device_name;
- string platform_name = device->platform_name;
- string build_options = device->kernel_build_options(NULL) + kernel_build_options;
- string kernel_file_escaped = kernel_file;
- string clbin_escaped = clbin;
-
- escape_python_string(device_name);
- escape_python_string(platform_name);
- escape_python_string(build_options);
- escape_python_string(kernel_file_escaped);
- escape_python_string(clbin_escaped);
-
- args.push_back(string_printf(
- "import _cycles; _cycles.opencl_compile(r'%d', r'%s', r'%s', r'%s', r'%s', r'%s')",
- device_platform_id,
- device_name.c_str(),
- platform_name.c_str(),
- build_options.c_str(),
- kernel_file_escaped.c_str(),
- clbin_escaped.c_str()));
-
- /* Limit number of concurrent processes compiling. */
- static thread_counting_semaphore semaphore(opencl_compile_process_limit());
- semaphore.acquire();
-
- /* Compile. */
- const double starttime = time_dt();
- add_log(string("Cycles: compiling OpenCL program ") + program_name + "...", false);
- add_log(string("Build flags: ") + kernel_build_options, true);
- const bool success = system_call_self(args);
- const double elapsed = time_dt() - starttime;
-
- semaphore.release();
-
- if (!success || !path_exists(clbin)) {
- return false;
- }
-
- add_log(
- string_printf("Kernel compilation of %s finished in %.2lfs.", program_name.c_str(), elapsed),
- false);
-
- return load_binary(clbin);
-}
-
-/* Compile an OpenCL kernel. This function is called from the _cycles Python
- * module to compile kernels. Parameters must match the function above. */
-bool device_opencl_compile_kernel(const vector<string> &parameters)
-{
- int device_platform_id = std::stoi(parameters[0]);
- const string &device_name = parameters[1];
- const string &platform_name = parameters[2];
- const string &build_options = parameters[3];
- const string &kernel_file = parameters[4];
- const string &binary_path = parameters[5];
-
- if (clewInit() != CLEW_SUCCESS) {
- return false;
- }
-
- vector<OpenCLPlatformDevice> usable_devices;
- OpenCLInfo::get_usable_devices(&usable_devices);
- if (device_platform_id >= usable_devices.size()) {
- return false;
- }
-
- OpenCLPlatformDevice &platform_device = usable_devices[device_platform_id];
- if (platform_device.platform_name != platform_name ||
- platform_device.device_name != device_name) {
- return false;
- }
-
- cl_platform_id platform = platform_device.platform_id;
- cl_device_id device = platform_device.device_id;
- const cl_context_properties context_props[] = {
- CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0, 0};
-
- cl_int err;
- cl_context context = clCreateContext(context_props, 1, &device, NULL, NULL, &err);
- if (err != CL_SUCCESS) {
- return false;
- }
-
- string source = get_program_source(kernel_file);
- size_t source_len = source.size();
- const char *source_str = source.c_str();
- cl_program program = clCreateProgramWithSource(context, 1, &source_str, &source_len, &err);
- bool result = false;
-
- if (err == CL_SUCCESS) {
- err = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL);
-
- if (err == CL_SUCCESS) {
- size_t size = 0;
- clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL);
- if (size > 0) {
- vector<uint8_t> binary(size);
- uint8_t *bytes = &binary[0];
- clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(uint8_t *), &bytes, NULL);
- result = path_write_binary(binary_path, binary);
- }
- }
- clReleaseProgram(program);
- }
-
- clReleaseContext(context);
-
- return result;
-}
-
-bool OpenCLDevice::OpenCLProgram::load_binary(const string &clbin, const string *debug_src)
-{
- /* read binary into memory */
- vector<uint8_t> binary;
-
- if (!path_read_binary(clbin, binary)) {
- add_error(string_printf("OpenCL failed to read cached binary %s.", clbin.c_str()));
- return false;
- }
-
- /* create program */
- cl_int status, ciErr;
- size_t size = binary.size();
- const uint8_t *bytes = &binary[0];
-
- program = clCreateProgramWithBinary(
- device->cxContext, 1, &device->cdDevice, &size, &bytes, &status, &ciErr);
-
- if (status != CL_SUCCESS || ciErr != CL_SUCCESS) {
- add_error(string("OpenCL failed create program from cached binary ") + clbin + ": " +
- clewErrorString(status) + " " + clewErrorString(ciErr));
- return false;
- }
-
- if (!build_kernel(debug_src))
- return false;
-
- return true;
-}
-
-bool OpenCLDevice::OpenCLProgram::save_binary(const string &clbin)
-{
- size_t size = 0;
- clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL);
-
- if (!size)
- return false;
-
- vector<uint8_t> binary(size);
- uint8_t *bytes = &binary[0];
-
- clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(uint8_t *), &bytes, NULL);
-
- return path_write_binary(clbin, binary);
-}
-
-bool OpenCLDevice::OpenCLProgram::load()
-{
- loaded = false;
- string device_md5 = device->device_md5_hash(kernel_build_options);
-
- /* Try to use cached kernel. */
- thread_scoped_lock cache_locker;
- ustring cache_key(program_name + device_md5);
- program = device->load_cached_kernel(cache_key, cache_locker);
- if (!program) {
- add_log(string("OpenCL program ") + program_name + " not found in cache.", true);
-
- /* need to create source to get md5 */
- string source = get_program_source(kernel_file);
-
- string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" +
- util_md5_string(source);
- basename = path_cache_get(path_join("kernels", basename));
- string clbin = basename + ".clbin";
-
- /* If a binary kernel already exists, try to use it. */
- if (path_exists(clbin) && load_binary(clbin)) {
- /* Kernel loaded from binary, nothing to do. */
- add_log(string("Loaded program from ") + clbin + ".", true);
-
- /* Cache the program. */
- device->store_cached_kernel(program, cache_key, cache_locker);
- }
- else {
- add_log(string("OpenCL program ") + program_name + " not found on disk.", true);
- cache_locker.unlock();
- }
- }
-
- if (program) {
- create_kernels();
- loaded = true;
- needs_compiling = false;
- }
-
- return loaded;
-}
-
-void OpenCLDevice::OpenCLProgram::compile()
-{
- assert(device);
-
- string device_md5 = device->device_md5_hash(kernel_build_options);
-
- /* Try to use cached kernel. */
- thread_scoped_lock cache_locker;
- ustring cache_key(program_name + device_md5);
- program = device->load_cached_kernel(cache_key, cache_locker);
-
- if (!program) {
-
- add_log(string("OpenCL program ") + program_name + " not found in cache.", true);
-
- /* need to create source to get md5 */
- string source = get_program_source(kernel_file);
-
- string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" +
- util_md5_string(source);
- basename = path_cache_get(path_join("kernels", basename));
- string clbin = basename + ".clbin";
-
- /* path to preprocessed source for debugging */
- string clsrc, *debug_src = NULL;
-
- if (OpenCLInfo::use_debug()) {
- clsrc = basename + ".cl";
- debug_src = &clsrc;
- }
-
- if (DebugFlags().running_inside_blender && compile_separate(clbin)) {
- add_log(string("Built and loaded program from ") + clbin + ".", true);
- loaded = true;
- }
- else {
- if (DebugFlags().running_inside_blender) {
- add_log(string("Separate-process building of ") + clbin +
- " failed, will fall back to regular building.",
- true);
- }
-
- /* If it does not exist, or loading the binary failed, compile the kernel. */
- if (!compile_kernel(debug_src)) {
- needs_compiling = false;
- return;
- }
-
- /* Save binary for reuse. */
- if (!save_binary(clbin)) {
- add_log(string("Saving compiled OpenCL kernel to ") + clbin + " failed!", true);
- }
- }
-
- /* Cache the program. */
- device->store_cached_kernel(program, cache_key, cache_locker);
- }
-
- create_kernels();
- needs_compiling = false;
- loaded = true;
-}
-
-void OpenCLDevice::OpenCLProgram::create_kernels()
-{
- for (map<ustring, cl_kernel>::iterator kernel = kernels.begin(); kernel != kernels.end();
- ++kernel) {
- assert(kernel->second == NULL);
- cl_int ciErr;
- string name = "kernel_ocl_" + kernel->first.string();
- kernel->second = clCreateKernel(program, name.c_str(), &ciErr);
- if (device->opencl_error(ciErr)) {
- add_error(string("Error getting kernel ") + name + " from program " + program_name + ": " +
- clewErrorString(ciErr));
- return;
- }
- }
-}
-
-bool OpenCLDevice::OpenCLProgram::wait_for_availability()
-{
- add_log(string("Waiting for availability of ") + program_name + ".", true);
- while (needs_compiling) {
- time_sleep(0.1);
- }
- return loaded;
-}
-
-void OpenCLDevice::OpenCLProgram::report_error()
-{
- /* If loaded is true, there was no error. */
- if (loaded)
- return;
- /* if use_stdout is true, the error was already reported. */
- if (use_stdout)
- return;
-
- cerr << error_msg << endl;
- if (!compile_output.empty()) {
- cerr << "OpenCL kernel build output for " << program_name << ":" << endl;
- cerr << compile_output << endl;
- }
-}
-
-cl_kernel OpenCLDevice::OpenCLProgram::operator()()
-{
- assert(kernels.size() == 1);
- return kernels.begin()->second;
-}
-
-cl_kernel OpenCLDevice::OpenCLProgram::operator()(ustring name)
-{
- assert(kernels.count(name));
- return kernels[name];
-}
-
-cl_device_type OpenCLInfo::device_type()
-{
- switch (DebugFlags().opencl.device_type) {
- case DebugFlags::OpenCL::DEVICE_NONE:
- return 0;
- case DebugFlags::OpenCL::DEVICE_ALL:
- return CL_DEVICE_TYPE_ALL;
- case DebugFlags::OpenCL::DEVICE_DEFAULT:
- return CL_DEVICE_TYPE_DEFAULT;
- case DebugFlags::OpenCL::DEVICE_CPU:
- return CL_DEVICE_TYPE_CPU;
- case DebugFlags::OpenCL::DEVICE_GPU:
- return CL_DEVICE_TYPE_GPU;
- case DebugFlags::OpenCL::DEVICE_ACCELERATOR:
- return CL_DEVICE_TYPE_ACCELERATOR;
- default:
- return CL_DEVICE_TYPE_ALL;
- }
-}
-
-bool OpenCLInfo::use_debug()
-{
- return DebugFlags().opencl.debug;
-}
-
-bool OpenCLInfo::device_supported(const string &platform_name, const cl_device_id device_id)
-{
- cl_device_type device_type;
- if (!get_device_type(device_id, &device_type)) {
- return false;
- }
- string device_name;
- if (!get_device_name(device_id, &device_name)) {
- return false;
- }
-
- int driver_major = 0;
- int driver_minor = 0;
- if (!get_driver_version(device_id, &driver_major, &driver_minor)) {
- return false;
- }
- VLOG(3) << "OpenCL driver version " << driver_major << "." << driver_minor;
-
- if (getenv("CYCLES_OPENCL_TEST")) {
- return true;
- }
-
- /* Allow Intel GPUs on Intel OpenCL platform. */
- if (platform_name.find("Intel") != string::npos) {
- if (device_type != CL_DEVICE_TYPE_GPU) {
- /* OpenCL on Intel CPU is not an officially supported configuration.
- * Use hybrid CPU+GPU rendering to utilize both GPU and CPU. */
- return false;
- }
-
-# ifdef __APPLE__
-    /* Apple uses its own framework, which can also expose Iris GPUs through the AMD
-     * framework. This isn't a supported configuration. */
- return false;
-# else
- if (device_name.find("Iris") != string::npos || device_name.find("Xe") != string::npos) {
- return true;
- }
-# endif
- }
-
- if (platform_name == "AMD Accelerated Parallel Processing" &&
- device_type == CL_DEVICE_TYPE_GPU) {
- if (driver_major < 2236) {
- VLOG(1) << "AMD driver version " << driver_major << "." << driver_minor << " not supported.";
- return false;
- }
- const char *blacklist[] = {/* GCN 1 */
- "Tahiti",
- "Pitcairn",
- "Capeverde",
- "Oland",
- "Hainan",
- NULL};
- for (int i = 0; blacklist[i] != NULL; i++) {
- if (device_name == blacklist[i]) {
- VLOG(1) << "AMD device " << device_name << " not supported";
- return false;
- }
- }
- return true;
- }
- if (platform_name == "Apple" && device_type == CL_DEVICE_TYPE_GPU) {
- return false;
- }
- return false;
-}
-
-bool OpenCLInfo::platform_version_check(cl_platform_id platform, string *error)
-{
- const int req_major = 1, req_minor = 1;
- int major, minor;
- char version[256];
- clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(version), &version, NULL);
- if (sscanf(version, "OpenCL %d.%d", &major, &minor) < 2) {
- if (error != NULL) {
- *error = string_printf("OpenCL: failed to parse platform version string (%s).", version);
- }
- return false;
- }
- if (!((major == req_major && minor >= req_minor) || (major > req_major))) {
- if (error != NULL) {
- *error = string_printf(
- "OpenCL: platform version 1.1 or later required, found %d.%d", major, minor);
- }
- return false;
- }
- if (error != NULL) {
- *error = "";
- }
- return true;
-}
-
-bool OpenCLInfo::get_device_version(cl_device_id device, int *r_major, int *r_minor, string *error)
-{
- char version[256];
- clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, sizeof(version), &version, NULL);
- if (sscanf(version, "OpenCL C %d.%d", r_major, r_minor) < 2) {
- if (error != NULL) {
- *error = string_printf("OpenCL: failed to parse OpenCL C version string (%s).", version);
- }
- return false;
- }
- if (error != NULL) {
- *error = "";
- }
- return true;
-}
-
-bool OpenCLInfo::device_version_check(cl_device_id device, string *error)
-{
- const int req_major = 1, req_minor = 1;
- int major, minor;
- if (!get_device_version(device, &major, &minor, error)) {
- return false;
- }
-
- if (!((major == req_major && minor >= req_minor) || (major > req_major))) {
- if (error != NULL) {
- *error = string_printf("OpenCL: C version 1.1 or later required, found %d.%d", major, minor);
- }
- return false;
- }
- if (error != NULL) {
- *error = "";
- }
- return true;
-}
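/* Editorial sketch (not part of this patch): the two checks above enforce the same minimum
 * version at different levels. platform_version_check() parses the platform's "OpenCL %d.%d"
 * string and device_version_check() parses the device's "OpenCL C %d.%d" string, both
 * requiring 1.1 or newer. A hypothetical combined helper, assuming the static OpenCLInfo
 * helpers declared in this file, could look like: */
static bool opencl_versions_ok(cl_platform_id platform_id, cl_device_id device_id, string *error)
{
  return OpenCLInfo::platform_version_check(platform_id, error) &&
         OpenCLInfo::device_version_check(device_id, error);
}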
-
-string OpenCLInfo::get_hardware_id(const string &platform_name, cl_device_id device_id)
-{
- if (platform_name == "AMD Accelerated Parallel Processing" || platform_name == "Apple") {
- /* Use cl_amd_device_topology extension. */
- cl_char topology[24];
- if (clGetDeviceInfo(device_id, 0x4037, sizeof(topology), topology, NULL) == CL_SUCCESS &&
- topology[0] == 1) {
- return string_printf("%02x:%02x.%01x",
- (unsigned int)topology[21],
- (unsigned int)topology[22],
- (unsigned int)topology[23]);
- }
- }
- else if (platform_name == "NVIDIA CUDA") {
- /* Use two undocumented options of the cl_nv_device_attribute_query extension. */
- cl_int bus_id, slot_id;
- if (clGetDeviceInfo(device_id, 0x4008, sizeof(cl_int), &bus_id, NULL) == CL_SUCCESS &&
- clGetDeviceInfo(device_id, 0x4009, sizeof(cl_int), &slot_id, NULL) == CL_SUCCESS) {
- return string_printf("%02x:%02x.%01x",
- (unsigned int)(bus_id),
- (unsigned int)(slot_id >> 3),
- (unsigned int)(slot_id & 0x7));
- }
- }
- /* No general way to get a hardware ID from OpenCL => give up. */
- return "";
-}
-
-void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices)
-{
- const cl_device_type device_type = OpenCLInfo::device_type();
- static bool first_time = true;
-# define FIRST_VLOG(severity) \
- if (first_time) \
- VLOG(severity)
-
- usable_devices->clear();
-
- if (device_type == 0) {
- FIRST_VLOG(2) << "OpenCL devices are forced to be disabled.";
- first_time = false;
- return;
- }
-
- cl_int error;
- vector<cl_device_id> device_ids;
- vector<cl_platform_id> platform_ids;
-
- /* Get platforms. */
- if (!get_platforms(&platform_ids, &error)) {
- FIRST_VLOG(2) << "Error fetching platforms:" << string(clewErrorString(error));
- first_time = false;
- return;
- }
- if (platform_ids.size() == 0) {
- FIRST_VLOG(2) << "No OpenCL platforms were found.";
- first_time = false;
- return;
- }
- /* Devices are numbered consecutively across platforms. */
- for (int platform = 0; platform < platform_ids.size(); platform++) {
- cl_platform_id platform_id = platform_ids[platform];
- string platform_name;
- if (!get_platform_name(platform_id, &platform_name)) {
- FIRST_VLOG(2) << "Failed to get platform name, ignoring.";
- continue;
- }
- FIRST_VLOG(2) << "Enumerating devices for platform " << platform_name << ".";
- if (!platform_version_check(platform_id)) {
- FIRST_VLOG(2) << "Ignoring platform " << platform_name
-                    << " due to too old OpenCL platform version.";
- continue;
- }
- if (!get_platform_devices(platform_id, device_type, &device_ids, &error)) {
- FIRST_VLOG(2) << "Ignoring platform " << platform_name
-                    << ", failed to fetch devices: " << string(clewErrorString(error));
- continue;
- }
- if (device_ids.size() == 0) {
- FIRST_VLOG(2) << "Ignoring platform " << platform_name << ", it has no devices.";
- continue;
- }
- for (int num = 0; num < device_ids.size(); num++) {
- const cl_device_id device_id = device_ids[num];
- string device_name;
- if (!get_device_name(device_id, &device_name, &error)) {
- FIRST_VLOG(2) << "Failed to fetch device name: " << string(clewErrorString(error))
- << ", ignoring.";
- continue;
- }
- if (!device_version_check(device_id)) {
-        FIRST_VLOG(2) << "Ignoring device " << device_name << " due to too old OpenCL C version.";
- continue;
- }
- if (device_supported(platform_name, device_id)) {
- cl_device_type device_type;
- if (!get_device_type(device_id, &device_type, &error)) {
- FIRST_VLOG(2) << "Ignoring device " << device_name
- << ", failed to fetch device type:" << string(clewErrorString(error));
- continue;
- }
- string readable_device_name = get_readable_device_name(device_id);
- if (readable_device_name != device_name) {
- FIRST_VLOG(2) << "Using more readable device name: " << readable_device_name;
- }
- FIRST_VLOG(2) << "Adding new device " << readable_device_name << ".";
- string hardware_id = get_hardware_id(platform_name, device_id);
- string device_extensions = get_device_extensions(device_id);
- usable_devices->push_back(OpenCLPlatformDevice(platform_id,
- platform_name,
- device_id,
- device_type,
- readable_device_name,
- hardware_id,
- device_extensions));
- }
- else {
- FIRST_VLOG(2) << "Ignoring device " << device_name << ", not officially supported yet.";
- }
- }
- }
- first_time = false;
-}
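/* Editorial usage sketch (not part of this patch): enumerating the devices gathered by
 * get_usable_devices() above. The helper name and local are hypothetical, for illustration. */
static void log_usable_opencl_devices()
{
  vector<OpenCLPlatformDevice> usable;
  OpenCLInfo::get_usable_devices(&usable);
  VLOG(1) << "Found " << usable.size() << " usable OpenCL device(s) across all platforms.";
}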
-
-bool OpenCLInfo::get_platforms(vector<cl_platform_id> *platform_ids, cl_int *error)
-{
- /* Reset from possible previous state. */
- platform_ids->resize(0);
- cl_uint num_platforms;
- if (!get_num_platforms(&num_platforms, error)) {
- return false;
- }
- /* Get actual platforms. */
- cl_int err;
- platform_ids->resize(num_platforms);
- if ((err = clGetPlatformIDs(num_platforms, &platform_ids->at(0), NULL)) != CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- return true;
-}
-
-vector<cl_platform_id> OpenCLInfo::get_platforms()
-{
- vector<cl_platform_id> platform_ids;
- get_platforms(&platform_ids);
- return platform_ids;
-}
-
-bool OpenCLInfo::get_num_platforms(cl_uint *num_platforms, cl_int *error)
-{
- cl_int err;
- if ((err = clGetPlatformIDs(0, NULL, num_platforms)) != CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- *num_platforms = 0;
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- return true;
-}
-
-cl_uint OpenCLInfo::get_num_platforms()
-{
- cl_uint num_platforms;
- if (!get_num_platforms(&num_platforms)) {
- return 0;
- }
- return num_platforms;
-}
-
-bool OpenCLInfo::get_platform_name(cl_platform_id platform_id, string *platform_name)
-{
- char buffer[256];
- if (clGetPlatformInfo(platform_id, CL_PLATFORM_NAME, sizeof(buffer), &buffer, NULL) !=
- CL_SUCCESS) {
- *platform_name = "";
- return false;
- }
- *platform_name = buffer;
- return true;
-}
-
-string OpenCLInfo::get_platform_name(cl_platform_id platform_id)
-{
- string platform_name;
- if (!get_platform_name(platform_id, &platform_name)) {
- return "";
- }
- return platform_name;
-}
-
-bool OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id,
- cl_device_type device_type,
- cl_uint *num_devices,
- cl_int *error)
-{
- cl_int err;
- if ((err = clGetDeviceIDs(platform_id, device_type, 0, NULL, num_devices)) != CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- *num_devices = 0;
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- return true;
-}
-
-cl_uint OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id,
- cl_device_type device_type)
-{
- cl_uint num_devices;
- if (!get_num_platform_devices(platform_id, device_type, &num_devices)) {
- return 0;
- }
- return num_devices;
-}
-
-bool OpenCLInfo::get_platform_devices(cl_platform_id platform_id,
- cl_device_type device_type,
- vector<cl_device_id> *device_ids,
- cl_int *error)
-{
- /* Reset from possible previous state. */
- device_ids->resize(0);
- /* Get number of devices to pre-allocate memory. */
- cl_uint num_devices;
- if (!get_num_platform_devices(platform_id, device_type, &num_devices, error)) {
- return false;
- }
- /* Get actual device list. */
- device_ids->resize(num_devices);
- cl_int err;
- if ((err = clGetDeviceIDs(platform_id, device_type, num_devices, &device_ids->at(0), NULL)) !=
- CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- return true;
-}
-
-vector<cl_device_id> OpenCLInfo::get_platform_devices(cl_platform_id platform_id,
- cl_device_type device_type)
-{
- vector<cl_device_id> devices;
- get_platform_devices(platform_id, device_type, &devices);
- return devices;
-}
-
-bool OpenCLInfo::get_device_name(cl_device_id device_id, string *device_name, cl_int *error)
-{
- char buffer[1024];
- cl_int err;
- if ((err = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(buffer), &buffer, NULL)) !=
- CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- *device_name = "";
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- *device_name = buffer;
- return true;
-}
-
-string OpenCLInfo::get_device_name(cl_device_id device_id)
-{
- string device_name;
- if (!get_device_name(device_id, &device_name)) {
- return "";
- }
- return device_name;
-}
-
-bool OpenCLInfo::get_device_extensions(cl_device_id device_id,
- string *device_extensions,
- cl_int *error)
-{
- size_t extension_length = 0;
- cl_int err;
- /* Determine the size of the extension string. */
- if ((err = clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, 0, 0, &extension_length)) !=
- CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- *device_extensions = "";
- return false;
- }
- vector<char> buffer(extension_length);
- if ((err = clGetDeviceInfo(
- device_id, CL_DEVICE_EXTENSIONS, extension_length, buffer.data(), NULL)) !=
- CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- *device_extensions = "";
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- *device_extensions = string(buffer.data());
- return true;
-}
-
-string OpenCLInfo::get_device_extensions(cl_device_id device_id)
-{
- string device_extensions;
- if (!get_device_extensions(device_id, &device_extensions)) {
- return "";
- }
- return device_extensions;
-}
-
-bool OpenCLInfo::get_device_type(cl_device_id device_id,
- cl_device_type *device_type,
- cl_int *error)
-{
- cl_int err;
- if ((err = clGetDeviceInfo(
- device_id, CL_DEVICE_TYPE, sizeof(cl_device_type), device_type, NULL)) != CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- *device_type = 0;
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- return true;
-}
-
-cl_device_type OpenCLInfo::get_device_type(cl_device_id device_id)
-{
- cl_device_type device_type;
- if (!get_device_type(device_id, &device_type)) {
- return 0;
- }
- return device_type;
-}
-
-string OpenCLInfo::get_readable_device_name(cl_device_id device_id)
-{
- string name = "";
- char board_name[1024];
- size_t length = 0;
- if (clGetDeviceInfo(
- device_id, CL_DEVICE_BOARD_NAME_AMD, sizeof(board_name), &board_name, &length) ==
- CL_SUCCESS) {
- if (length != 0 && board_name[0] != '\0') {
- name = board_name;
- }
- }
-
- /* Fallback to standard device name API. */
- if (name.empty()) {
- name = get_device_name(device_id);
- }
-
-  /* Special exception for AMD Vega: we need to be able to tell
-   * Vega 56 and 64 apart. */
- if (name == "Radeon RX Vega") {
- cl_int max_compute_units = 0;
- if (clGetDeviceInfo(device_id,
- CL_DEVICE_MAX_COMPUTE_UNITS,
- sizeof(max_compute_units),
- &max_compute_units,
- NULL) == CL_SUCCESS) {
- name += " " + to_string(max_compute_units);
- }
- }
-
- /* Distinguish from our native CPU device. */
- if (get_device_type(device_id) & CL_DEVICE_TYPE_CPU) {
- name += " (OpenCL)";
- }
-
- return name;
-}
-
-bool OpenCLInfo::get_driver_version(cl_device_id device_id, int *major, int *minor, cl_int *error)
-{
- char buffer[1024];
- cl_int err;
- if ((err = clGetDeviceInfo(device_id, CL_DRIVER_VERSION, sizeof(buffer), &buffer, NULL)) !=
- CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- if (sscanf(buffer, "%d.%d", major, minor) < 2) {
- VLOG(1) << string_printf("OpenCL: failed to parse driver version string (%s).", buffer);
- return false;
- }
- return true;
-}
-
-int OpenCLInfo::mem_sub_ptr_alignment(cl_device_id device_id)
-{
- int base_align_bits;
- if (clGetDeviceInfo(
- device_id, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(int), &base_align_bits, NULL) ==
- CL_SUCCESS) {
- return base_align_bits / 8;
- }
- return 1;
-}
-
-CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/device/optix/device.cpp b/intern/cycles/device/optix/device.cpp
new file mode 100644
index 00000000000..13f23bd229a
--- /dev/null
+++ b/intern/cycles/device/optix/device.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright 2019, NVIDIA Corporation.
+ * Copyright 2019, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/optix/device.h"
+
+#include "device/cuda/device.h"
+#include "device/optix/device_impl.h"
+#include "util/util_logging.h"
+
+#ifdef WITH_OPTIX
+# include <optix_function_table_definition.h>
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+bool device_optix_init()
+{
+#ifdef WITH_OPTIX
+ if (g_optixFunctionTable.optixDeviceContextCreate != NULL) {
+ /* Already initialized function table. */
+ return true;
+ }
+
+ /* Need to initialize CUDA as well. */
+ if (!device_cuda_init()) {
+ return false;
+ }
+
+ const OptixResult result = optixInit();
+
+ if (result == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) {
+ VLOG(1) << "OptiX initialization failed because the installed NVIDIA driver is too old. "
+ "Please update to the latest driver first!";
+ return false;
+ }
+ else if (result != OPTIX_SUCCESS) {
+ VLOG(1) << "OptiX initialization failed with error code " << (unsigned int)result;
+ return false;
+ }
+
+ /* Loaded OptiX successfully! */
+ return true;
+#else
+ return false;
+#endif
+}
+
+void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices)
+{
+#ifdef WITH_OPTIX
+ devices.reserve(cuda_devices.size());
+
+ /* Simply add all supported CUDA devices as OptiX devices again. */
+ for (DeviceInfo info : cuda_devices) {
+ assert(info.type == DEVICE_CUDA);
+
+ int major;
+ cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, info.num);
+ if (major < 5) {
+ /* Only Maxwell and up are supported by OptiX. */
+ continue;
+ }
+
+ info.type = DEVICE_OPTIX;
+ info.id += "_OptiX";
+ info.denoisers |= DENOISER_OPTIX;
+
+ devices.push_back(info);
+ }
+#else
+ (void)cuda_devices;
+ (void)devices;
+#endif
+}
+
+Device *device_optix_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+{
+#ifdef WITH_OPTIX
+ return new OptiXDevice(info, stats, profiler);
+#else
+ (void)info;
+ (void)stats;
+ (void)profiler;
+
+ LOG(FATAL) << "Request to create OptiX device without compiled-in support. Should never happen.";
+
+ return nullptr;
+#endif
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/optix/device.h b/intern/cycles/device/optix/device.h
new file mode 100644
index 00000000000..29fa729c2e4
--- /dev/null
+++ b/intern/cycles/device/optix/device.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "util/util_string.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class DeviceInfo;
+class Profiler;
+class Stats;
+
+bool device_optix_init();
+
+Device *device_optix_create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices);
+
+CCL_NAMESPACE_END
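The header above exposes the usual per-backend triple: an init call that loads the OptiX function table (and CUDA), an info call that derives OptiX entries from the already-enumerated CUDA devices, and a create call. The following is a minimal editorial sketch of the expected call order; it is not part of the patch, the helper name is hypothetical, and the real wiring lives in device/device.cpp, which this hunk does not show.

static void enumerate_and_create_optix_devices(const vector<DeviceInfo> &cuda_device_infos,
                                               Stats &stats,
                                               Profiler &profiler,
                                               vector<Device *> &result)
{
  /* Load the OptiX function table; fails if the driver is too old or OptiX support
   * was not compiled in. */
  if (!device_optix_init()) {
    return;
  }
  /* Every supported CUDA device (Maxwell or newer) is re-announced as an OptiX device. */
  vector<DeviceInfo> optix_infos;
  device_optix_info(cuda_device_infos, optix_infos);
  for (const DeviceInfo &info : optix_infos) {
    result.push_back(device_optix_create(info, stats, profiler));
  }
}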
diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp
new file mode 100644
index 00000000000..b54d423a183
--- /dev/null
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -0,0 +1,1573 @@
+/*
+ * Copyright 2019, NVIDIA Corporation.
+ * Copyright 2019, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_OPTIX
+
+# include "device/optix/device_impl.h"
+
+# include "bvh/bvh.h"
+# include "bvh/bvh_optix.h"
+# include "integrator/pass_accessor_gpu.h"
+# include "render/buffers.h"
+# include "render/hair.h"
+# include "render/mesh.h"
+# include "render/object.h"
+# include "render/pass.h"
+# include "render/scene.h"
+
+# include "util/util_debug.h"
+# include "util/util_logging.h"
+# include "util/util_md5.h"
+# include "util/util_path.h"
+# include "util/util_progress.h"
+# include "util/util_time.h"
+
+# undef __KERNEL_CPU__
+# define __KERNEL_OPTIX__
+# include "kernel/device/optix/globals.h"
+
+CCL_NAMESPACE_BEGIN
+
+OptiXDevice::Denoiser::Denoiser(OptiXDevice *device)
+ : device(device), queue(device), state(device, "__denoiser_state")
+{
+}
+
+OptiXDevice::Denoiser::~Denoiser()
+{
+ const CUDAContextScope scope(device);
+ if (optix_denoiser != nullptr) {
+ optixDenoiserDestroy(optix_denoiser);
+ }
+}
+
+OptiXDevice::OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+ : CUDADevice(info, stats, profiler),
+ sbt_data(this, "__sbt", MEM_READ_ONLY),
+ launch_params(this, "__params"),
+ denoiser_(this)
+{
+ /* Make the CUDA context current. */
+ if (!cuContext) {
+ /* Do not initialize if CUDA context creation failed already. */
+ return;
+ }
+ const CUDAContextScope scope(this);
+
+ /* Create OptiX context for this device. */
+ OptixDeviceContextOptions options = {};
+# ifdef WITH_CYCLES_LOGGING
+ options.logCallbackLevel = 4; /* Fatal = 1, Error = 2, Warning = 3, Print = 4. */
+ options.logCallbackFunction = [](unsigned int level, const char *, const char *message, void *) {
+ switch (level) {
+ case 1:
+ LOG_IF(FATAL, VLOG_IS_ON(1)) << message;
+ break;
+ case 2:
+ LOG_IF(ERROR, VLOG_IS_ON(1)) << message;
+ break;
+ case 3:
+ LOG_IF(WARNING, VLOG_IS_ON(1)) << message;
+ break;
+ case 4:
+ LOG_IF(INFO, VLOG_IS_ON(1)) << message;
+ break;
+ }
+ };
+# endif
+ if (DebugFlags().optix.use_debug) {
+ options.validationMode = OPTIX_DEVICE_CONTEXT_VALIDATION_MODE_ALL;
+ }
+ optix_assert(optixDeviceContextCreate(cuContext, &options, &context));
+# ifdef WITH_CYCLES_LOGGING
+ optix_assert(optixDeviceContextSetLogCallback(
+ context, options.logCallbackFunction, options.logCallbackData, options.logCallbackLevel));
+# endif
+
+ /* Fix weird compiler bug that assigns wrong size. */
+ launch_params.data_elements = sizeof(KernelParamsOptiX);
+
+ /* Allocate launch parameter buffer memory on device. */
+ launch_params.alloc_to_device(1);
+}
+
+OptiXDevice::~OptiXDevice()
+{
+ /* Make CUDA context current. */
+ const CUDAContextScope scope(this);
+
+ free_bvh_memory_delayed();
+
+ sbt_data.free();
+ texture_info.free();
+ launch_params.free();
+
+ /* Unload modules. */
+ if (optix_module != NULL) {
+ optixModuleDestroy(optix_module);
+ }
+ for (unsigned int i = 0; i < 2; ++i) {
+ if (builtin_modules[i] != NULL) {
+ optixModuleDestroy(builtin_modules[i]);
+ }
+ }
+ for (unsigned int i = 0; i < NUM_PIPELINES; ++i) {
+ if (pipelines[i] != NULL) {
+ optixPipelineDestroy(pipelines[i]);
+ }
+ }
+
+ optixDeviceContextDestroy(context);
+}
+
+unique_ptr<DeviceQueue> OptiXDevice::gpu_queue_create()
+{
+ return make_unique<OptiXDeviceQueue>(this);
+}
+
+BVHLayoutMask OptiXDevice::get_bvh_layout_mask() const
+{
+ /* OptiX has its own internal acceleration structure format. */
+ return BVH_LAYOUT_OPTIX;
+}
+
+string OptiXDevice::compile_kernel_get_common_cflags(const uint kernel_features)
+{
+ string common_cflags = CUDADevice::compile_kernel_get_common_cflags(kernel_features);
+
+ /* Add OptiX SDK include directory to include paths. */
+ const char *optix_sdk_path = getenv("OPTIX_ROOT_DIR");
+ if (optix_sdk_path) {
+ common_cflags += string_printf(" -I\"%s/include\"", optix_sdk_path);
+ }
+
+ /* Specialization for shader raytracing. */
+ if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
+ common_cflags += " --keep-device-functions";
+ }
+
+ return common_cflags;
+}
+
+bool OptiXDevice::load_kernels(const uint kernel_features)
+{
+ if (have_error()) {
+ /* Abort early if context creation failed already. */
+ return false;
+ }
+
+ /* Load CUDA modules because we need some of the utility kernels. */
+ if (!CUDADevice::load_kernels(kernel_features)) {
+ return false;
+ }
+
+ /* Skip creating OptiX module if only doing denoising. */
+ if (!(kernel_features & (KERNEL_FEATURE_PATH_TRACING | KERNEL_FEATURE_BAKING))) {
+ return true;
+ }
+
+ const CUDAContextScope scope(this);
+
+ /* Unload existing OptiX module and pipelines first. */
+ if (optix_module != NULL) {
+ optixModuleDestroy(optix_module);
+ optix_module = NULL;
+ }
+ for (unsigned int i = 0; i < 2; ++i) {
+ if (builtin_modules[i] != NULL) {
+ optixModuleDestroy(builtin_modules[i]);
+ builtin_modules[i] = NULL;
+ }
+ }
+ for (unsigned int i = 0; i < NUM_PIPELINES; ++i) {
+ if (pipelines[i] != NULL) {
+ optixPipelineDestroy(pipelines[i]);
+ pipelines[i] = NULL;
+ }
+ }
+
+ OptixModuleCompileOptions module_options = {};
+ module_options.maxRegisterCount = 0; /* Do not set an explicit register limit. */
+
+ if (DebugFlags().optix.use_debug) {
+ module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_0;
+ module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
+ }
+ else {
+ module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3;
+ module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
+ }
+
+ module_options.boundValues = nullptr;
+ module_options.numBoundValues = 0;
+
+ OptixPipelineCompileOptions pipeline_options = {};
+ /* Default to no motion blur and two-level graph, since it is the fastest option. */
+ pipeline_options.usesMotionBlur = false;
+ pipeline_options.traversableGraphFlags =
+ OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING;
+ pipeline_options.numPayloadValues = 6;
+ pipeline_options.numAttributeValues = 2; /* u, v */
+ pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE;
+ pipeline_options.pipelineLaunchParamsVariableName = "__params"; /* See globals.h */
+
+ pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE;
+ if (kernel_features & KERNEL_FEATURE_HAIR) {
+ if (kernel_features & KERNEL_FEATURE_HAIR_THICK) {
+ pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE;
+ }
+ else
+ pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM;
+ }
+
+  /* Keep track of whether motion blur is enabled, so as to enable/disable motion in BVH builds.
+   * This is necessary since objects may be reported to have motion if the Vector pass is
+   * active, but may still need to be rendered without motion blur if that pass isn't active. */
+ motion_blur = (kernel_features & KERNEL_FEATURE_OBJECT_MOTION) != 0;
+
+ if (motion_blur) {
+ pipeline_options.usesMotionBlur = true;
+ /* Motion blur can insert motion transforms into the traversal graph.
+ * It is no longer a two-level graph then, so need to set flags to allow any configuration. */
+ pipeline_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY;
+ }
+
+ { /* Load and compile PTX module with OptiX kernels. */
+ string ptx_data, ptx_filename = path_get((kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ?
+ "lib/kernel_optix_shader_raytrace.ptx" :
+ "lib/kernel_optix.ptx");
+ if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
+ if (!getenv("OPTIX_ROOT_DIR")) {
+ set_error(
+ "Missing OPTIX_ROOT_DIR environment variable (which must be set with the path to "
+          "the OptiX SDK to be able to compile OptiX kernels on demand).");
+ return false;
+ }
+ ptx_filename = compile_kernel(
+ kernel_features,
+ (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ? "kernel_shader_raytrace" : "kernel",
+ "optix",
+ true);
+ }
+ if (ptx_filename.empty() || !path_read_text(ptx_filename, ptx_data)) {
+ set_error(string_printf("Failed to load OptiX kernel from '%s'", ptx_filename.c_str()));
+ return false;
+ }
+
+ const OptixResult result = optixModuleCreateFromPTX(context,
+ &module_options,
+ &pipeline_options,
+ ptx_data.data(),
+ ptx_data.size(),
+ nullptr,
+ 0,
+ &optix_module);
+ if (result != OPTIX_SUCCESS) {
+ set_error(string_printf("Failed to load OptiX kernel from '%s' (%s)",
+ ptx_filename.c_str(),
+ optixGetErrorName(result)));
+ return false;
+ }
+ }
+
+ /* Create program groups. */
+ OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {};
+ OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {};
+ OptixProgramGroupOptions group_options = {}; /* There are no options currently. */
+ group_descs[PG_RGEN_INTERSECT_CLOSEST].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN_INTERSECT_CLOSEST].raygen.module = optix_module;
+ group_descs[PG_RGEN_INTERSECT_CLOSEST].raygen.entryFunctionName =
+ "__raygen__kernel_optix_integrator_intersect_closest";
+ group_descs[PG_RGEN_INTERSECT_SHADOW].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN_INTERSECT_SHADOW].raygen.module = optix_module;
+ group_descs[PG_RGEN_INTERSECT_SHADOW].raygen.entryFunctionName =
+ "__raygen__kernel_optix_integrator_intersect_shadow";
+ group_descs[PG_RGEN_INTERSECT_SUBSURFACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN_INTERSECT_SUBSURFACE].raygen.module = optix_module;
+ group_descs[PG_RGEN_INTERSECT_SUBSURFACE].raygen.entryFunctionName =
+ "__raygen__kernel_optix_integrator_intersect_subsurface";
+ group_descs[PG_RGEN_INTERSECT_VOLUME_STACK].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN_INTERSECT_VOLUME_STACK].raygen.module = optix_module;
+ group_descs[PG_RGEN_INTERSECT_VOLUME_STACK].raygen.entryFunctionName =
+ "__raygen__kernel_optix_integrator_intersect_volume_stack";
+ group_descs[PG_MISS].kind = OPTIX_PROGRAM_GROUP_KIND_MISS;
+ group_descs[PG_MISS].miss.module = optix_module;
+ group_descs[PG_MISS].miss.entryFunctionName = "__miss__kernel_optix_miss";
+ group_descs[PG_HITD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
+ group_descs[PG_HITD].hitgroup.moduleCH = optix_module;
+ group_descs[PG_HITD].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit";
+ group_descs[PG_HITD].hitgroup.moduleAH = optix_module;
+ group_descs[PG_HITD].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_visibility_test";
+ group_descs[PG_HITS].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
+ group_descs[PG_HITS].hitgroup.moduleAH = optix_module;
+ group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit";
+
+ if (kernel_features & KERNEL_FEATURE_HAIR) {
+ if (kernel_features & KERNEL_FEATURE_HAIR_THICK) {
+ /* Built-in thick curve intersection. */
+ OptixBuiltinISOptions builtin_options = {};
+ builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
+ builtin_options.usesMotionBlur = false;
+
+ optix_assert(optixBuiltinISModuleGet(
+ context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[0]));
+
+ group_descs[PG_HITD].hitgroup.moduleIS = builtin_modules[0];
+ group_descs[PG_HITD].hitgroup.entryFunctionNameIS = nullptr;
+ group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0];
+ group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr;
+
+ if (motion_blur) {
+ builtin_options.usesMotionBlur = true;
+
+ optix_assert(optixBuiltinISModuleGet(
+ context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[1]));
+
+ group_descs[PG_HITD_MOTION] = group_descs[PG_HITD];
+ group_descs[PG_HITD_MOTION].hitgroup.moduleIS = builtin_modules[1];
+ group_descs[PG_HITS_MOTION] = group_descs[PG_HITS];
+ group_descs[PG_HITS_MOTION].hitgroup.moduleIS = builtin_modules[1];
+ }
+ }
+ else {
+ /* Custom ribbon intersection. */
+ group_descs[PG_HITD].hitgroup.moduleIS = optix_module;
+ group_descs[PG_HITS].hitgroup.moduleIS = optix_module;
+ group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
+ group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
+ }
+ }
+
+ if (kernel_features & (KERNEL_FEATURE_SUBSURFACE | KERNEL_FEATURE_NODE_RAYTRACE)) {
+ /* Add hit group for local intersections. */
+ group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
+ group_descs[PG_HITL].hitgroup.moduleAH = optix_module;
+ group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit";
+ }
+
+ /* Shader raytracing replaces some functions with direct callables. */
+ if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
+ group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.module = optix_module;
+ group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.entryFunctionName =
+ "__raygen__kernel_optix_integrator_shade_surface_raytrace";
+ group_descs[PG_CALL_SVM_AO].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+ group_descs[PG_CALL_SVM_AO].callables.moduleDC = optix_module;
+ group_descs[PG_CALL_SVM_AO].callables.entryFunctionNameDC = "__direct_callable__svm_node_ao";
+ group_descs[PG_CALL_SVM_BEVEL].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+ group_descs[PG_CALL_SVM_BEVEL].callables.moduleDC = optix_module;
+ group_descs[PG_CALL_SVM_BEVEL].callables.entryFunctionNameDC =
+ "__direct_callable__svm_node_bevel";
+ group_descs[PG_CALL_AO_PASS].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+ group_descs[PG_CALL_AO_PASS].callables.moduleDC = optix_module;
+ group_descs[PG_CALL_AO_PASS].callables.entryFunctionNameDC = "__direct_callable__ao_pass";
+ }
+
+ optix_assert(optixProgramGroupCreate(
+ context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups));
+
+ /* Get program stack sizes. */
+ OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {};
+ /* Set up SBT, which in this case is used only to select between different programs. */
+ sbt_data.alloc(NUM_PROGRAM_GROUPS);
+ memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS);
+ for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
+ optix_assert(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
+ optix_assert(optixProgramGroupGetStackSize(groups[i], &stack_size[i]));
+ }
+ sbt_data.copy_to_device(); /* Upload SBT to device. */
+
+ /* Calculate maximum trace continuation stack size. */
+ unsigned int trace_css = stack_size[PG_HITD].cssCH;
+ /* This is based on the maximum of closest-hit and any-hit/intersection programs. */
+ trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH);
+ trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH);
+ trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH);
+ trace_css = std::max(trace_css,
+ stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH);
+ trace_css = std::max(trace_css,
+ stack_size[PG_HITS_MOTION].cssIS + stack_size[PG_HITS_MOTION].cssAH);
+
+ OptixPipelineLinkOptions link_options = {};
+ link_options.maxTraceDepth = 1;
+
+ if (DebugFlags().optix.use_debug) {
+ link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
+ }
+ else {
+ link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
+ }
+
+ if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
+ /* Create shader raytracing pipeline. */
+ vector<OptixProgramGroup> pipeline_groups;
+ pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
+ pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_RAYTRACE]);
+ pipeline_groups.push_back(groups[PG_MISS]);
+ pipeline_groups.push_back(groups[PG_HITD]);
+ pipeline_groups.push_back(groups[PG_HITS]);
+ pipeline_groups.push_back(groups[PG_HITL]);
+ if (motion_blur) {
+ pipeline_groups.push_back(groups[PG_HITD_MOTION]);
+ pipeline_groups.push_back(groups[PG_HITS_MOTION]);
+ }
+ pipeline_groups.push_back(groups[PG_CALL_SVM_AO]);
+ pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]);
+
+ optix_assert(optixPipelineCreate(context,
+ &pipeline_options,
+ &link_options,
+ pipeline_groups.data(),
+ pipeline_groups.size(),
+ nullptr,
+ 0,
+ &pipelines[PIP_SHADE_RAYTRACE]));
+
+ /* Combine ray generation and trace continuation stack size. */
+ const unsigned int css = stack_size[PG_RGEN_SHADE_SURFACE_RAYTRACE].cssRG +
+ link_options.maxTraceDepth * trace_css;
+ const unsigned int dss = std::max(stack_size[PG_CALL_SVM_AO].dssDC,
+ stack_size[PG_CALL_SVM_BEVEL].dssDC);
+
+ /* Set stack size depending on pipeline options. */
+ optix_assert(optixPipelineSetStackSize(
+ pipelines[PIP_SHADE_RAYTRACE], 0, dss, css, motion_blur ? 3 : 2));
+ }
+
+ { /* Create intersection-only pipeline. */
+ vector<OptixProgramGroup> pipeline_groups;
+ pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
+ pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_CLOSEST]);
+ pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SHADOW]);
+ pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SUBSURFACE]);
+ pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_VOLUME_STACK]);
+ pipeline_groups.push_back(groups[PG_MISS]);
+ pipeline_groups.push_back(groups[PG_HITD]);
+ pipeline_groups.push_back(groups[PG_HITS]);
+ pipeline_groups.push_back(groups[PG_HITL]);
+ if (motion_blur) {
+ pipeline_groups.push_back(groups[PG_HITD_MOTION]);
+ pipeline_groups.push_back(groups[PG_HITS_MOTION]);
+ }
+
+ optix_assert(optixPipelineCreate(context,
+ &pipeline_options,
+ &link_options,
+ pipeline_groups.data(),
+ pipeline_groups.size(),
+ nullptr,
+ 0,
+ &pipelines[PIP_INTERSECT]));
+
+ /* Calculate continuation stack size based on the maximum of all ray generation stack sizes. */
+ const unsigned int css =
+ std::max(stack_size[PG_RGEN_INTERSECT_CLOSEST].cssRG,
+ std::max(stack_size[PG_RGEN_INTERSECT_SHADOW].cssRG,
+ std::max(stack_size[PG_RGEN_INTERSECT_SUBSURFACE].cssRG,
+ stack_size[PG_RGEN_INTERSECT_VOLUME_STACK].cssRG))) +
+ link_options.maxTraceDepth * trace_css;
+
+ optix_assert(
+ optixPipelineSetStackSize(pipelines[PIP_INTERSECT], 0, 0, css, motion_blur ? 3 : 2));
+ }
+
+ /* Clean up program group objects. */
+ for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
+ optixProgramGroupDestroy(groups[i]);
+ }
+
+ return true;
+}
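/* Editorial worked example (hypothetical byte counts, not part of this patch) for the
 * stack-size formula used above; real values come from optixProgramGroupGetStackSize(). */
const unsigned int example_css_rg = 6144;       /* cssRG of a ray-generation program. */
const unsigned int example_trace_css = 2048;    /* max of cssCH and cssIS + cssAH over hit groups. */
const unsigned int example_max_trace_depth = 1; /* Matches link_options.maxTraceDepth above. */
/* Continuation stack handed to optixPipelineSetStackSize(): 6144 + 1 * 2048 = 8192 bytes.
 * The final argument (maxTraversableGraphDepth) is 3 with motion blur
 * (instance -> motion transform -> geometry) and 2 without (instance -> geometry). */
const unsigned int example_css = example_css_rg +
                                 example_max_trace_depth * example_trace_css;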
+
+/* --------------------------------------------------------------------
+ * Buffer denoising.
+ */
+
+class OptiXDevice::DenoiseContext {
+ public:
+ explicit DenoiseContext(OptiXDevice *device, const DeviceDenoiseTask &task)
+ : denoise_params(task.params),
+ render_buffers(task.render_buffers),
+ buffer_params(task.buffer_params),
+ guiding_buffer(device, "denoiser guiding passes buffer"),
+ num_samples(task.num_samples)
+ {
+ num_input_passes = 1;
+ if (denoise_params.use_pass_albedo) {
+ num_input_passes += 1;
+ use_pass_albedo = true;
+ pass_denoising_albedo = buffer_params.get_pass_offset(PASS_DENOISING_ALBEDO);
+ if (denoise_params.use_pass_normal) {
+ num_input_passes += 1;
+ use_pass_normal = true;
+ pass_denoising_normal = buffer_params.get_pass_offset(PASS_DENOISING_NORMAL);
+ }
+ }
+
+ const int num_guiding_passes = num_input_passes - 1;
+
+ if (num_guiding_passes) {
+ if (task.allow_inplace_modification) {
+ guiding_params.device_pointer = render_buffers->buffer.device_pointer;
+
+ guiding_params.pass_albedo = pass_denoising_albedo;
+ guiding_params.pass_normal = pass_denoising_normal;
+
+ guiding_params.stride = buffer_params.stride;
+ guiding_params.pass_stride = buffer_params.pass_stride;
+ }
+ else {
+ guiding_params.pass_stride = 0;
+ if (use_pass_albedo) {
+ guiding_params.pass_albedo = guiding_params.pass_stride;
+ guiding_params.pass_stride += 3;
+ }
+ if (use_pass_normal) {
+ guiding_params.pass_normal = guiding_params.pass_stride;
+ guiding_params.pass_stride += 3;
+ }
+
+ guiding_params.stride = buffer_params.width;
+
+ guiding_buffer.alloc_to_device(buffer_params.width * buffer_params.height *
+ guiding_params.pass_stride);
+ guiding_params.device_pointer = guiding_buffer.device_pointer;
+ }
+ }
+
+ pass_sample_count = buffer_params.get_pass_offset(PASS_SAMPLE_COUNT);
+ }
+
+ const DenoiseParams &denoise_params;
+
+ RenderBuffers *render_buffers = nullptr;
+ const BufferParams &buffer_params;
+
+ /* Device-side storage of the guiding passes. */
+ device_only_memory<float> guiding_buffer;
+
+ struct {
+ device_ptr device_pointer = 0;
+
+ /* NOTE: Are only initialized when the corresponding guiding pass is enabled. */
+ int pass_albedo = PASS_UNUSED;
+ int pass_normal = PASS_UNUSED;
+
+ int stride = -1;
+ int pass_stride = -1;
+ } guiding_params;
+
+ /* Number of input passes. Including the color and extra auxiliary passes. */
+ int num_input_passes = 0;
+ bool use_pass_albedo = false;
+ bool use_pass_normal = false;
+
+ int num_samples = 0;
+
+ int pass_sample_count = PASS_UNUSED;
+
+ /* NOTE: Are only initialized when the corresponding guiding pass is enabled. */
+ int pass_denoising_albedo = PASS_UNUSED;
+ int pass_denoising_normal = PASS_UNUSED;
+
+  /* For passes which don't need an albedo channel for denoising, the actual albedo is replaced
+   * with (0.5, 0.5, 0.5). This flag indicates that the real albedo pass has been overwritten
+   * with the fake values, so denoising of passes which do need albedo can no longer happen. */
+ bool albedo_replaced_with_fake = false;
+};
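/* Editorial note (not part of this patch) on the guiding buffer layout configured by the
 * constructor above, assuming the row-major addressing used by the filter kernels: with both
 * albedo and normal enabled, pass_stride is 6 floats (pass_albedo = 0, pass_normal = 3) and
 * stride equals buffer_params.width, so the albedo of pixel (x, y) starts at
 *   guiding_buffer[((size_t)y * guiding_params.stride + x) * guiding_params.pass_stride +
 *                  guiding_params.pass_albedo]
 * which is the same addressing that the OptixImage2D row/pixel strides in denoise_run() encode. */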
+
+class OptiXDevice::DenoisePass {
+ public:
+ DenoisePass(const PassType type, const BufferParams &buffer_params) : type(type)
+ {
+ noisy_offset = buffer_params.get_pass_offset(type, PassMode::NOISY);
+ denoised_offset = buffer_params.get_pass_offset(type, PassMode::DENOISED);
+
+ const PassInfo pass_info = Pass::get_info(type);
+ num_components = pass_info.num_components;
+ use_compositing = pass_info.use_compositing;
+ use_denoising_albedo = pass_info.use_denoising_albedo;
+ }
+
+ PassType type;
+
+ int noisy_offset;
+ int denoised_offset;
+
+ int num_components;
+ bool use_compositing;
+ bool use_denoising_albedo;
+};
+
+bool OptiXDevice::denoise_buffer(const DeviceDenoiseTask &task)
+{
+ const CUDAContextScope scope(this);
+
+ DenoiseContext context(this, task);
+
+ if (!denoise_ensure(context)) {
+ return false;
+ }
+
+ if (!denoise_filter_guiding_preprocess(context)) {
+ LOG(ERROR) << "Error preprocessing guiding passes.";
+ return false;
+ }
+
+ /* Passes which will use real albedo when it is available. */
+ denoise_pass(context, PASS_COMBINED);
+ denoise_pass(context, PASS_SHADOW_CATCHER_MATTE);
+
+  /* Passes which do not need albedo; if the real albedo is present, it must be replaced with
+   * the fake one first. */
+ denoise_pass(context, PASS_SHADOW_CATCHER);
+
+ return true;
+}
+
+DeviceQueue *OptiXDevice::get_denoise_queue()
+{
+ return &denoiser_.queue;
+}
+
+bool OptiXDevice::denoise_filter_guiding_preprocess(DenoiseContext &context)
+{
+ const BufferParams &buffer_params = context.buffer_params;
+
+ const int work_size = buffer_params.width * buffer_params.height;
+
+ void *args[] = {const_cast<device_ptr *>(&context.guiding_params.device_pointer),
+ const_cast<int *>(&context.guiding_params.pass_stride),
+ const_cast<int *>(&context.guiding_params.pass_albedo),
+ const_cast<int *>(&context.guiding_params.pass_normal),
+ &context.render_buffers->buffer.device_pointer,
+ const_cast<int *>(&buffer_params.offset),
+ const_cast<int *>(&buffer_params.stride),
+ const_cast<int *>(&buffer_params.pass_stride),
+ const_cast<int *>(&context.pass_sample_count),
+ const_cast<int *>(&context.pass_denoising_albedo),
+ const_cast<int *>(&context.pass_denoising_normal),
+ const_cast<int *>(&buffer_params.full_x),
+ const_cast<int *>(&buffer_params.full_y),
+ const_cast<int *>(&buffer_params.width),
+ const_cast<int *>(&buffer_params.height),
+ const_cast<int *>(&context.num_samples)};
+
+ return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS, work_size, args);
+}
+
+bool OptiXDevice::denoise_filter_guiding_set_fake_albedo(DenoiseContext &context)
+{
+ const BufferParams &buffer_params = context.buffer_params;
+
+ const int work_size = buffer_params.width * buffer_params.height;
+
+ void *args[] = {const_cast<device_ptr *>(&context.guiding_params.device_pointer),
+ const_cast<int *>(&context.guiding_params.pass_stride),
+ const_cast<int *>(&context.guiding_params.pass_albedo),
+ const_cast<int *>(&buffer_params.width),
+ const_cast<int *>(&buffer_params.height)};
+
+ return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO, work_size, args);
+}
+
+void OptiXDevice::denoise_pass(DenoiseContext &context, PassType pass_type)
+{
+ const BufferParams &buffer_params = context.buffer_params;
+
+ const DenoisePass pass(pass_type, buffer_params);
+
+ if (pass.noisy_offset == PASS_UNUSED) {
+ return;
+ }
+ if (pass.denoised_offset == PASS_UNUSED) {
+ LOG(DFATAL) << "Missing denoised pass " << pass_type_as_string(pass_type);
+ return;
+ }
+
+ if (pass.use_denoising_albedo) {
+ if (context.albedo_replaced_with_fake) {
+ LOG(ERROR) << "Pass which requires albedo is denoised after fake albedo has been set.";
+ return;
+ }
+ }
+ else if (!context.albedo_replaced_with_fake) {
+ context.albedo_replaced_with_fake = true;
+ if (!denoise_filter_guiding_set_fake_albedo(context)) {
+ LOG(ERROR) << "Error replacing real albedo with the fake one.";
+ return;
+ }
+ }
+
+ /* Read and preprocess noisy color input pass. */
+ denoise_color_read(context, pass);
+ if (!denoise_filter_color_preprocess(context, pass)) {
+    LOG(ERROR) << "Error converting denoising passes to RGB buffer.";
+ return;
+ }
+
+ if (!denoise_run(context, pass)) {
+ LOG(ERROR) << "Error running OptiX denoiser.";
+ return;
+ }
+
+ /* Store result in the combined pass of the render buffer.
+ *
+ * This will scale the denoiser result up to match the number of, possibly per-pixel, samples. */
+ if (!denoise_filter_color_postprocess(context, pass)) {
+ LOG(ERROR) << "Error copying denoiser result to the denoised pass.";
+ return;
+ }
+
+ denoiser_.queue.synchronize();
+}
+
+void OptiXDevice::denoise_color_read(DenoiseContext &context, const DenoisePass &pass)
+{
+ PassAccessor::PassAccessInfo pass_access_info;
+ pass_access_info.type = pass.type;
+ pass_access_info.mode = PassMode::NOISY;
+ pass_access_info.offset = pass.noisy_offset;
+
+  /* The denoiser operates on the passes which are used to calculate the shadow catcher
+   * approximation, and is never run on the approximation itself. The latter is not even
+   * possible, because OptiX does not support denoising of semi-transparent pixels. */
+ pass_access_info.use_approximate_shadow_catcher = false;
+ pass_access_info.use_approximate_shadow_catcher_background = false;
+ pass_access_info.show_active_pixels = false;
+
+ /* TODO(sergey): Consider adding support of actual exposure, to avoid clamping in extreme cases.
+ */
+ const PassAccessorGPU pass_accessor(
+ &denoiser_.queue, pass_access_info, 1.0f, context.num_samples);
+
+ PassAccessor::Destination destination(pass_access_info.type);
+ destination.d_pixels = context.render_buffers->buffer.device_pointer +
+ pass.denoised_offset * sizeof(float);
+ destination.num_components = 3;
+ destination.pixel_stride = context.buffer_params.pass_stride;
+
+ pass_accessor.get_render_tile_pixels(context.render_buffers, context.buffer_params, destination);
+}
+
+bool OptiXDevice::denoise_filter_color_preprocess(DenoiseContext &context, const DenoisePass &pass)
+{
+ const BufferParams &buffer_params = context.buffer_params;
+
+ const int work_size = buffer_params.width * buffer_params.height;
+
+ void *args[] = {&context.render_buffers->buffer.device_pointer,
+ const_cast<int *>(&buffer_params.full_x),
+ const_cast<int *>(&buffer_params.full_y),
+ const_cast<int *>(&buffer_params.width),
+ const_cast<int *>(&buffer_params.height),
+ const_cast<int *>(&buffer_params.offset),
+ const_cast<int *>(&buffer_params.stride),
+ const_cast<int *>(&buffer_params.pass_stride),
+ const_cast<int *>(&pass.denoised_offset)};
+
+ return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_COLOR_PREPROCESS, work_size, args);
+}
+
+bool OptiXDevice::denoise_filter_color_postprocess(DenoiseContext &context,
+ const DenoisePass &pass)
+{
+ const BufferParams &buffer_params = context.buffer_params;
+
+ const int work_size = buffer_params.width * buffer_params.height;
+
+ void *args[] = {&context.render_buffers->buffer.device_pointer,
+ const_cast<int *>(&buffer_params.full_x),
+ const_cast<int *>(&buffer_params.full_y),
+ const_cast<int *>(&buffer_params.width),
+ const_cast<int *>(&buffer_params.height),
+ const_cast<int *>(&buffer_params.offset),
+ const_cast<int *>(&buffer_params.stride),
+ const_cast<int *>(&buffer_params.pass_stride),
+ const_cast<int *>(&context.num_samples),
+ const_cast<int *>(&pass.noisy_offset),
+ const_cast<int *>(&pass.denoised_offset),
+ const_cast<int *>(&context.pass_sample_count),
+ const_cast<int *>(&pass.num_components),
+ const_cast<bool *>(&pass.use_compositing)};
+
+ return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS, work_size, args);
+}
+
+bool OptiXDevice::denoise_ensure(DenoiseContext &context)
+{
+ if (!denoise_create_if_needed(context)) {
+ LOG(ERROR) << "OptiX denoiser creation has failed.";
+ return false;
+ }
+
+ if (!denoise_configure_if_needed(context)) {
+ LOG(ERROR) << "OptiX denoiser configuration has failed.";
+ return false;
+ }
+
+ return true;
+}
+
+bool OptiXDevice::denoise_create_if_needed(DenoiseContext &context)
+{
+ const bool recreate_denoiser = (denoiser_.optix_denoiser == nullptr) ||
+ (denoiser_.use_pass_albedo != context.use_pass_albedo) ||
+ (denoiser_.use_pass_normal != context.use_pass_normal);
+ if (!recreate_denoiser) {
+ return true;
+ }
+
+ /* Destroy existing handle before creating new one. */
+ if (denoiser_.optix_denoiser) {
+ optixDenoiserDestroy(denoiser_.optix_denoiser);
+ }
+
+ /* Create OptiX denoiser handle on demand when it is first used. */
+ OptixDenoiserOptions denoiser_options = {};
+ denoiser_options.guideAlbedo = context.use_pass_albedo;
+ denoiser_options.guideNormal = context.use_pass_normal;
+ const OptixResult result = optixDenoiserCreate(
+ this->context, OPTIX_DENOISER_MODEL_KIND_HDR, &denoiser_options, &denoiser_.optix_denoiser);
+
+ if (result != OPTIX_SUCCESS) {
+ set_error("Failed to create OptiX denoiser");
+ return false;
+ }
+
+ /* OptiX denoiser handle was created with the requested number of input passes. */
+ denoiser_.use_pass_albedo = context.use_pass_albedo;
+ denoiser_.use_pass_normal = context.use_pass_normal;
+
+ /* OptiX denoiser has been created, but it needs configuration. */
+ denoiser_.is_configured = false;
+
+ return true;
+}
+
+bool OptiXDevice::denoise_configure_if_needed(DenoiseContext &context)
+{
+ if (denoiser_.is_configured && (denoiser_.configured_size.x == context.buffer_params.width &&
+ denoiser_.configured_size.y == context.buffer_params.height)) {
+ return true;
+ }
+
+ const BufferParams &buffer_params = context.buffer_params;
+
+ OptixDenoiserSizes sizes = {};
+ optix_assert(optixDenoiserComputeMemoryResources(
+ denoiser_.optix_denoiser, buffer_params.width, buffer_params.height, &sizes));
+
+ denoiser_.scratch_size = sizes.withOverlapScratchSizeInBytes;
+ denoiser_.scratch_offset = sizes.stateSizeInBytes;
+
+ /* Allocate denoiser state if tile size has changed since last setup. */
+ denoiser_.state.alloc_to_device(denoiser_.scratch_offset + denoiser_.scratch_size);
+
+ /* Initialize denoiser state for the current tile size. */
+ const OptixResult result = optixDenoiserSetup(denoiser_.optix_denoiser,
+ denoiser_.queue.stream(),
+ buffer_params.width,
+ buffer_params.height,
+ denoiser_.state.device_pointer,
+ denoiser_.scratch_offset,
+ denoiser_.state.device_pointer +
+ denoiser_.scratch_offset,
+ denoiser_.scratch_size);
+ if (result != OPTIX_SUCCESS) {
+ set_error("Failed to set up OptiX denoiser");
+ return false;
+ }
+
+ denoiser_.is_configured = true;
+ denoiser_.configured_size.x = buffer_params.width;
+ denoiser_.configured_size.y = buffer_params.height;
+
+ return true;
+}
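/* Editorial summary (not part of this patch) of the single state allocation configured above,
 * matching the base + offset arguments later passed to optixDenoiserSetup()/optixDenoiserInvoke():
 *   [0, scratch_offset)                     denoiser state  (sizes.stateSizeInBytes)
 *   [scratch_offset, scratch_offset + size) scratch area    (sizes.withOverlapScratchSizeInBytes)
 * The allocation is reused until the configured width/height changes. */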
+
+bool OptiXDevice::denoise_run(DenoiseContext &context, const DenoisePass &pass)
+{
+ const BufferParams &buffer_params = context.buffer_params;
+ const int width = buffer_params.width;
+ const int height = buffer_params.height;
+
+ /* Set up input and output layer information. */
+ OptixImage2D color_layer = {0};
+ OptixImage2D albedo_layer = {0};
+ OptixImage2D normal_layer = {0};
+
+ OptixImage2D output_layer = {0};
+
+ /* Color pass. */
+ {
+ const int pass_denoised = pass.denoised_offset;
+ const int64_t pass_stride_in_bytes = context.buffer_params.pass_stride * sizeof(float);
+
+ color_layer.data = context.render_buffers->buffer.device_pointer +
+ pass_denoised * sizeof(float);
+ color_layer.width = width;
+ color_layer.height = height;
+ color_layer.rowStrideInBytes = pass_stride_in_bytes * context.buffer_params.stride;
+ color_layer.pixelStrideInBytes = pass_stride_in_bytes;
+ color_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3;
+ }
+
+ device_vector<float> fake_albedo(this, "fake_albedo", MEM_READ_WRITE);
+
+ /* Optional albedo and color passes. */
+ if (context.num_input_passes > 1) {
+ const device_ptr d_guiding_buffer = context.guiding_params.device_pointer;
+ const int64_t pixel_stride_in_bytes = context.guiding_params.pass_stride * sizeof(float);
+ const int64_t row_stride_in_bytes = context.guiding_params.stride * pixel_stride_in_bytes;
+
+ if (context.use_pass_albedo) {
+ albedo_layer.data = d_guiding_buffer + context.guiding_params.pass_albedo * sizeof(float);
+ albedo_layer.width = width;
+ albedo_layer.height = height;
+ albedo_layer.rowStrideInBytes = row_stride_in_bytes;
+ albedo_layer.pixelStrideInBytes = pixel_stride_in_bytes;
+ albedo_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3;
+ }
+
+ if (context.use_pass_normal) {
+ normal_layer.data = d_guiding_buffer + context.guiding_params.pass_normal * sizeof(float);
+ normal_layer.width = width;
+ normal_layer.height = height;
+ normal_layer.rowStrideInBytes = row_stride_in_bytes;
+ normal_layer.pixelStrideInBytes = pixel_stride_in_bytes;
+ normal_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3;
+ }
+ }
+
+ /* Denoise in-place of the noisy input in the render buffers. */
+ output_layer = color_layer;
+
+ /* Finally run denoising. */
+ OptixDenoiserParams params = {}; /* All parameters are disabled/zero. */
+ OptixDenoiserLayer image_layers = {};
+ image_layers.input = color_layer;
+ image_layers.output = output_layer;
+
+ OptixDenoiserGuideLayer guide_layers = {};
+ guide_layers.albedo = albedo_layer;
+ guide_layers.normal = normal_layer;
+
+ optix_assert(optixDenoiserInvoke(denoiser_.optix_denoiser,
+ denoiser_.queue.stream(),
+ &params,
+ denoiser_.state.device_pointer,
+ denoiser_.scratch_offset,
+ &guide_layers,
+ &image_layers,
+ 1,
+ 0,
+ 0,
+ denoiser_.state.device_pointer + denoiser_.scratch_offset,
+ denoiser_.scratch_size));
+
+ return true;
+}
+
+bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh,
+ OptixBuildOperation operation,
+ const OptixBuildInput &build_input,
+ uint16_t num_motion_steps)
+{
+ const CUDAContextScope scope(this);
+
+ const bool use_fast_trace_bvh = (bvh->params.bvh_type == BVH_TYPE_STATIC);
+
+ /* Compute memory usage. */
+ OptixAccelBufferSizes sizes = {};
+ OptixAccelBuildOptions options = {};
+ options.operation = operation;
+ if (use_fast_trace_bvh) {
+ VLOG(2) << "Using fast to trace OptiX BVH";
+ options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION;
+ }
+ else {
+ VLOG(2) << "Using fast to update OptiX BVH";
+ options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_BUILD | OPTIX_BUILD_FLAG_ALLOW_UPDATE;
+ }
+
+ options.motionOptions.numKeys = num_motion_steps;
+ options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH;
+ options.motionOptions.timeBegin = 0.0f;
+ options.motionOptions.timeEnd = 1.0f;
+
+ optix_assert(optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes));
+
+ /* Allocate required output buffers. */
+ device_only_memory<char> temp_mem(this, "optix temp as build mem");
+ temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8);
+ if (!temp_mem.device_pointer) {
+ /* Make sure temporary memory allocation succeeded. */
+ return false;
+ }
+
+ device_only_memory<char> &out_data = bvh->as_data;
+ if (operation == OPTIX_BUILD_OPERATION_BUILD) {
+ assert(out_data.device == this);
+ out_data.alloc_to_device(sizes.outputSizeInBytes);
+ if (!out_data.device_pointer) {
+ return false;
+ }
+ }
+ else {
+ assert(out_data.device_pointer && out_data.device_size >= sizes.outputSizeInBytes);
+ }
+
+ /* Finally build the acceleration structure. */
+ OptixAccelEmitDesc compacted_size_prop = {};
+ compacted_size_prop.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE;
+ /* A tiny space was allocated for this property at the end of the temporary buffer above.
+ * Make sure this pointer is 8-byte aligned. */
+ compacted_size_prop.result = align_up(temp_mem.device_pointer + sizes.tempSizeInBytes, 8);
+
+ OptixTraversableHandle out_handle = 0;
+ optix_assert(optixAccelBuild(context,
+ NULL,
+ &options,
+ &build_input,
+ 1,
+ temp_mem.device_pointer,
+ sizes.tempSizeInBytes,
+ out_data.device_pointer,
+ sizes.outputSizeInBytes,
+ &out_handle,
+ use_fast_trace_bvh ? &compacted_size_prop : NULL,
+ use_fast_trace_bvh ? 1 : 0));
+ bvh->traversable_handle = static_cast<uint64_t>(out_handle);
+
+ /* Wait for all operations to finish. */
+ cuda_assert(cuStreamSynchronize(NULL));
+
+ /* Compact acceleration structure to save memory (do not do this in viewport for faster builds).
+ */
+ if (use_fast_trace_bvh) {
+ uint64_t compacted_size = sizes.outputSizeInBytes;
+ cuda_assert(cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size)));
+
+ /* Temporary memory is no longer needed, so free it now to make space. */
+ temp_mem.free();
+
+ /* There is no point compacting if the size does not change. */
+ if (compacted_size < sizes.outputSizeInBytes) {
+ device_only_memory<char> compacted_data(this, "optix compacted as");
+ compacted_data.alloc_to_device(compacted_size);
+ if (!compacted_data.device_pointer)
+ /* Do not compact if memory allocation for compacted acceleration structure fails.
+ * Can just use the uncompacted one then, so succeed here regardless. */
+ return !have_error();
+
+ optix_assert(optixAccelCompact(
+ context, NULL, out_handle, compacted_data.device_pointer, compacted_size, &out_handle));
+ bvh->traversable_handle = static_cast<uint64_t>(out_handle);
+
+ /* Wait for compaction to finish. */
+ cuda_assert(cuStreamSynchronize(NULL));
+
+ std::swap(out_data.device_size, compacted_data.device_size);
+ std::swap(out_data.device_pointer, compacted_data.device_pointer);
+ }
+ }
+
+ return !have_error();
+}
+
+void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
+{
+ const bool use_fast_trace_bvh = (bvh->params.bvh_type == BVH_TYPE_STATIC);
+
+ free_bvh_memory_delayed();
+
+ BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
+
+ progress.set_substatus("Building OptiX acceleration structure");
+
+ if (!bvh->params.top_level) {
+ assert(bvh->objects.size() == 1 && bvh->geometry.size() == 1);
+
+ /* Refit is only possible in viewport for now (because AS is built with
+ * OPTIX_BUILD_FLAG_ALLOW_UPDATE only there, see above). */
+ OptixBuildOperation operation = OPTIX_BUILD_OPERATION_BUILD;
+ if (refit && !use_fast_trace_bvh) {
+ assert(bvh_optix->traversable_handle != 0);
+ operation = OPTIX_BUILD_OPERATION_UPDATE;
+ }
+ else {
+ bvh_optix->as_data.free();
+ bvh_optix->traversable_handle = 0;
+ }
+
+ /* Build bottom level acceleration structures (BLAS). */
+ Geometry *const geom = bvh->geometry[0];
+ if (geom->geometry_type == Geometry::HAIR) {
+ /* Build BLAS for curve primitives. */
+ Hair *const hair = static_cast<Hair *const>(geom);
+ if (hair->num_curves() == 0) {
+ return;
+ }
+
+ const size_t num_segments = hair->num_segments();
+
+ size_t num_motion_steps = 1;
+ Attribute *motion_keys = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+ if (motion_blur && hair->get_use_motion_blur() && motion_keys) {
+ num_motion_steps = hair->get_motion_steps();
+ }
+
+ device_vector<OptixAabb> aabb_data(this, "optix temp aabb data", MEM_READ_ONLY);
+ device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
+ device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
+ /* Four control points for each curve segment. */
+ const size_t num_vertices = num_segments * 4;
+ if (hair->curve_shape == CURVE_THICK) {
+ index_data.alloc(num_segments);
+ vertex_data.alloc(num_vertices * num_motion_steps);
+ }
+ else
+ aabb_data.alloc(num_segments * num_motion_steps);
+
+ /* Get AABBs for each motion step. */
+ for (size_t step = 0; step < num_motion_steps; ++step) {
+ /* The center step for motion vertices is not stored in the attribute. */
+ const float3 *keys = hair->get_curve_keys().data();
+ size_t center_step = (num_motion_steps - 1) / 2;
+ if (step != center_step) {
+ size_t attr_offset = (step > center_step) ? step - 1 : step;
+ /* Technically this is a float4 array, but sizeof(float3) == sizeof(float4). */
+ keys = motion_keys->data_float3() + attr_offset * hair->get_curve_keys().size();
+ }
+
+ for (size_t j = 0, i = 0; j < hair->num_curves(); ++j) {
+ const Hair::Curve curve = hair->get_curve(j);
+ const array<float> &curve_radius = hair->get_curve_radius();
+
+ for (int segment = 0; segment < curve.num_segments(); ++segment, ++i) {
+ if (hair->curve_shape == CURVE_THICK) {
+ int k0 = curve.first_key + segment;
+ int k1 = k0 + 1;
+ int ka = max(k0 - 1, curve.first_key);
+ int kb = min(k1 + 1, curve.first_key + curve.num_keys - 1);
+
+ const float4 px = make_float4(keys[ka].x, keys[k0].x, keys[k1].x, keys[kb].x);
+ const float4 py = make_float4(keys[ka].y, keys[k0].y, keys[k1].y, keys[kb].y);
+ const float4 pz = make_float4(keys[ka].z, keys[k0].z, keys[k1].z, keys[kb].z);
+ const float4 pw = make_float4(
+ curve_radius[ka], curve_radius[k0], curve_radius[k1], curve_radius[kb]);
+
+ /* Convert Catmull-Rom data to Bezier spline. */
+ static const float4 cr2bsp0 = make_float4(+7, -4, +5, -2) / 6.f;
+ static const float4 cr2bsp1 = make_float4(-2, 11, -4, +1) / 6.f;
+ static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f;
+ static const float4 cr2bsp3 = make_float4(-2, +5, -4, +7) / 6.f;
+
+ index_data[i] = i * 4;
+ float4 *const v = vertex_data.data() + step * num_vertices + index_data[i];
+ v[0] = make_float4(
+ dot(cr2bsp0, px), dot(cr2bsp0, py), dot(cr2bsp0, pz), dot(cr2bsp0, pw));
+ v[1] = make_float4(
+ dot(cr2bsp1, px), dot(cr2bsp1, py), dot(cr2bsp1, pz), dot(cr2bsp1, pw));
+ v[2] = make_float4(
+ dot(cr2bsp2, px), dot(cr2bsp2, py), dot(cr2bsp2, pz), dot(cr2bsp2, pw));
+ v[3] = make_float4(
+ dot(cr2bsp3, px), dot(cr2bsp3, py), dot(cr2bsp3, pz), dot(cr2bsp3, pw));
+ }
+ else {
+ BoundBox bounds = BoundBox::empty;
+ curve.bounds_grow(segment, keys, hair->get_curve_radius().data(), bounds);
+
+ const size_t index = step * num_segments + i;
+ aabb_data[index].minX = bounds.min.x;
+ aabb_data[index].minY = bounds.min.y;
+ aabb_data[index].minZ = bounds.min.z;
+ aabb_data[index].maxX = bounds.max.x;
+ aabb_data[index].maxY = bounds.max.y;
+ aabb_data[index].maxZ = bounds.max.z;
+ }
+ }
+ }
+ }
+
+ /* Upload AABB data to GPU. */
+ aabb_data.copy_to_device();
+ index_data.copy_to_device();
+ vertex_data.copy_to_device();
+
+ vector<device_ptr> aabb_ptrs;
+ aabb_ptrs.reserve(num_motion_steps);
+ vector<device_ptr> width_ptrs;
+ vector<device_ptr> vertex_ptrs;
+ width_ptrs.reserve(num_motion_steps);
+ vertex_ptrs.reserve(num_motion_steps);
+ for (size_t step = 0; step < num_motion_steps; ++step) {
+ aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb));
+ const device_ptr base_ptr = vertex_data.device_pointer +
+ step * num_vertices * sizeof(float4);
+ width_ptrs.push_back(base_ptr + 3 * sizeof(float)); /* Offset by vertex size. */
+ vertex_ptrs.push_back(base_ptr);
+ }
+
+ /* Force a single any-hit call, so shadow record-all behavior works correctly. */
+ unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
+ OptixBuildInput build_input = {};
+ if (hair->curve_shape == CURVE_THICK) {
+ build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES;
+ build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
+ build_input.curveArray.numPrimitives = num_segments;
+ build_input.curveArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
+ build_input.curveArray.numVertices = num_vertices;
+ build_input.curveArray.vertexStrideInBytes = sizeof(float4);
+ build_input.curveArray.widthBuffers = (CUdeviceptr *)width_ptrs.data();
+ build_input.curveArray.widthStrideInBytes = sizeof(float4);
+ build_input.curveArray.indexBuffer = (CUdeviceptr)index_data.device_pointer;
+ build_input.curveArray.indexStrideInBytes = sizeof(int);
+ build_input.curveArray.flag = build_flags;
+ build_input.curveArray.primitiveIndexOffset = hair->optix_prim_offset;
+ }
+ else {
+ /* Disable visibility test any-hit program, since it is already checked during
+ * intersection. Those trace calls that require anyhit can force it with a ray flag. */
+ build_flags |= OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT;
+
+ build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES;
+ build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
+ build_input.customPrimitiveArray.numPrimitives = num_segments;
+ build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb);
+ build_input.customPrimitiveArray.flags = &build_flags;
+ build_input.customPrimitiveArray.numSbtRecords = 1;
+ build_input.customPrimitiveArray.primitiveIndexOffset = hair->optix_prim_offset;
+ }
+
+ if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
+ progress.set_error("Failed to build OptiX acceleration structure");
+ }
+ }
+ else if (geom->geometry_type == Geometry::MESH || geom->geometry_type == Geometry::VOLUME) {
+ /* Build BLAS for triangle primitives. */
+ Mesh *const mesh = static_cast<Mesh *const>(geom);
+ if (mesh->num_triangles() == 0) {
+ return;
+ }
+
+ const size_t num_verts = mesh->get_verts().size();
+
+ size_t num_motion_steps = 1;
+ Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+ if (motion_blur && mesh->get_use_motion_blur() && motion_keys) {
+ num_motion_steps = mesh->get_motion_steps();
+ }
+
+ device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
+ index_data.alloc(mesh->get_triangles().size());
+ memcpy(index_data.data(),
+ mesh->get_triangles().data(),
+ mesh->get_triangles().size() * sizeof(int));
+ device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
+ vertex_data.alloc(num_verts * num_motion_steps);
+
+ for (size_t step = 0; step < num_motion_steps; ++step) {
+ const float3 *verts = mesh->get_verts().data();
+
+ size_t center_step = (num_motion_steps - 1) / 2;
+ /* The center step for motion vertices is not stored in the attribute. */
+ if (step != center_step) {
+ verts = motion_keys->data_float3() + (step > center_step ? step - 1 : step) * num_verts;
+ }
+
+ memcpy(vertex_data.data() + num_verts * step, verts, num_verts * sizeof(float3));
+ }
+
+ /* Upload triangle data to GPU. */
+ index_data.copy_to_device();
+ vertex_data.copy_to_device();
+
+ vector<device_ptr> vertex_ptrs;
+ vertex_ptrs.reserve(num_motion_steps);
+ for (size_t step = 0; step < num_motion_steps; ++step) {
+ vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3));
+ }
+
+ /* Force a single any-hit call, so shadow record-all behavior works correctly. */
+ unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
+ OptixBuildInput build_input = {};
+ build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES;
+ build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
+ build_input.triangleArray.numVertices = num_verts;
+ build_input.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3;
+ build_input.triangleArray.vertexStrideInBytes = sizeof(float4);
+ build_input.triangleArray.indexBuffer = index_data.device_pointer;
+ build_input.triangleArray.numIndexTriplets = mesh->num_triangles();
+ build_input.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3;
+ build_input.triangleArray.indexStrideInBytes = 3 * sizeof(int);
+ build_input.triangleArray.flags = &build_flags;
+ /* The SBT does not store per primitive data since Cycles already allocates separate
+ * buffers for that purpose. OptiX does not allow this to be zero though, so just pass in
+ * one and rely on that having the same meaning in this case. */
+ build_input.triangleArray.numSbtRecords = 1;
+ build_input.triangleArray.primitiveIndexOffset = mesh->optix_prim_offset;
+
+ if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
+ progress.set_error("Failed to build OptiX acceleration structure");
+ }
+ }
+ }
+ else {
+ unsigned int num_instances = 0;
+ unsigned int max_num_instances = 0xFFFFFFFF;
+
+ bvh_optix->as_data.free();
+ bvh_optix->traversable_handle = 0;
+ bvh_optix->motion_transform_data.free();
+
+ optixDeviceContextGetProperty(context,
+ OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID,
+ &max_num_instances,
+ sizeof(max_num_instances));
+ /* Do not count first bit, which is used to distinguish instanced and non-instanced objects. */
+ max_num_instances >>= 1;
+ if (bvh->objects.size() > max_num_instances) {
+ progress.set_error(
+ "Failed to build OptiX acceleration structure because there are too many instances");
+ return;
+ }
+
+ /* Fill instance descriptions. */
+ device_vector<OptixInstance> instances(this, "optix tlas instances", MEM_READ_ONLY);
+ instances.alloc(bvh->objects.size());
+
+ /* Calculate total motion transform size and allocate memory for them. */
+ size_t motion_transform_offset = 0;
+ if (motion_blur) {
+ size_t total_motion_transform_size = 0;
+ for (Object *const ob : bvh->objects) {
+ if (ob->is_traceable() && ob->use_motion()) {
+ total_motion_transform_size = align_up(total_motion_transform_size,
+ OPTIX_TRANSFORM_BYTE_ALIGNMENT);
+ const size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
+ total_motion_transform_size = total_motion_transform_size +
+ sizeof(OptixSRTMotionTransform) +
+ motion_keys * sizeof(OptixSRTData);
+ }
+ }
+
+ assert(bvh_optix->motion_transform_data.device == this);
+ bvh_optix->motion_transform_data.alloc_to_device(total_motion_transform_size);
+ }
+
+ for (Object *ob : bvh->objects) {
+ /* Skip non-traceable objects. */
+ if (!ob->is_traceable()) {
+ continue;
+ }
+
+ BVHOptiX *const blas = static_cast<BVHOptiX *>(ob->get_geometry()->bvh);
+ OptixTraversableHandle handle = blas->traversable_handle;
+
+ OptixInstance &instance = instances[num_instances++];
+ memset(&instance, 0, sizeof(instance));
+
+ /* Clear transform to identity matrix. */
+ instance.transform[0] = 1.0f;
+ instance.transform[5] = 1.0f;
+ instance.transform[10] = 1.0f;
+
+ /* Set user instance ID to object index (but leave low bit blank). */
+ instance.instanceId = ob->get_device_index() << 1;
+
+ /* Have to have at least one bit in the mask, or else instance would always be culled. */
+ instance.visibilityMask = 1;
+
+ if (ob->get_geometry()->has_volume) {
+ /* Volumes have a special bit set in the visibility mask so a trace can mask only volumes.
+ */
+ instance.visibilityMask |= 2;
+ }
+
+ if (ob->get_geometry()->geometry_type == Geometry::HAIR) {
+ /* Same applies to curves (so they can be skipped in local trace calls). */
+ instance.visibilityMask |= 4;
+
+ if (motion_blur && ob->get_geometry()->has_motion_blur() &&
+ static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) {
+ /* Select between motion blur and non-motion blur built-in intersection module. */
+ instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
+ }
+ }
+
+ /* Insert motion traversable if object has motion. */
+ if (motion_blur && ob->use_motion()) {
+ size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
+ size_t motion_transform_size = sizeof(OptixSRTMotionTransform) +
+ motion_keys * sizeof(OptixSRTData);
+
+ const CUDAContextScope scope(this);
+
+ motion_transform_offset = align_up(motion_transform_offset,
+ OPTIX_TRANSFORM_BYTE_ALIGNMENT);
+ CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data.device_pointer +
+ motion_transform_offset;
+ motion_transform_offset += motion_transform_size;
+
+ /* Allocate host side memory for motion transform and fill it with transform data. */
+ OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>(
+ new uint8_t[motion_transform_size]);
+ motion_transform.child = handle;
+ motion_transform.motionOptions.numKeys = ob->get_motion().size();
+ motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE;
+ motion_transform.motionOptions.timeBegin = 0.0f;
+ motion_transform.motionOptions.timeEnd = 1.0f;
+
+ OptixSRTData *const srt_data = motion_transform.srtData;
+ array<DecomposedTransform> decomp(ob->get_motion().size());
+ transform_motion_decompose(
+ decomp.data(), ob->get_motion().data(), ob->get_motion().size());
+
+ for (size_t i = 0; i < ob->get_motion().size(); ++i) {
+ /* Scale. */
+ srt_data[i].sx = decomp[i].y.w; /* scale.x.x */
+ srt_data[i].sy = decomp[i].z.w; /* scale.y.y */
+ srt_data[i].sz = decomp[i].w.w; /* scale.z.z */
+
+ /* Shear. */
+ srt_data[i].a = decomp[i].z.x; /* scale.x.y */
+ srt_data[i].b = decomp[i].z.y; /* scale.x.z */
+ srt_data[i].c = decomp[i].w.x; /* scale.y.z */
+ assert(decomp[i].z.z == 0.0f); /* scale.y.x */
+ assert(decomp[i].w.y == 0.0f); /* scale.z.x */
+ assert(decomp[i].w.z == 0.0f); /* scale.z.y */
+
+ /* Pivot point. */
+ srt_data[i].pvx = 0.0f;
+ srt_data[i].pvy = 0.0f;
+ srt_data[i].pvz = 0.0f;
+
+ /* Rotation. */
+ srt_data[i].qx = decomp[i].x.x;
+ srt_data[i].qy = decomp[i].x.y;
+ srt_data[i].qz = decomp[i].x.z;
+ srt_data[i].qw = decomp[i].x.w;
+
+ /* Translation. */
+ srt_data[i].tx = decomp[i].y.x;
+ srt_data[i].ty = decomp[i].y.y;
+ srt_data[i].tz = decomp[i].y.z;
+ }
+
+ /* Upload motion transform to GPU. */
+ cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size);
+ delete[] reinterpret_cast<uint8_t *>(&motion_transform);
+
+ /* Disable instance transform if object uses motion transform already. */
+ instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
+
+ /* Get traversable handle to motion transform. */
+ optixConvertPointerToTraversableHandle(context,
+ motion_transform_gpu,
+ OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM,
+ &instance.traversableHandle);
+ }
+ else {
+ instance.traversableHandle = handle;
+
+ if (ob->get_geometry()->is_instanced()) {
+ /* Set transform matrix. */
+ memcpy(instance.transform, &ob->get_tfm(), sizeof(instance.transform));
+ }
+ else {
+ /* Disable instance transform if geometry already has it applied to vertex data. */
+ instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
+ /* Non-instanced objects read ID from 'prim_object', so distinguish
+ * them from instanced objects with the low bit set. */
+ instance.instanceId |= 1;
+ }
+ }
+ }
+
+ /* Upload instance descriptions. */
+ instances.resize(num_instances);
+ instances.copy_to_device();
+
+ /* Build top-level acceleration structure (TLAS) */
+ OptixBuildInput build_input = {};
+ build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES;
+ build_input.instanceArray.instances = instances.device_pointer;
+ build_input.instanceArray.numInstances = num_instances;
+
+ if (!build_optix_bvh(bvh_optix, OPTIX_BUILD_OPERATION_BUILD, build_input, 0)) {
+ progress.set_error("Failed to build OptiX acceleration structure");
+ }
+ tlas_handle = bvh_optix->traversable_handle;
+ }
+}
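
Side note on the curve conversion inside build_bvh() above: the cr2bsp rows convert one Catmull-Rom segment (control values ka, k0, k1, kb; the curve runs between k0 and k1) into uniform cubic B-spline control points describing the same curve, which is what OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE expects. A minimal standalone sanity check (not part of the patch; plain C++ without any Cycles types, one scalar component only) could look like this:

#include <cassert>
#include <cmath>
#include <cstdio>

/* Catmull-Rom segment between p1 and p2, t in [0, 1]. */
static float catmull_rom(float p0, float p1, float p2, float p3, float t)
{
  return 0.5f * ((2.0f * p1) + (-p0 + p2) * t +
                 (2.0f * p0 - 5.0f * p1 + 4.0f * p2 - p3) * t * t +
                 (-p0 + 3.0f * p1 - 3.0f * p2 + p3) * t * t * t);
}

/* Uniform cubic B-spline segment over control points b0..b3, t in [0, 1]. */
static float bspline(float b0, float b1, float b2, float b3, float t)
{
  const float s = 1.0f - t;
  return (s * s * s * b0 + (3.0f * t * t * t - 6.0f * t * t + 4.0f) * b1 +
          (-3.0f * t * t * t + 3.0f * t * t + 3.0f * t + 1.0f) * b2 + t * t * t * b3) / 6.0f;
}

int main()
{
  const float p[4] = {0.0f, 1.0f, 3.0f, 2.0f}; /* Arbitrary values for ka, k0, k1, kb. */

  /* Same rows as cr2bsp0..cr2bsp3 in OptiXDevice::build_bvh(). */
  const float m[4][4] = {
      {+7, -4, +5, -2}, {-2, 11, -4, +1}, {+1, -4, 11, -2}, {-2, +5, -4, +7}};

  float b[4];
  for (int i = 0; i < 4; ++i) {
    b[i] = (m[i][0] * p[0] + m[i][1] * p[1] + m[i][2] * p[2] + m[i][3] * p[3]) / 6.0f;
  }

  /* Both evaluations must agree along the whole segment. */
  for (float t = 0.0f; t <= 1.0f; t += 0.125f) {
    assert(std::fabs(catmull_rom(p[0], p[1], p[2], p[3], t) -
                     bspline(b[0], b[1], b[2], b[3], t)) < 1e-5f);
  }
  printf("cr2bsp conversion reproduces the Catmull-Rom segment\n");
  return 0;
}
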
+
+void OptiXDevice::release_optix_bvh(BVH *bvh)
+{
+ thread_scoped_lock lock(delayed_free_bvh_mutex);
+ /* Do delayed free of BVH memory, since geometry holding BVH might be deleted
+ * while GPU is still rendering. */
+ BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
+
+ delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->as_data));
+ delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->motion_transform_data));
+ bvh_optix->traversable_handle = 0;
+}
+
+void OptiXDevice::free_bvh_memory_delayed()
+{
+ thread_scoped_lock lock(delayed_free_bvh_mutex);
+ delayed_free_bvh_memory.free_memory();
+}
+
+void OptiXDevice::const_copy_to(const char *name, void *host, size_t size)
+{
+ /* Set constant memory for CUDA module. */
+ CUDADevice::const_copy_to(name, host, size);
+
+ if (strcmp(name, "__data") == 0) {
+ assert(size <= sizeof(KernelData));
+
+ /* Update traversable handle (since it is different for each device on multi devices). */
+ KernelData *const data = (KernelData *)host;
+ *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle;
+
+ update_launch_params(offsetof(KernelParamsOptiX, data), host, size);
+ return;
+ }
+
+ /* Update data storage pointers in launch parameters. */
+# define KERNEL_TEX(data_type, tex_name) \
+ if (strcmp(name, #tex_name) == 0) { \
+ update_launch_params(offsetof(KernelParamsOptiX, tex_name), host, size); \
+ return; \
+ }
+ KERNEL_TEX(IntegratorStateGPU, __integrator_state)
+# include "kernel/kernel_textures.h"
+# undef KERNEL_TEX
+}
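
For readers not familiar with the KERNEL_TEX trick used in const_copy_to() above: the macro stamps out one comparison per entry of kernel/kernel_textures.h, so every named data array is routed to the member of the same name in KernelParamsOptiX. Assuming `__prim_index` is one of those entries (illustration only, not part of the patch), a single expansion looks roughly like:

  if (strcmp(name, "__prim_index") == 0) {
    update_launch_params(offsetof(KernelParamsOptiX, __prim_index), host, size);
    return;
  }
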
+
+void OptiXDevice::update_launch_params(size_t offset, void *data, size_t data_size)
+{
+ const CUDAContextScope scope(this);
+
+ cuda_assert(cuMemcpyHtoD(launch_params.device_pointer + offset, data, data_size));
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_OPTIX */
diff --git a/intern/cycles/device/optix/device_impl.h b/intern/cycles/device/optix/device_impl.h
new file mode 100644
index 00000000000..91ef52e0a5a
--- /dev/null
+++ b/intern/cycles/device/optix/device_impl.h
@@ -0,0 +1,186 @@
+/*
+ * Copyright 2019, NVIDIA Corporation.
+ * Copyright 2019, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_OPTIX
+
+# include "device/cuda/device_impl.h"
+# include "device/optix/queue.h"
+# include "device/optix/util.h"
+# include "kernel/kernel_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BVHOptiX;
+struct KernelParamsOptiX;
+
+/* List of OptiX program groups. */
+enum {
+ PG_RGEN_INTERSECT_CLOSEST,
+ PG_RGEN_INTERSECT_SHADOW,
+ PG_RGEN_INTERSECT_SUBSURFACE,
+ PG_RGEN_INTERSECT_VOLUME_STACK,
+ PG_RGEN_SHADE_SURFACE_RAYTRACE,
+ PG_MISS,
+ PG_HITD, /* Default hit group. */
+ PG_HITS, /* __SHADOW_RECORD_ALL__ hit group. */
+ PG_HITL, /* __BVH_LOCAL__ hit group (only used for triangles). */
+ PG_HITD_MOTION,
+ PG_HITS_MOTION,
+ PG_CALL_SVM_AO,
+ PG_CALL_SVM_BEVEL,
+ PG_CALL_AO_PASS,
+ NUM_PROGRAM_GROUPS
+};
+
+static const int MISS_PROGRAM_GROUP_OFFSET = PG_MISS;
+static const int NUM_MIS_PROGRAM_GROUPS = 1;
+static const int HIT_PROGAM_GROUP_OFFSET = PG_HITD;
+static const int NUM_HIT_PROGRAM_GROUPS = 5;
+static const int CALLABLE_PROGRAM_GROUPS_BASE = PG_CALL_SVM_AO;
+static const int NUM_CALLABLE_PROGRAM_GROUPS = 3;
+
+/* List of OptiX pipelines. */
+enum { PIP_SHADE_RAYTRACE, PIP_INTERSECT, NUM_PIPELINES };
+
+/* A single shader binding table entry. */
+struct SbtRecord {
+ char header[OPTIX_SBT_RECORD_HEADER_SIZE];
+};
+
+class OptiXDevice : public CUDADevice {
+ public:
+ OptixDeviceContext context = NULL;
+
+ OptixModule optix_module = NULL; /* All necessary OptiX kernels are in one module. */
+ OptixModule builtin_modules[2] = {};
+ OptixPipeline pipelines[NUM_PIPELINES] = {};
+
+ bool motion_blur = false;
+ device_vector<SbtRecord> sbt_data;
+ device_only_memory<KernelParamsOptiX> launch_params;
+ OptixTraversableHandle tlas_handle = 0;
+
+ vector<device_only_memory<char>> delayed_free_bvh_memory;
+ thread_mutex delayed_free_bvh_mutex;
+
+ class Denoiser {
+ public:
+ explicit Denoiser(OptiXDevice *device);
+ ~Denoiser();
+
+ OptiXDevice *device;
+ OptiXDeviceQueue queue;
+
+ OptixDenoiser optix_denoiser = nullptr;
+
+ /* Configuration size, as provided to `optixDenoiserSetup`.
+ * If `optixDenoiserSetup()` was never called on the current `optix_denoiser`,
+ * `is_configured` will be false. */
+ bool is_configured = false;
+ int2 configured_size = make_int2(0, 0);
+
+ /* OptiX denoiser state and scratch buffers, stored in a single memory buffer.
+ * The memory layout is as follows: [denoiser state][scratch buffer]. */
+ device_only_memory<unsigned char> state;
+ size_t scratch_offset = 0;
+ size_t scratch_size = 0;
+
+ bool use_pass_albedo = false;
+ bool use_pass_normal = false;
+ };
+ Denoiser denoiser_;
+
+ public:
+ OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+ ~OptiXDevice();
+
+ private:
+ BVHLayoutMask get_bvh_layout_mask() const override;
+
+ string compile_kernel_get_common_cflags(const uint kernel_features) override;
+
+ bool load_kernels(const uint kernel_features) override;
+
+ bool build_optix_bvh(BVHOptiX *bvh,
+ OptixBuildOperation operation,
+ const OptixBuildInput &build_input,
+ uint16_t num_motion_steps);
+
+ void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
+
+ void release_optix_bvh(BVH *bvh) override;
+ void free_bvh_memory_delayed();
+
+ void const_copy_to(const char *name, void *host, size_t size) override;
+
+ void update_launch_params(size_t offset, void *data, size_t data_size);
+
+ virtual unique_ptr<DeviceQueue> gpu_queue_create() override;
+
+ /* --------------------------------------------------------------------
+ * Denoising.
+ */
+
+ class DenoiseContext;
+ class DenoisePass;
+
+ virtual bool denoise_buffer(const DeviceDenoiseTask &task) override;
+ virtual DeviceQueue *get_denoise_queue() override;
+
+ /* Read guiding passes from the render buffers, preprocess them in the way which is expected by
+ * OptiX, and store them in the guiding passes memory within the given context.
+ *
+ * Pre-processing of the guiding passes is to only happen once per context lifetime. Do not
+ * preprocess them for every pass which is being denoised. */
+ bool denoise_filter_guiding_preprocess(DenoiseContext &context);
+
+ /* Set fake albedo pixels in the albedo guiding pass storage.
+ * After this point only passes which do not need albedo for denoising can be processed. */
+ bool denoise_filter_guiding_set_fake_albedo(DenoiseContext &context);
+
+ void denoise_pass(DenoiseContext &context, PassType pass_type);
+
+ /* Read input color pass from the render buffer into the memory which corresponds to the noisy
+ * input within the given context. Pixels are scaled to the number of samples, but are not
+ * preprocessed yet. */
+ void denoise_color_read(DenoiseContext &context, const DenoisePass &pass);
+
+ /* Run corresponding filter kernels, preparing data for the denoiser or copying data from the
+ * denoiser result to the render buffer. */
+ bool denoise_filter_color_preprocess(DenoiseContext &context, const DenoisePass &pass);
+ bool denoise_filter_color_postprocess(DenoiseContext &context, const DenoisePass &pass);
+
+ /* Make sure the OptiX denoiser is created and configured. */
+ bool denoise_ensure(DenoiseContext &context);
+
+ /* Create the OptiX denoiser descriptor if needed.
+ * Does nothing if the current OptiX descriptor is usable for the given parameters.
+ * If the OptiX denoiser descriptor was re-allocated here, it is left unconfigured. */
+ bool denoise_create_if_needed(DenoiseContext &context);
+
+ /* Configure the existing OptiX denoiser descriptor for use with the given task. */
+ bool denoise_configure_if_needed(DenoiseContext &context);
+
+ /* Run configured denoiser. */
+ bool denoise_run(DenoiseContext &context, const DenoisePass &pass);
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_OPTIX */
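
Since queue.cpp computes shader binding table record addresses as `PG_* * sizeof(SbtRecord)`, the derived offsets and counts above must stay in sync with the enum order. A compile-time sanity check along these lines would catch accidental reordering (sketch only, not part of the patch):

static_assert(MISS_PROGRAM_GROUP_OFFSET + NUM_MIS_PROGRAM_GROUPS == PG_HITD,
              "Miss program groups must be directly followed by the hit program groups");
static_assert(HIT_PROGAM_GROUP_OFFSET + NUM_HIT_PROGRAM_GROUPS == CALLABLE_PROGRAM_GROUPS_BASE,
              "Hit program groups must be directly followed by the callable program groups");
static_assert(CALLABLE_PROGRAM_GROUPS_BASE + NUM_CALLABLE_PROGRAM_GROUPS == NUM_PROGRAM_GROUPS,
              "Callable program groups must be the last SBT entries");
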
diff --git a/intern/cycles/device/optix/queue.cpp b/intern/cycles/device/optix/queue.cpp
new file mode 100644
index 00000000000..458ed70baa8
--- /dev/null
+++ b/intern/cycles/device/optix/queue.cpp
@@ -0,0 +1,144 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_OPTIX
+
+# include "device/optix/queue.h"
+# include "device/optix/device_impl.h"
+
+# include "util/util_time.h"
+
+# undef __KERNEL_CPU__
+# define __KERNEL_OPTIX__
+# include "kernel/device/optix/globals.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* OptiXDeviceQueue */
+
+OptiXDeviceQueue::OptiXDeviceQueue(OptiXDevice *device) : CUDADeviceQueue(device)
+{
+}
+
+void OptiXDeviceQueue::init_execution()
+{
+ CUDADeviceQueue::init_execution();
+}
+
+static bool is_optix_specific_kernel(DeviceKernel kernel)
+{
+ return (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
+}
+
+bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[])
+{
+ if (!is_optix_specific_kernel(kernel)) {
+ return CUDADeviceQueue::enqueue(kernel, work_size, args);
+ }
+
+ if (cuda_device_->have_error()) {
+ return false;
+ }
+
+ debug_enqueue(kernel, work_size);
+
+ const CUDAContextScope scope(cuda_device_);
+
+ OptiXDevice *const optix_device = static_cast<OptiXDevice *>(cuda_device_);
+
+ const device_ptr sbt_data_ptr = optix_device->sbt_data.device_pointer;
+ const device_ptr launch_params_ptr = optix_device->launch_params.device_pointer;
+
+ cuda_device_assert(
+ cuda_device_,
+ cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, path_index_array),
+ args[0], // &d_path_index
+ sizeof(device_ptr),
+ cuda_stream_));
+
+ if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
+ cuda_device_assert(
+ cuda_device_,
+ cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, render_buffer),
+ args[1], // &d_render_buffer
+ sizeof(device_ptr),
+ cuda_stream_));
+ }
+
+ cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_));
+
+ OptixPipeline pipeline = nullptr;
+ OptixShaderBindingTable sbt_params = {};
+
+ switch (kernel) {
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
+ pipeline = optix_device->pipelines[PIP_SHADE_RAYTRACE];
+ sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE_RAYTRACE * sizeof(SbtRecord);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
+ pipeline = optix_device->pipelines[PIP_INTERSECT];
+ sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_CLOSEST * sizeof(SbtRecord);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
+ pipeline = optix_device->pipelines[PIP_INTERSECT];
+ sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_SHADOW * sizeof(SbtRecord);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
+ pipeline = optix_device->pipelines[PIP_INTERSECT];
+ sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_SUBSURFACE * sizeof(SbtRecord);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK:
+ pipeline = optix_device->pipelines[PIP_INTERSECT];
+ sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_VOLUME_STACK * sizeof(SbtRecord);
+ break;
+
+ default:
+ LOG(ERROR) << "Invalid kernel " << device_kernel_as_string(kernel)
+ << " is attempted to be enqueued.";
+ return false;
+ }
+
+ sbt_params.missRecordBase = sbt_data_ptr + MISS_PROGRAM_GROUP_OFFSET * sizeof(SbtRecord);
+ sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
+ sbt_params.missRecordCount = NUM_MIS_PROGRAM_GROUPS;
+ sbt_params.hitgroupRecordBase = sbt_data_ptr + HIT_PROGAM_GROUP_OFFSET * sizeof(SbtRecord);
+ sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
+ sbt_params.hitgroupRecordCount = NUM_HIT_PROGRAM_GROUPS;
+ sbt_params.callablesRecordBase = sbt_data_ptr + CALLABLE_PROGRAM_GROUPS_BASE * sizeof(SbtRecord);
+ sbt_params.callablesRecordCount = NUM_CALLABLE_PROGRAM_GROUPS;
+ sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord);
+
+ /* Launch the ray generation program. */
+ optix_device_assert(optix_device,
+ optixLaunch(pipeline,
+ cuda_stream_,
+ launch_params_ptr,
+ optix_device->launch_params.data_elements,
+ &sbt_params,
+ work_size,
+ 1,
+ 1));
+
+ return !(optix_device->have_error());
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_OPTIX */
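
A condensed usage sketch of the queue (the function name and device pointers below are assumptions for illustration; in the real code the path tracing work scheduler owns these): the OptiX queue is driven through the generic DeviceQueue interface, and only the kernels listed in is_optix_specific_kernel() take the optixLaunch path, while everything else falls back to the CUDA implementation.

/* Hypothetical call site; `d_path_index` and `d_render_buffer` are device pointers owned by
 * the caller. */
void enqueue_shade_surface_raytrace(DeviceQueue *queue,
                                    device_ptr d_path_index,
                                    device_ptr d_render_buffer,
                                    int work_size)
{
  void *args[] = {&d_path_index, &d_render_buffer, &work_size};
  queue->enqueue(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, work_size, args);
}
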
diff --git a/intern/cycles/device/optix/queue.h b/intern/cycles/device/optix/queue.h
new file mode 100644
index 00000000000..0de422ccc71
--- /dev/null
+++ b/intern/cycles/device/optix/queue.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_OPTIX
+
+# include "device/cuda/queue.h"
+
+CCL_NAMESPACE_BEGIN
+
+class OptiXDevice;
+
+/* Device queue implementation for OptiX, built on top of the CUDA queue. */
+class OptiXDeviceQueue : public CUDADeviceQueue {
+ public:
+ OptiXDeviceQueue(OptiXDevice *device);
+
+ virtual void init_execution() override;
+
+ virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) override;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_OPTIX */
diff --git a/intern/cycles/device/optix/util.h b/intern/cycles/device/optix/util.h
new file mode 100644
index 00000000000..34ae5bb5609
--- /dev/null
+++ b/intern/cycles/device/optix/util.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_OPTIX
+
+# include "device/cuda/util.h"
+
+# ifdef WITH_CUDA_DYNLOAD
+# include <cuew.h>
+// Do not use CUDA SDK headers when using CUEW
+# define OPTIX_DONT_INCLUDE_CUDA
+# endif
+
+# include <optix_stubs.h>
+
+/* Utility for checking return values of OptiX function calls. */
+# define optix_device_assert(optix_device, stmt) \
+ { \
+ OptixResult result = stmt; \
+ if (result != OPTIX_SUCCESS) { \
+ const char *name = optixGetErrorName(result); \
+ optix_device->set_error( \
+ string_printf("%s in %s (%s:%d)", name, #stmt, __FILE__, __LINE__)); \
+ } \
+ } \
+ (void)0
+
+# define optix_assert(stmt) optix_device_assert(this, stmt)
+
+#endif /* WITH_OPTIX */
diff --git a/intern/cycles/graph/node.cpp b/intern/cycles/graph/node.cpp
index 57f25283f85..8294e716ebe 100644
--- a/intern/cycles/graph/node.cpp
+++ b/intern/cycles/graph/node.cpp
@@ -814,7 +814,7 @@ bool Node::socket_is_modified(const SocketType &input) const
return (socket_modified & input.modified_flag_bit) != 0;
}
-bool Node::is_modified()
+bool Node::is_modified() const
{
return socket_modified != 0;
}
diff --git a/intern/cycles/graph/node.h b/intern/cycles/graph/node.h
index aa365baeccd..8f27a82d37b 100644
--- a/intern/cycles/graph/node.h
+++ b/intern/cycles/graph/node.h
@@ -16,6 +16,8 @@
#pragma once
+#include <type_traits>
+
#include "graph/node_type.h"
#include "util/util_array.h"
@@ -34,7 +36,10 @@ struct Transform;
#define NODE_SOCKET_API_BASE_METHODS(type_, name, string_name) \
const SocketType *get_##name##_socket() const \
{ \
- static const SocketType *socket = type->find_input(ustring(string_name)); \
+ /* Explicitly cast to base class to use `Node::type` even if the derived class defines \
+ * `type`. */ \
+ const Node *self_node = this; \
+ static const SocketType *socket = self_node->type->find_input(ustring(string_name)); \
return socket; \
} \
bool name##_is_modified() const \
@@ -111,6 +116,15 @@ struct Node {
void set(const SocketType &input, const Transform &value);
void set(const SocketType &input, Node *value);
+ /* Implicitly cast enums and enum classes to integer, which matches an internal way of how
+ * enumerator values are stored and accessed in a generic API. */
+ template<class ValueType, typename std::enable_if_t<std::is_enum_v<ValueType>> * = nullptr>
+ void set(const SocketType &input, const ValueType &value)
+ {
+ static_assert(sizeof(ValueType) <= sizeof(int), "Enumerator type should fit int");
+ set(input, static_cast<int>(value));
+ }
+
/* set array values. the memory from the input array will taken over
* by the node and the input array will be empty after return */
void set(const SocketType &input, array<bool> &value);
@@ -164,7 +178,7 @@ struct Node {
bool socket_is_modified(const SocketType &input) const;
- bool is_modified();
+ bool is_modified() const;
void tag_modified();
void clear_modified();
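
The templated overload added above lets callers pass enum and enum class values straight to integer sockets without manual casting. A hypothetical usage sketch (the socket name and the enum are invented for illustration; `find_input` and `set` are the existing Node API):

enum class ExampleMode { AUTO = 0, MANUAL = 1 };

void set_mode_example(Node *node)
{
  const SocketType *socket = node->type->find_input(ustring("mode"));
  if (socket) {
    /* Resolves to the templated overload, which forwards static_cast<int>(MANUAL) == 1. */
    node->set(*socket, ExampleMode::MANUAL);
  }
}
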
diff --git a/intern/cycles/integrator/CMakeLists.txt b/intern/cycles/integrator/CMakeLists.txt
new file mode 100644
index 00000000000..bfabd35d7c3
--- /dev/null
+++ b/intern/cycles/integrator/CMakeLists.txt
@@ -0,0 +1,76 @@
+# Copyright 2011-2021 Blender Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set(INC
+ ..
+)
+
+set(SRC
+ adaptive_sampling.cpp
+ denoiser.cpp
+ denoiser_device.cpp
+ denoiser_oidn.cpp
+ denoiser_optix.cpp
+ path_trace.cpp
+ tile.cpp
+ pass_accessor.cpp
+ pass_accessor_cpu.cpp
+ pass_accessor_gpu.cpp
+ path_trace_work.cpp
+ path_trace_work_cpu.cpp
+ path_trace_work_gpu.cpp
+ render_scheduler.cpp
+ shader_eval.cpp
+ work_balancer.cpp
+ work_tile_scheduler.cpp
+)
+
+set(SRC_HEADERS
+ adaptive_sampling.h
+ denoiser.h
+ denoiser_device.h
+ denoiser_oidn.h
+ denoiser_optix.h
+ path_trace.h
+ tile.h
+ pass_accessor.h
+ pass_accessor_cpu.h
+ pass_accessor_gpu.h
+ path_trace_work.h
+ path_trace_work_cpu.h
+ path_trace_work_gpu.h
+ render_scheduler.h
+ shader_eval.h
+ work_balancer.h
+ work_tile_scheduler.h
+)
+
+set(LIB
+ # NOTE: This is required for RenderBuffers access. Might consider moving files around a bit to
+ # avoid such a cyclic dependency.
+ cycles_render
+
+ cycles_util
+)
+
+if(WITH_OPENIMAGEDENOISE)
+ list(APPEND LIB
+ ${OPENIMAGEDENOISE_LIBRARIES}
+ )
+endif()
+
+include_directories(${INC})
+include_directories(SYSTEM ${INC_SYS})
+
+cycles_add_library(cycles_integrator "${LIB}" ${SRC} ${SRC_HEADERS})
diff --git a/intern/cycles/integrator/adaptive_sampling.cpp b/intern/cycles/integrator/adaptive_sampling.cpp
new file mode 100644
index 00000000000..23fbcfea5c2
--- /dev/null
+++ b/intern/cycles/integrator/adaptive_sampling.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/adaptive_sampling.h"
+
+#include "util/util_math.h"
+
+CCL_NAMESPACE_BEGIN
+
+AdaptiveSampling::AdaptiveSampling()
+{
+}
+
+int AdaptiveSampling::align_samples(int start_sample, int num_samples) const
+{
+ if (!use) {
+ return num_samples;
+ }
+
+ /*
+ * The naive implementation goes as follows:
+ *
+ * int count = 1;
+ * while (!need_filter(start_sample + count - 1) && count < num_samples) {
+ * ++count;
+ * }
+ * return count;
+ */
+
+ /* 0-based sample index at which first filtering will happen. */
+ const int first_filter_sample = (min_samples + 1) | (adaptive_step - 1);
+
+ /* Allow as many samples as possible until the first filter sample. */
+ if (start_sample + num_samples <= first_filter_sample) {
+ return num_samples;
+ }
+
+ const int next_filter_sample = max(first_filter_sample, start_sample | (adaptive_step - 1));
+
+ const int num_samples_until_filter = next_filter_sample - start_sample + 1;
+
+ return min(num_samples_until_filter, num_samples);
+}
+
+bool AdaptiveSampling::need_filter(int sample) const
+{
+ if (!use) {
+ return false;
+ }
+
+ if (sample <= min_samples) {
+ return false;
+ }
+
+ return (sample & (adaptive_step - 1)) == (adaptive_step - 1);
+}
+
+CCL_NAMESPACE_END
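
To make the bit arithmetic above concrete, here is a small self-contained sketch (not part of the patch; it assumes `adaptive_step` is a power of two, which is how the scheduler configures it) that reproduces the behaviour of need_filter() and align_samples() for adaptive_step = 4 and min_samples = 5:

#include <algorithm>
#include <cstdio>

int main()
{
  const int adaptive_step = 4; /* Power of two, filter every 4th sample. */
  const int min_samples = 5;

  auto need_filter = [&](int sample) {
    return sample > min_samples && (sample & (adaptive_step - 1)) == (adaptive_step - 1);
  };

  /* First 0-based sample at which filtering happens: (5 + 1) | 3 = 7. */
  const int first_filter_sample = (min_samples + 1) | (adaptive_step - 1);

  auto align_samples = [&](int start_sample, int num_samples) {
    if (start_sample + num_samples <= first_filter_sample) {
      return num_samples;
    }
    const int next_filter_sample = std::max(first_filter_sample,
                                            start_sample | (adaptive_step - 1));
    return std::min(next_filter_sample - start_sample + 1, num_samples);
  };

  printf("align_samples(0, 16) = %d\n", align_samples(0, 16)); /* 8: render samples 0..7. */
  printf("align_samples(8, 16) = %d\n", align_samples(8, 16)); /* 4: render samples 8..11. */

  for (int sample = 0; sample < 16; ++sample) {
    if (need_filter(sample)) {
      printf("filter after sample %d\n", sample); /* Samples 7, 11 and 15. */
    }
  }
  return 0;
}
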
diff --git a/intern/cycles/integrator/adaptive_sampling.h b/intern/cycles/integrator/adaptive_sampling.h
new file mode 100644
index 00000000000..d98edd9894c
--- /dev/null
+++ b/intern/cycles/integrator/adaptive_sampling.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+class AdaptiveSampling {
+ public:
+ AdaptiveSampling();
+
+ /* Align the number of samples so that they align with the adaptive filtering.
+ *
+ * Returns the new value for `num_samples` so that after rendering that many samples on top
+ * of `start_sample`, filtering is required.
+ *
+ * The alignment happens in a way that allows rendering as many samples as possible without
+ * missing any filtering point. This means that the result is "clamped" by the nearest sample
+ * at which filtering is needed. This is part of a mechanism which ensures that all devices
+ * perform the exact same filtering and adaptive sampling, regardless of their performance.
+ *
+ * `start_sample` is the 0-based index of the sample.
+ *
+ * NOTE: The start sample is included in the number of samples to render. This means that
+ * if the number of samples is 1, the path tracer will render sample [start_sample];
+ * if the number of samples is 2, it will render samples [start_sample, start_sample + 1],
+ * and so on. */
+ int align_samples(int start_sample, int num_samples) const;
+
+ /* Check whether the adaptive sampling filter should happen at this sample.
+ * Returns false if adaptive sampling is not used.
+ *
+ * `sample` is the 0-based index of the sample. */
+ bool need_filter(int sample) const;
+
+ bool use = false;
+ int adaptive_step = 0;
+ int min_samples = 0;
+ float threshold = 0.0f;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/denoiser.cpp b/intern/cycles/integrator/denoiser.cpp
new file mode 100644
index 00000000000..598bbd497a5
--- /dev/null
+++ b/intern/cycles/integrator/denoiser.cpp
@@ -0,0 +1,204 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/denoiser.h"
+
+#include "device/device.h"
+#include "integrator/denoiser_oidn.h"
+#include "integrator/denoiser_optix.h"
+#include "render/buffers.h"
+#include "util/util_logging.h"
+#include "util/util_progress.h"
+
+CCL_NAMESPACE_BEGIN
+
+unique_ptr<Denoiser> Denoiser::create(Device *path_trace_device, const DenoiseParams &params)
+{
+ DCHECK(params.use);
+
+ switch (params.type) {
+ case DENOISER_OPTIX:
+ return make_unique<OptiXDenoiser>(path_trace_device, params);
+
+ case DENOISER_OPENIMAGEDENOISE:
+ return make_unique<OIDNDenoiser>(path_trace_device, params);
+
+ case DENOISER_NUM:
+ case DENOISER_NONE:
+ case DENOISER_ALL:
+ /* pass */
+ break;
+ }
+
+ LOG(FATAL) << "Unhandled denoiser type " << params.type << ", should never happen.";
+
+ return nullptr;
+}
+
+Denoiser::Denoiser(Device *path_trace_device, const DenoiseParams &params)
+ : path_trace_device_(path_trace_device), params_(params)
+{
+ DCHECK(params.use);
+}
+
+void Denoiser::set_params(const DenoiseParams &params)
+{
+ DCHECK_EQ(params.type, params_.type);
+
+ if (params.type == params_.type) {
+ params_ = params;
+ }
+ else {
+ LOG(ERROR) << "Attempt to change denoiser type.";
+ }
+}
+
+const DenoiseParams &Denoiser::get_params() const
+{
+ return params_;
+}
+
+bool Denoiser::load_kernels(Progress *progress)
+{
+ const Device *denoiser_device = ensure_denoiser_device(progress);
+
+ if (!denoiser_device) {
+ path_trace_device_->set_error("No device available to denoise on");
+ return false;
+ }
+
+ VLOG(3) << "Will denoise on " << denoiser_device->info.description << " ("
+ << denoiser_device->info.id << ")";
+
+ return true;
+}
+
+Device *Denoiser::get_denoiser_device() const
+{
+ return denoiser_device_;
+}
+
+/* Check whether given device is single (not a MultiDevice) and supports requested denoiser. */
+static bool is_single_supported_device(Device *device, DenoiserType type)
+{
+ if (device->info.type == DEVICE_MULTI) {
+ /* Assume a multi-device is never created with a single sub-device.
+ * If such a configuration is requested, it should be checked at the session level. */
+ return false;
+ }
+
+ if (!device->info.multi_devices.empty()) {
+ /* Some configurations will use multi_devices, but keep the type of an individual device.
+ * This does simplify checks for homogeneous setups, but here we really need a single device. */
+ return false;
+ }
+
+ /* Check the denoiser type is supported. */
+ return (device->info.denoisers & type);
+}
+
+/* Find the most suitable device to perform denoising on. Will iterate over possible sub-devices
+ * of a multi-device.
+ *
+ * If there is no available device which supports the given denoiser type, nullptr is returned. */
+static Device *find_best_device(Device *device, DenoiserType type)
+{
+ Device *best_device = nullptr;
+
+ device->foreach_device([&](Device *sub_device) {
+ if ((sub_device->info.denoisers & type) == 0) {
+ return;
+ }
+ if (!best_device) {
+ best_device = sub_device;
+ }
+ else {
+ /* TODO(sergey): Choose fastest device from available ones. Taking into account performance
+ * of the device and data transfer cost. */
+ }
+ });
+
+ return best_device;
+}
+
+static unique_ptr<Device> create_denoiser_device(Device *path_trace_device,
+ const uint device_type_mask)
+{
+ const vector<DeviceInfo> device_infos = Device::available_devices(device_type_mask);
+ if (device_infos.empty()) {
+ return nullptr;
+ }
+
+ /* TODO(sergey): Use one of the already configured devices, so that OptiX denoising can happen on
+ * a physical CUDA device which is already used for rendering. */
+
+ /* TODO(sergey): Choose fastest device for denoising. */
+
+ const DeviceInfo denoiser_device_info = device_infos.front();
+
+ unique_ptr<Device> denoiser_device(
+ Device::create(denoiser_device_info, path_trace_device->stats, path_trace_device->profiler));
+
+ if (!denoiser_device) {
+ return nullptr;
+ }
+
+ if (denoiser_device->have_error()) {
+ return nullptr;
+ }
+
+ /* Only need denoising feature, everything else is unused. */
+ if (!denoiser_device->load_kernels(KERNEL_FEATURE_DENOISING)) {
+ return nullptr;
+ }
+
+ return denoiser_device;
+}
+
+Device *Denoiser::ensure_denoiser_device(Progress *progress)
+{
+ /* The best device has been found already, avoid sequential lookups.
+ * Additionally, avoid device re-creation if it has failed once. */
+ if (denoiser_device_ || device_creation_attempted_) {
+ return denoiser_device_;
+ }
+
+ /* Simple case: rendering happens on a single device which also supports denoiser. */
+ if (is_single_supported_device(path_trace_device_, params_.type)) {
+ denoiser_device_ = path_trace_device_;
+ return denoiser_device_;
+ }
+
+ /* Find best device from the ones which are already used for rendering. */
+ denoiser_device_ = find_best_device(path_trace_device_, params_.type);
+ if (denoiser_device_) {
+ return denoiser_device_;
+ }
+
+ if (progress) {
+ progress->set_status("Loading denoising kernels (may take a few minutes the first time)");
+ }
+
+ device_creation_attempted_ = true;
+
+ const uint device_type_mask = get_device_type_mask();
+ local_denoiser_device_ = create_denoiser_device(path_trace_device_, device_type_mask);
+ denoiser_device_ = local_denoiser_device_.get();
+
+ return denoiser_device_;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/denoiser.h b/intern/cycles/integrator/denoiser.h
new file mode 100644
index 00000000000..b02bcbeb046
--- /dev/null
+++ b/intern/cycles/integrator/denoiser.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+/* TODO(sergey): The integrator folder might not be the best place. It is easy to move files
+ * around if a better place is figured out. */
+
+#include "device/device.h"
+#include "device/device_denoise.h"
+#include "util/util_function.h"
+#include "util/util_unique_ptr.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BufferParams;
+class Device;
+class RenderBuffers;
+class Progress;
+
+/* Implementation of a specific denoising algorithm.
+ *
+ * This class takes care of breaking a denoising algorithm down into a series of device calls, or
+ * into calls of an external API, to denoise the given input.
+ *
+ * TODO(sergey): Are we better off with a device or a queue here? */
+class Denoiser {
+ public:
+ /* Create denoiser for the given path trace device.
+ *
+ * Notes:
+ * - The denoiser must be configured. This means that `params.use` must be true.
+ * This is checked in debug builds.
+ * - The device might be MultiDevice. */
+ static unique_ptr<Denoiser> create(Device *path_trace_device, const DenoiseParams &params);
+
+ virtual ~Denoiser() = default;
+
+ void set_params(const DenoiseParams &params);
+ const DenoiseParams &get_params() const;
+
+ /* Create devices and load kernels needed for denoising.
+ * The progress is used to communicate state when kernels actually need to be loaded.
+ *
+ * NOTE: The `progress` is an optional argument and can be nullptr. */
+ virtual bool load_kernels(Progress *progress);
+
+ /* Denoise the entire buffer.
+ *
+ * The buffer parameters denote the effective parameters used during rendering. They could
+ * describe a lower-resolution render into a bigger allocated buffer, which is used in the
+ * viewport during navigation and with a non-unit pixel size. Use these instead of
+ * render_buffers->params.
+ *
+ * The buffer might be coming from a "foreign" device, different from the one this denoiser is
+ * created for. This means that in the general case the denoiser will make sure the input data
+ * is available on the denoiser device, perform denoising, and put the data back to the device
+ * where the buffer came from.
+ *
+ * The `num_samples` corresponds to the number of samples in the render buffers. It is used
+ * to scale buffers down to the "final" value in algorithms which don't do automatic exposure,
+ * or which need the "final" value for data passes.
+ *
+ * The `allow_inplace_modification` flag means that the denoiser is allowed to modify the input
+ * passes in-place (e.g. scaling them down). This lowers the memory footprint of the denoiser,
+ * but makes the input passes "invalid" from the path tracer's point of view.
+ *
+ * Returns true when all passes are denoised. Returns false if there is a denoiser error (for
+ * example, caused by a misconfigured denoiser) or when the user requested to cancel rendering. */
+ virtual bool denoise_buffer(const BufferParams &buffer_params,
+ RenderBuffers *render_buffers,
+ const int num_samples,
+ bool allow_inplace_modification) = 0;
+
+ /* Get a device which is used to perform actual denoising.
+ *
+ * Notes:
+ *
+ * - The device is lazily initialized via `load_kernels()`, so it will be nullptr until then.
+ *
+ * - The device can be different from the path tracing device. This happens, for example, when
+ * using the OptiX denoiser and rendering on the CPU.
+ *
+ * - No thread safety is ensured in this call. This means that it is up to the caller to ensure
+ * that there is no threading conflict between the denoising task lazily initializing the
+ * device and accesses to this device. */
+ Device *get_denoiser_device() const;
+
+ function<bool(void)> is_cancelled_cb;
+
+ bool is_cancelled() const
+ {
+ if (!is_cancelled_cb) {
+ return false;
+ }
+ return is_cancelled_cb();
+ }
+
+ protected:
+ Denoiser(Device *path_trace_device, const DenoiseParams &params);
+
+ /* Make sure denoising device is initialized. */
+ virtual Device *ensure_denoiser_device(Progress *progress);
+
+ /* Get device type mask which is used to filter available devices when new device needs to be
+ * created. */
+ virtual uint get_device_type_mask() const = 0;
+
+ Device *path_trace_device_;
+ DenoiseParams params_;
+
+ /* Cached pointer to the device on which denoising will happen.
+ * Used to avoid lookup of a device for every denoising request. */
+ Device *denoiser_device_ = nullptr;
+
+ /* Denoiser device which was created to perform denoising in the case none of the rendering
+ * devices are capable of denoising. */
+ unique_ptr<Device> local_denoiser_device_;
+ bool device_creation_attempted_ = false;
+};
+
+CCL_NAMESPACE_END
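
A condensed usage sketch of this interface (simplified and hypothetical; in the real code the denoiser is owned and driven by the path tracing logic rather than a free function):

void denoise_example(Device *path_trace_device,
                     const DenoiseParams &params,
                     const BufferParams &buffer_params,
                     RenderBuffers *render_buffers,
                     const int num_samples,
                     Progress *progress)
{
  /* `params.use` must be true, as documented for create(). */
  unique_ptr<Denoiser> denoiser = Denoiser::create(path_trace_device, params);

  /* Lazily picks or creates the denoising device and loads its kernels. */
  if (!denoiser->load_kernels(progress)) {
    return; /* The error has been reported on the path trace device. */
  }

  denoiser->denoise_buffer(
      buffer_params, render_buffers, num_samples, /*allow_inplace_modification=*/false);
}
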
diff --git a/intern/cycles/integrator/denoiser_device.cpp b/intern/cycles/integrator/denoiser_device.cpp
new file mode 100644
index 00000000000..e8361c50f2f
--- /dev/null
+++ b/intern/cycles/integrator/denoiser_device.cpp
@@ -0,0 +1,106 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/denoiser_device.h"
+
+#include "device/device.h"
+#include "device/device_denoise.h"
+#include "device/device_memory.h"
+#include "device/device_queue.h"
+#include "render/buffers.h"
+#include "util/util_logging.h"
+#include "util/util_progress.h"
+
+CCL_NAMESPACE_BEGIN
+
+DeviceDenoiser::DeviceDenoiser(Device *path_trace_device, const DenoiseParams &params)
+ : Denoiser(path_trace_device, params)
+{
+}
+
+DeviceDenoiser::~DeviceDenoiser()
+{
+ /* Explicit implementation, to allow forward declaration of Device in the header. */
+}
+
+bool DeviceDenoiser::denoise_buffer(const BufferParams &buffer_params,
+ RenderBuffers *render_buffers,
+ const int num_samples,
+ bool allow_inplace_modification)
+{
+ Device *denoiser_device = get_denoiser_device();
+ if (!denoiser_device) {
+ return false;
+ }
+
+ DeviceDenoiseTask task;
+ task.params = params_;
+ task.num_samples = num_samples;
+ task.buffer_params = buffer_params;
+ task.allow_inplace_modification = allow_inplace_modification;
+
+ RenderBuffers local_render_buffers(denoiser_device);
+ bool local_buffer_used = false;
+
+ if (denoiser_device == render_buffers->buffer.device) {
+ /* The denoiser device is the same device the render buffers are on, so it can use the existing
+ * buffer pointer directly. */
+ local_buffer_used = false;
+ task.render_buffers = render_buffers;
+ }
+ else {
+ VLOG(3) << "Creating temporary buffer on denoiser device.";
+
+ DeviceQueue *queue = denoiser_device->get_denoise_queue();
+
+ /* Create a buffer which is accessible by the device used by the denoiser. */
+
+ /* TODO(sergey): Optimize data transfers. For example, only copy denoising-related passes,
+ * ignoring other light and data passes. */
+
+ local_buffer_used = true;
+
+ render_buffers->copy_from_device();
+
+ local_render_buffers.reset(buffer_params);
+
+ /* NOTE: The local buffer is allocated for the exact effective render size, while the input
+ * render buffer is allocated for the lowest resolution divider possible. So it is important to
+ * only copy the actually needed part of the input buffer. */
+ memcpy(local_render_buffers.buffer.data(),
+ render_buffers->buffer.data(),
+ sizeof(float) * local_render_buffers.buffer.size());
+
+ queue->copy_to_device(local_render_buffers.buffer);
+
+ task.render_buffers = &local_render_buffers;
+ task.allow_inplace_modification = true;
+ }
+
+ const bool denoise_result = denoiser_device->denoise_buffer(task);
+
+ if (local_buffer_used) {
+ local_render_buffers.copy_from_device();
+
+ render_buffers_host_copy_denoised(
+ render_buffers, buffer_params, &local_render_buffers, local_render_buffers.params);
+
+ render_buffers->copy_to_device();
+ }
+
+ return denoise_result;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/denoiser_device.h b/intern/cycles/integrator/denoiser_device.h
new file mode 100644
index 00000000000..0fd934dba79
--- /dev/null
+++ b/intern/cycles/integrator/denoiser_device.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/denoiser.h"
+#include "util/util_unique_ptr.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Denoiser which uses a device-specific denoising implementation, such as the OptiX denoiser,
+ * which is implemented as part of the driver of a specific device.
+ *
+ * This implementation makes sure the to-be-denoised buffer is available on the denoising device
+ * and invokes the denoising kernel via the device API. */
+class DeviceDenoiser : public Denoiser {
+ public:
+ DeviceDenoiser(Device *path_trace_device, const DenoiseParams &params);
+ ~DeviceDenoiser();
+
+ virtual bool denoise_buffer(const BufferParams &buffer_params,
+ RenderBuffers *render_buffers,
+ const int num_samples,
+ bool allow_inplace_modification) override;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/denoiser_oidn.cpp b/intern/cycles/integrator/denoiser_oidn.cpp
new file mode 100644
index 00000000000..7fc2b2b1892
--- /dev/null
+++ b/intern/cycles/integrator/denoiser_oidn.cpp
@@ -0,0 +1,628 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/denoiser_oidn.h"
+
+#include <array>
+
+#include "device/device.h"
+#include "device/device_queue.h"
+#include "integrator/pass_accessor_cpu.h"
+#include "render/buffers.h"
+#include "util/util_array.h"
+#include "util/util_logging.h"
+#include "util/util_openimagedenoise.h"
+
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/kernel.h"
+
+CCL_NAMESPACE_BEGIN
+
+thread_mutex OIDNDenoiser::mutex_;
+
+OIDNDenoiser::OIDNDenoiser(Device *path_trace_device, const DenoiseParams &params)
+ : Denoiser(path_trace_device, params)
+{
+ DCHECK_EQ(params.type, DENOISER_OPENIMAGEDENOISE);
+
+ DCHECK(openimagedenoise_supported()) << "OpenImageDenoiser is not supported on this platform.";
+}
+
+#ifdef WITH_OPENIMAGEDENOISE
+static bool oidn_progress_monitor_function(void *user_ptr, double /*n*/)
+{
+ OIDNDenoiser *oidn_denoiser = reinterpret_cast<OIDNDenoiser *>(user_ptr);
+ return !oidn_denoiser->is_cancelled();
+}
+#endif
+
+#ifdef WITH_OPENIMAGEDENOISE
+
+class OIDNPass {
+ public:
+ OIDNPass() = default;
+
+ OIDNPass(const BufferParams &buffer_params,
+ const char *name,
+ PassType type,
+ PassMode mode = PassMode::NOISY)
+ : name(name), type(type), mode(mode)
+ {
+ offset = buffer_params.get_pass_offset(type, mode);
+ need_scale = (type == PASS_DENOISING_ALBEDO || type == PASS_DENOISING_NORMAL);
+
+ const PassInfo pass_info = Pass::get_info(type);
+ num_components = pass_info.num_components;
+ use_compositing = pass_info.use_compositing;
+ use_denoising_albedo = pass_info.use_denoising_albedo;
+ }
+
+ inline operator bool() const
+ {
+ return name[0] != '\0';
+ }
+
+ /* Name of an image which will be passed to the OIDN library.
+ * Should be one of the following: color, albedo, normal, output.
+ * The albedo and normal images are optional. */
+ const char *name = "";
+
+ PassType type = PASS_NONE;
+ PassMode mode = PassMode::NOISY;
+ int num_components = -1;
+ bool use_compositing = false;
+ bool use_denoising_albedo = true;
+
+ /* Offset of beginning of this pass in the render buffers. */
+ int offset = -1;
+
+ /* Denotes whether the data is to be scaled down with the number of samples.
+ * Is required for the albedo and normal passes. For the color pass OIDN performs auto-exposure,
+ * so scaling is not needed unless adaptive sampling is used.
+ *
+ * NOTE: Do not scale the output pass, as it is required to be a pointer into the original
+ * buffer. All the scaling on the output needed for integration with adaptive sampling will
+ * happen outside of the generic pass handling. */
+ bool need_scale = false;
+
+ /* The content of the pass has been pre-filtered. */
+ bool is_filtered = false;
+
+ /* For the scaled passes, the data which holds values of scaled pixels. */
+ array<float> scaled_buffer;
+};
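+
+/* For example, the beauty pass descriptor used by `OIDNDenoiseContext::denoise_pass()` below is
+ * constructed as
+ *
+ *   OIDNPass oidn_color_pass(buffer_params, "color", pass_type);
+ *
+ * and resolves its buffer offset and component count from the pass metadata. */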
+
+class OIDNDenoiseContext {
+ public:
+ OIDNDenoiseContext(OIDNDenoiser *denoiser,
+ const DenoiseParams &denoise_params,
+ const BufferParams &buffer_params,
+ RenderBuffers *render_buffers,
+ const int num_samples,
+ const bool allow_inplace_modification)
+ : denoiser_(denoiser),
+ denoise_params_(denoise_params),
+ buffer_params_(buffer_params),
+ render_buffers_(render_buffers),
+ num_samples_(num_samples),
+ allow_inplace_modification_(allow_inplace_modification),
+ pass_sample_count_(buffer_params_.get_pass_offset(PASS_SAMPLE_COUNT))
+ {
+ if (denoise_params_.use_pass_albedo) {
+ oidn_albedo_pass_ = OIDNPass(buffer_params_, "albedo", PASS_DENOISING_ALBEDO);
+ }
+
+ if (denoise_params_.use_pass_normal) {
+ oidn_normal_pass_ = OIDNPass(buffer_params_, "normal", PASS_DENOISING_NORMAL);
+ }
+ }
+
+ bool need_denoising() const
+ {
+ if (buffer_params_.width == 0 && buffer_params_.height == 0) {
+ return false;
+ }
+
+ return true;
+ }
+
+ /* Make the guiding passes available for the sequential denoising of the various passes. */
+ void read_guiding_passes()
+ {
+ read_guiding_pass(oidn_albedo_pass_);
+ read_guiding_pass(oidn_normal_pass_);
+ }
+
+ void denoise_pass(const PassType pass_type)
+ {
+ OIDNPass oidn_color_pass(buffer_params_, "color", pass_type);
+ if (oidn_color_pass.offset == PASS_UNUSED) {
+ return;
+ }
+
+ if (oidn_color_pass.use_denoising_albedo) {
+ if (albedo_replaced_with_fake_) {
+ LOG(ERROR) << "Pass which requires albedo is denoised after fake albedo has been set.";
+ return;
+ }
+ }
+
+ OIDNPass oidn_output_pass(buffer_params_, "output", pass_type, PassMode::DENOISED);
+ if (oidn_output_pass.offset == PASS_UNUSED) {
+ LOG(DFATAL) << "Missing denoised pass " << pass_type_as_string(pass_type);
+ return;
+ }
+
+ OIDNPass oidn_color_access_pass = read_input_pass(oidn_color_pass, oidn_output_pass);
+
+ oidn::DeviceRef oidn_device = oidn::newDevice();
+ oidn_device.commit();
+
+ /* Create a filter for denoising a beauty (color) image using prefiltered auxiliary images too.
+ */
+ oidn::FilterRef oidn_filter = oidn_device.newFilter("RT");
+ set_input_pass(oidn_filter, oidn_color_access_pass);
+ set_guiding_passes(oidn_filter, oidn_color_pass);
+ set_output_pass(oidn_filter, oidn_output_pass);
+ oidn_filter.setProgressMonitorFunction(oidn_progress_monitor_function, denoiser_);
+ oidn_filter.set("hdr", true);
+ oidn_filter.set("srgb", false);
+ if (denoise_params_.prefilter == DENOISER_PREFILTER_NONE ||
+ denoise_params_.prefilter == DENOISER_PREFILTER_ACCURATE) {
+ oidn_filter.set("cleanAux", true);
+ }
+ oidn_filter.commit();
+
+ filter_guiding_pass_if_needed(oidn_device, oidn_albedo_pass_);
+ filter_guiding_pass_if_needed(oidn_device, oidn_normal_pass_);
+
+ /* Filter the beauty image. */
+ oidn_filter.execute();
+
+ /* Check for errors. */
+ const char *error_message;
+ const oidn::Error error = oidn_device.getError(error_message);
+ if (error != oidn::Error::None && error != oidn::Error::Cancelled) {
+ LOG(ERROR) << "OpenImageDenoise error: " << error_message;
+ }
+
+ postprocess_output(oidn_color_pass, oidn_output_pass);
+ }
+
+ protected:
+ void filter_guiding_pass_if_needed(oidn::DeviceRef &oidn_device, OIDNPass &oidn_pass)
+ {
+ if (denoise_params_.prefilter != DENOISER_PREFILTER_ACCURATE || !oidn_pass ||
+ oidn_pass.is_filtered) {
+ return;
+ }
+
+ oidn::FilterRef oidn_filter = oidn_device.newFilter("RT");
+ set_pass(oidn_filter, oidn_pass);
+ set_output_pass(oidn_filter, oidn_pass);
+ oidn_filter.commit();
+ oidn_filter.execute();
+
+ oidn_pass.is_filtered = true;
+ }
+
+ /* Make pixels of a guiding pass available to the denoiser. */
+ void read_guiding_pass(OIDNPass &oidn_pass)
+ {
+ if (!oidn_pass) {
+ return;
+ }
+
+ DCHECK(!oidn_pass.use_compositing);
+
+ if (denoise_params_.prefilter != DENOISER_PREFILTER_ACCURATE &&
+ !is_pass_scale_needed(oidn_pass)) {
+ /* Pass data is available as-is from the render buffers. */
+ return;
+ }
+
+ if (allow_inplace_modification_) {
+ scale_pass_in_render_buffers(oidn_pass);
+ return;
+ }
+
+ read_pass_pixels_into_buffer(oidn_pass);
+ }
+
+ /* Special reader of the input pass.
+ * To save memory it will read pixels into the output pass, and let the denoiser perform an
+ * in-place operation. */
+ OIDNPass read_input_pass(OIDNPass &oidn_input_pass, const OIDNPass &oidn_output_pass)
+ {
+ const bool use_compositing = oidn_input_pass.use_compositing;
+
+ /* Simple case: no compositing is involved, no scaling is needed.
+ * The pass pixels will be referenced as-is, without extra processing. */
+ if (!use_compositing && !is_pass_scale_needed(oidn_input_pass)) {
+ return oidn_input_pass;
+ }
+
+ float *buffer_data = render_buffers_->buffer.data();
+ float *pass_data = buffer_data + oidn_output_pass.offset;
+
+ PassAccessor::Destination destination(pass_data, 3);
+ destination.pixel_stride = buffer_params_.pass_stride;
+
+ read_pass_pixels(oidn_input_pass, destination);
+
+ OIDNPass oidn_input_pass_at_output = oidn_input_pass;
+ oidn_input_pass_at_output.offset = oidn_output_pass.offset;
+
+ return oidn_input_pass_at_output;
+ }
+
+ /* Read pass pixels using PassAccessor into the given destination. */
+ void read_pass_pixels(const OIDNPass &oidn_pass, const PassAccessor::Destination &destination)
+ {
+ PassAccessor::PassAccessInfo pass_access_info;
+ pass_access_info.type = oidn_pass.type;
+ pass_access_info.mode = oidn_pass.mode;
+ pass_access_info.offset = oidn_pass.offset;
+
+ /* The denoiser operates on passes which are used to calculate the approximation, and is never
+ * run on the approximation itself. The latter is not even possible because OIDN does not support
+ * denoising of semi-transparent pixels. */
+ pass_access_info.use_approximate_shadow_catcher = false;
+ pass_access_info.use_approximate_shadow_catcher_background = false;
+ pass_access_info.show_active_pixels = false;
+
+ /* OIDN will perform auto-exposure, so it is not required to know the exact exposure configured
+ * by the user. What is important is to use the same exposure for read and write access of the
+ * pass pixels. */
+ const PassAccessorCPU pass_accessor(pass_access_info, 1.0f, num_samples_);
+
+ pass_accessor.get_render_tile_pixels(render_buffers_, buffer_params_, destination);
+ }
+
+ /* Read pass pixels using PassAccessor into a temporary buffer which is owned by the pass. */
+ void read_pass_pixels_into_buffer(OIDNPass &oidn_pass)
+ {
+ VLOG(3) << "Allocating temporary buffer for pass " << oidn_pass.name << " ("
+ << pass_type_as_string(oidn_pass.type) << ")";
+
+ const int64_t width = buffer_params_.width;
+ const int64_t height = buffer_params_.height;
+
+ array<float> &scaled_buffer = oidn_pass.scaled_buffer;
+ scaled_buffer.resize(width * height * 3);
+
+ const PassAccessor::Destination destination(scaled_buffer.data(), 3);
+
+ read_pass_pixels(oidn_pass, destination);
+ }
+
+ /* Set the OIDN image to reference pixels from the given render buffer pass.
+ * No transform of the pixels is done, and no additional memory is used. */
+ void set_pass_referenced(oidn::FilterRef &oidn_filter,
+ const char *name,
+ const OIDNPass &oidn_pass)
+ {
+ const int64_t x = buffer_params_.full_x;
+ const int64_t y = buffer_params_.full_y;
+ const int64_t width = buffer_params_.width;
+ const int64_t height = buffer_params_.height;
+ const int64_t offset = buffer_params_.offset;
+ const int64_t stride = buffer_params_.stride;
+ const int64_t pass_stride = buffer_params_.pass_stride;
+
+ const int64_t pixel_index = offset + x + y * stride;
+ const int64_t buffer_offset = pixel_index * pass_stride;
+
+ float *buffer_data = render_buffers_->buffer.data();
+
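+ /* The trailing `setImage()` arguments are the byte offset into the image, the per-pixel byte
+ * stride and the per-row byte stride of the referenced render buffer memory. */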
+ oidn_filter.setImage(name,
+ buffer_data + buffer_offset + oidn_pass.offset,
+ oidn::Format::Float3,
+ width,
+ height,
+ 0,
+ pass_stride * sizeof(float),
+ stride * pass_stride * sizeof(float));
+ }
+
+ void set_pass_from_buffer(oidn::FilterRef &oidn_filter, const char *name, OIDNPass &oidn_pass)
+ {
+ const int64_t width = buffer_params_.width;
+ const int64_t height = buffer_params_.height;
+
+ oidn_filter.setImage(
+ name, oidn_pass.scaled_buffer.data(), oidn::Format::Float3, width, height, 0, 0, 0);
+ }
+
+ void set_pass(oidn::FilterRef &oidn_filter, OIDNPass &oidn_pass)
+ {
+ set_pass(oidn_filter, oidn_pass.name, oidn_pass);
+ }
+ void set_pass(oidn::FilterRef &oidn_filter, const char *name, OIDNPass &oidn_pass)
+ {
+ if (oidn_pass.scaled_buffer.empty()) {
+ set_pass_referenced(oidn_filter, name, oidn_pass);
+ }
+ else {
+ set_pass_from_buffer(oidn_filter, name, oidn_pass);
+ }
+ }
+
+ void set_input_pass(oidn::FilterRef &oidn_filter, OIDNPass &oidn_pass)
+ {
+ set_pass_referenced(oidn_filter, oidn_pass.name, oidn_pass);
+ }
+
+ void set_guiding_passes(oidn::FilterRef &oidn_filter, OIDNPass &oidn_pass)
+ {
+ if (oidn_albedo_pass_) {
+ if (oidn_pass.use_denoising_albedo) {
+ set_pass(oidn_filter, oidn_albedo_pass_);
+ }
+ else {
+ /* NOTE: The OpenImageDenoise library implicitly expects an albedo pass when a normal pass has
+ * been provided. */
+ set_fake_albedo_pass(oidn_filter);
+ }
+ }
+
+ if (oidn_normal_pass_) {
+ set_pass(oidn_filter, oidn_normal_pass_);
+ }
+ }
+
+ void set_fake_albedo_pass(oidn::FilterRef &oidn_filter)
+ {
+ const int64_t width = buffer_params_.width;
+ const int64_t height = buffer_params_.height;
+
+ if (!albedo_replaced_with_fake_) {
+ const int64_t num_pixel_components = width * height * 3;
+ oidn_albedo_pass_.scaled_buffer.resize(num_pixel_components);
+
+ for (int i = 0; i < num_pixel_components; ++i) {
+ oidn_albedo_pass_.scaled_buffer[i] = 0.5f;
+ }
+
+ albedo_replaced_with_fake_ = true;
+ }
+
+ set_pass(oidn_filter, oidn_albedo_pass_);
+ }
+
+ void set_output_pass(oidn::FilterRef &oidn_filter, OIDNPass &oidn_pass)
+ {
+ set_pass(oidn_filter, "output", oidn_pass);
+ }
+
+ /* Scale the output pass to match the adaptive sampling per-pixel scale, and bring the alpha
+ * channel back. */
+ void postprocess_output(const OIDNPass &oidn_input_pass, const OIDNPass &oidn_output_pass)
+ {
+ kernel_assert(oidn_input_pass.num_components == oidn_output_pass.num_components);
+
+ const int64_t x = buffer_params_.full_x;
+ const int64_t y = buffer_params_.full_y;
+ const int64_t width = buffer_params_.width;
+ const int64_t height = buffer_params_.height;
+ const int64_t offset = buffer_params_.offset;
+ const int64_t stride = buffer_params_.stride;
+ const int64_t pass_stride = buffer_params_.pass_stride;
+ const int64_t row_stride = stride * pass_stride;
+
+ const int64_t pixel_offset = offset + x + y * stride;
+ const int64_t buffer_offset = (pixel_offset * pass_stride);
+
+ float *buffer_data = render_buffers_->buffer.data();
+
+ const bool has_pass_sample_count = (pass_sample_count_ != PASS_UNUSED);
+ const bool need_scale = has_pass_sample_count || oidn_input_pass.use_compositing;
+
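+ /* NOTE: The sample count pass stores an integer bit-pattern in float storage, hence the
+ * `__float_as_uint()` below when reading the per-pixel sample count. */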
+ for (int y = 0; y < height; ++y) {
+ float *buffer_row = buffer_data + buffer_offset + y * row_stride;
+ for (int x = 0; x < width; ++x) {
+ float *buffer_pixel = buffer_row + x * pass_stride;
+ float *denoised_pixel = buffer_pixel + oidn_output_pass.offset;
+
+ if (need_scale) {
+ const float pixel_scale = has_pass_sample_count ?
+ __float_as_uint(buffer_pixel[pass_sample_count_]) :
+ num_samples_;
+
+ denoised_pixel[0] = denoised_pixel[0] * pixel_scale;
+ denoised_pixel[1] = denoised_pixel[1] * pixel_scale;
+ denoised_pixel[2] = denoised_pixel[2] * pixel_scale;
+ }
+
+ if (oidn_output_pass.num_components == 3) {
+ /* Pass without alpha channel. */
+ }
+ else if (!oidn_input_pass.use_compositing) {
+ /* Currently compositing passes are either 3-component (derived by dividing light passes) or do
+ * not have transparency (shadow catcher). Implicitly rely on this, as it simplifies the logic
+ * and avoids extra memory allocation. */
+ const float *noisy_pixel = buffer_pixel + oidn_input_pass.offset;
+ denoised_pixel[3] = noisy_pixel[3];
+ }
+ else {
+ /* Assign zero since this is the default alpha value for 3-component passes, and it is an opaque
+ * pixel for 4-component passes. */
+ denoised_pixel[3] = 0;
+ }
+ }
+ }
+ }
+
+ bool is_pass_scale_needed(OIDNPass &oidn_pass) const
+ {
+ if (pass_sample_count_ != PASS_UNUSED) {
+ /* With adaptive sampling pixels will have different numbers of samples in them, so the pass
+ * always needs to be scaled to make the pixels uniformly sampled. */
+ return true;
+ }
+
+ if (!oidn_pass.need_scale) {
+ return false;
+ }
+
+ if (num_samples_ == 1) {
+ /* Avoid scaling if there is only one sample, to save time (so we don't divide the buffer
+ * by 1). */
+ return false;
+ }
+
+ return true;
+ }
+
+ void scale_pass_in_render_buffers(OIDNPass &oidn_pass)
+ {
+ const int64_t x = buffer_params_.full_x;
+ const int64_t y = buffer_params_.full_y;
+ const int64_t width = buffer_params_.width;
+ const int64_t height = buffer_params_.height;
+ const int64_t offset = buffer_params_.offset;
+ const int64_t stride = buffer_params_.stride;
+ const int64_t pass_stride = buffer_params_.pass_stride;
+ const int64_t row_stride = stride * pass_stride;
+
+ const int64_t pixel_offset = offset + x + y * stride;
+ const int64_t buffer_offset = (pixel_offset * pass_stride);
+
+ float *buffer_data = render_buffers_->buffer.data();
+
+ const bool has_pass_sample_count = (pass_sample_count_ != PASS_UNUSED);
+
+ for (int y = 0; y < height; ++y) {
+ float *buffer_row = buffer_data + buffer_offset + y * row_stride;
+ for (int x = 0; x < width; ++x) {
+ float *buffer_pixel = buffer_row + x * pass_stride;
+ float *pass_pixel = buffer_pixel + oidn_pass.offset;
+
+ const float pixel_scale = 1.0f / (has_pass_sample_count ?
+ __float_as_uint(buffer_pixel[pass_sample_count_]) :
+ num_samples_);
+
+ pass_pixel[0] = pass_pixel[0] * pixel_scale;
+ pass_pixel[1] = pass_pixel[1] * pixel_scale;
+ pass_pixel[2] = pass_pixel[2] * pixel_scale;
+ }
+ }
+ }
+
+ OIDNDenoiser *denoiser_ = nullptr;
+
+ const DenoiseParams &denoise_params_;
+ const BufferParams &buffer_params_;
+ RenderBuffers *render_buffers_ = nullptr;
+ int num_samples_ = 0;
+ bool allow_inplace_modification_ = false;
+ int pass_sample_count_ = PASS_UNUSED;
+
+ /* Optional albedo and normal passes, reused by denoising of different pass types. */
+ OIDNPass oidn_albedo_pass_;
+ OIDNPass oidn_normal_pass_;
+
+ /* For passes which don't need an albedo channel for denoising we replace the actual albedo with
+ * (0.5, 0.5, 0.5). This flag indicates that the real albedo pass has been replaced with the fake
+ * values, and denoising of passes which do need albedo can no longer happen. */
+ bool albedo_replaced_with_fake_ = false;
+};
+#endif
+
+static unique_ptr<DeviceQueue> create_device_queue(const RenderBuffers *render_buffers)
+{
+ Device *device = render_buffers->buffer.device;
+ if (device->info.has_gpu_queue) {
+ return device->gpu_queue_create();
+ }
+ return nullptr;
+}
+
+static void copy_render_buffers_from_device(unique_ptr<DeviceQueue> &queue,
+ RenderBuffers *render_buffers)
+{
+ if (queue) {
+ queue->copy_from_device(render_buffers->buffer);
+ queue->synchronize();
+ }
+ else {
+ render_buffers->copy_from_device();
+ }
+}
+
+static void copy_render_buffers_to_device(unique_ptr<DeviceQueue> &queue,
+ RenderBuffers *render_buffers)
+{
+ if (queue) {
+ queue->copy_to_device(render_buffers->buffer);
+ queue->synchronize();
+ }
+ else {
+ render_buffers->copy_to_device();
+ }
+}
+
+bool OIDNDenoiser::denoise_buffer(const BufferParams &buffer_params,
+ RenderBuffers *render_buffers,
+ const int num_samples,
+ bool allow_inplace_modification)
+{
+ thread_scoped_lock lock(mutex_);
+
+ /* Make sure the host-side data is available for denoising. */
+ unique_ptr<DeviceQueue> queue = create_device_queue(render_buffers);
+ copy_render_buffers_from_device(queue, render_buffers);
+
+#ifdef WITH_OPENIMAGEDENOISE
+ OIDNDenoiseContext context(
+ this, params_, buffer_params, render_buffers, num_samples, allow_inplace_modification);
+
+ if (context.need_denoising()) {
+ context.read_guiding_passes();
+
+ const std::array<PassType, 3> passes = {
+ {/* Passes which will use real albedo when it is available. */
+ PASS_COMBINED,
+ PASS_SHADOW_CATCHER_MATTE,
+
+ /* Passes which do not need albedo, and hence if a real albedo is present it needs to be
+ * replaced with the fake one. */
+ PASS_SHADOW_CATCHER}};
+
+ for (const PassType pass_type : passes) {
+ context.denoise_pass(pass_type);
+ if (is_cancelled()) {
+ return false;
+ }
+ }
+
+ /* TODO: It may be possible to avoid this copy, but we have to ensure that when other code
+ * copies data from the device it doesn't overwrite the denoiser buffers. */
+ copy_render_buffers_to_device(queue, render_buffers);
+ }
+#endif
+
+ /* This code is not supposed to run when compiled without OIDN support, so we can assume that if
+ * we made it this far all passes have been properly denoised. */
+ return true;
+}
+
+uint OIDNDenoiser::get_device_type_mask() const
+{
+ return DEVICE_MASK_CPU;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/denoiser_oidn.h b/intern/cycles/integrator/denoiser_oidn.h
new file mode 100644
index 00000000000..566e761ae79
--- /dev/null
+++ b/intern/cycles/integrator/denoiser_oidn.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/denoiser.h"
+#include "util/util_thread.h"
+#include "util/util_unique_ptr.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Implementation of denoising API which uses OpenImageDenoise library. */
+class OIDNDenoiser : public Denoiser {
+ public:
+ /* Forward-declared state which might be using compile-flag-specific fields, such as
+ * OpenImageDenoise device and filter handles. */
+ class State;
+
+ OIDNDenoiser(Device *path_trace_device, const DenoiseParams &params);
+
+ virtual bool denoise_buffer(const BufferParams &buffer_params,
+ RenderBuffers *render_buffers,
+ const int num_samples,
+ bool allow_inplace_modification) override;
+
+ protected:
+ virtual uint get_device_type_mask() const override;
+
+ /* We only perform one denoising operation at a time, since OpenImageDenoise itself is
+ * multithreaded. Use this mutex whenever images are passed to OIDN and need to be denoised. */
+ static thread_mutex mutex_;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl b/intern/cycles/integrator/denoiser_optix.cpp
index ed64ae01aae..5f9de23bfe6 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
+++ b/intern/cycles/integrator/denoiser_optix.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright 2011-2015 Blender Foundation
+ * Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,13 +14,21 @@
* limitations under the License.
*/
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_direct_lighting.h"
+#include "integrator/denoiser_optix.h"
-#define KERNEL_NAME direct_lighting
-#define LOCALS_TYPE unsigned int
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-#undef LOCALS_TYPE
+#include "device/device.h"
+#include "device/device_denoise.h"
+CCL_NAMESPACE_BEGIN
+
+OptiXDenoiser::OptiXDenoiser(Device *path_trace_device, const DenoiseParams &params)
+ : DeviceDenoiser(path_trace_device, params)
+{
+}
+
+uint OptiXDenoiser::get_device_type_mask() const
+{
+ return DEVICE_MASK_OPTIX;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl b/intern/cycles/integrator/denoiser_optix.h
index c314dc96c33..a8df770ecf7 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
+++ b/intern/cycles/integrator/denoiser_optix.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2011-2015 Blender Foundation
+ * Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,11 +14,18 @@
* limitations under the License.
*/
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_lamp_emission.h"
+#pragma once
-#define KERNEL_NAME lamp_emission
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
+#include "integrator/denoiser_device.h"
+CCL_NAMESPACE_BEGIN
+
+class OptiXDenoiser : public DeviceDenoiser {
+ public:
+ OptiXDenoiser(Device *path_trace_device, const DenoiseParams &params);
+
+ protected:
+ virtual uint get_device_type_mask() const override;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/pass_accessor.cpp b/intern/cycles/integrator/pass_accessor.cpp
new file mode 100644
index 00000000000..87c048b1fa5
--- /dev/null
+++ b/intern/cycles/integrator/pass_accessor.cpp
@@ -0,0 +1,318 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/pass_accessor.h"
+
+#include "render/buffers.h"
+#include "util/util_logging.h"
+
+// clang-format off
+#include "kernel/device/cpu/compat.h"
+#include "kernel/kernel_types.h"
+// clang-format on
+
+CCL_NAMESPACE_BEGIN
+
+/* --------------------------------------------------------------------
+ * Pass input information.
+ */
+
+PassAccessor::PassAccessInfo::PassAccessInfo(const BufferPass &pass)
+ : type(pass.type), mode(pass.mode), include_albedo(pass.include_albedo), offset(pass.offset)
+{
+}
+
+/* --------------------------------------------------------------------
+ * Pass destination.
+ */
+
+PassAccessor::Destination::Destination(float *pixels, int num_components)
+ : pixels(pixels), num_components(num_components)
+{
+}
+
+PassAccessor::Destination::Destination(const PassType pass_type, half4 *pixels)
+ : Destination(pass_type)
+{
+ pixels_half_rgba = pixels;
+}
+
+PassAccessor::Destination::Destination(const PassType pass_type)
+{
+ const PassInfo pass_info = Pass::get_info(pass_type);
+ num_components = pass_info.num_components;
+}
+
+/* --------------------------------------------------------------------
+ * Pass source.
+ */
+
+PassAccessor::Source::Source(const float *pixels, int num_components)
+ : pixels(pixels), num_components(num_components)
+{
+}
+
+/* --------------------------------------------------------------------
+ * Pass accessor.
+ */
+
+PassAccessor::PassAccessor(const PassAccessInfo &pass_access_info, float exposure, int num_samples)
+ : pass_access_info_(pass_access_info), exposure_(exposure), num_samples_(num_samples)
+{
+}
+
+bool PassAccessor::get_render_tile_pixels(const RenderBuffers *render_buffers,
+ const Destination &destination) const
+{
+ if (render_buffers == nullptr || render_buffers->buffer.data() == nullptr) {
+ return false;
+ }
+
+ return get_render_tile_pixels(render_buffers, render_buffers->params, destination);
+}
+
+static void pad_pixels(const BufferParams &buffer_params,
+ const PassAccessor::Destination &destination,
+ const int src_num_components)
+{
+ /* When requesting a single-channel pass as RGBA, or an RGB pass as RGBA,
+ * fill in the additional components for convenience. */
+ const int dest_num_components = destination.num_components;
+
+ if (src_num_components >= dest_num_components) {
+ return;
+ }
+
+ const size_t size = buffer_params.width * buffer_params.height;
+ if (destination.pixels) {
+ float *pixel = destination.pixels;
+
+ for (size_t i = 0; i < size; i++, pixel += dest_num_components) {
+ if (dest_num_components >= 3 && src_num_components == 1) {
+ pixel[1] = pixel[0];
+ pixel[2] = pixel[0];
+ }
+ if (dest_num_components >= 4) {
+ pixel[3] = 1.0f;
+ }
+ }
+ }
+
+ if (destination.pixels_half_rgba) {
+ const half one = float_to_half(1.0f);
+ half4 *pixel = destination.pixels_half_rgba;
+
+ for (size_t i = 0; i < size; i++, pixel++) {
+ if (dest_num_components >= 3 && src_num_components == 1) {
+ pixel[0].y = pixel[0].x;
+ pixel[0].z = pixel[0].x;
+ }
+ if (dest_num_components >= 4) {
+ pixel[0].w = one;
+ }
+ }
+ }
+}
+
+bool PassAccessor::get_render_tile_pixels(const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination) const
+{
+ if (render_buffers == nullptr || render_buffers->buffer.data() == nullptr) {
+ return false;
+ }
+
+ if (pass_access_info_.offset == PASS_UNUSED) {
+ return false;
+ }
+
+ const PassType type = pass_access_info_.type;
+ const PassMode mode = pass_access_info_.mode;
+ const PassInfo pass_info = Pass::get_info(type, pass_access_info_.include_albedo);
+
+ if (pass_info.num_components == 1) {
+ /* Single channel passes. */
+ if (mode == PassMode::DENOISED) {
+ /* Denoised passes store their final pixels, no need for a special calculation. */
+ get_pass_float(render_buffers, buffer_params, destination);
+ }
+ else if (type == PASS_RENDER_TIME) {
+ /* TODO(sergey): Needs implementation. */
+ }
+ else if (type == PASS_DEPTH) {
+ get_pass_depth(render_buffers, buffer_params, destination);
+ }
+ else if (type == PASS_MIST) {
+ get_pass_mist(render_buffers, buffer_params, destination);
+ }
+ else if (type == PASS_SAMPLE_COUNT) {
+ get_pass_sample_count(render_buffers, buffer_params, destination);
+ }
+ else {
+ get_pass_float(render_buffers, buffer_params, destination);
+ }
+ }
+ else if (type == PASS_MOTION) {
+ /* Motion pass. */
+ DCHECK_EQ(destination.num_components, 4) << "Motion pass must have 4 components";
+ get_pass_motion(render_buffers, buffer_params, destination);
+ }
+ else if (type == PASS_CRYPTOMATTE) {
+ /* Cryptomatte pass. */
+ DCHECK_EQ(destination.num_components, 4) << "Cryptomatte pass must have 4 components";
+ get_pass_cryptomatte(render_buffers, buffer_params, destination);
+ }
+ else {
+ /* RGB, RGBA and vector passes. */
+ DCHECK(destination.num_components == 3 || destination.num_components == 4)
+ << pass_type_as_string(type) << " pass must have 3 or 4 components";
+
+ if (type == PASS_SHADOW_CATCHER_MATTE && pass_access_info_.use_approximate_shadow_catcher) {
+ /* The denoised matte with shadow needs a calculation (it will use the denoised shadow catcher
+ * pass to approximate the shadow). */
+ get_pass_shadow_catcher_matte_with_shadow(render_buffers, buffer_params, destination);
+ }
+ else if (type == PASS_SHADOW_CATCHER && mode != PassMode::DENOISED) {
+ /* Shadow catcher pass. */
+ get_pass_shadow_catcher(render_buffers, buffer_params, destination);
+ }
+ else if ((pass_info.divide_type != PASS_NONE || pass_info.direct_type != PASS_NONE ||
+ pass_info.indirect_type != PASS_NONE) &&
+ mode != PassMode::DENOISED) {
+ /* RGB lighting passes that need to divide out color and/or sum direct and indirect. */
+ get_pass_light_path(render_buffers, buffer_params, destination);
+ }
+ else {
+ /* Passes that need no special computation, or denoised passes that already
+ * had the computation done. */
+ if (pass_info.num_components == 3) {
+ get_pass_float3(render_buffers, buffer_params, destination);
+ }
+ else if (pass_info.num_components == 4) {
+ if (destination.num_components == 3) {
+ /* Special case for denoiser access of RGBA passes ignoring alpha channel. */
+ get_pass_float3(render_buffers, buffer_params, destination);
+ }
+ else if (type == PASS_COMBINED || type == PASS_SHADOW_CATCHER ||
+ type == PASS_SHADOW_CATCHER_MATTE) {
+ /* Passes with transparency as 4th component. */
+ get_pass_combined(render_buffers, buffer_params, destination);
+ }
+ else {
+ /* Passes with alpha as 4th component. */
+ get_pass_float4(render_buffers, buffer_params, destination);
+ }
+ }
+ }
+ }
+
+ pad_pixels(buffer_params, destination, pass_info.num_components);
+
+ return true;
+}
+
+void PassAccessor::init_kernel_film_convert(KernelFilmConvert *kfilm_convert,
+ const BufferParams &buffer_params,
+ const Destination &destination) const
+{
+ const PassMode mode = pass_access_info_.mode;
+ const PassInfo &pass_info = Pass::get_info(pass_access_info_.type,
+ pass_access_info_.include_albedo);
+
+ kfilm_convert->pass_offset = pass_access_info_.offset;
+ kfilm_convert->pass_stride = buffer_params.pass_stride;
+
+ kfilm_convert->pass_use_exposure = pass_info.use_exposure;
+ kfilm_convert->pass_use_filter = pass_info.use_filter;
+
+ /* TODO(sergey): Some of the passes need to become denoised when the denoised pass is
+ * accessed. */
+ if (pass_info.direct_type != PASS_NONE) {
+ kfilm_convert->pass_offset = buffer_params.get_pass_offset(pass_info.direct_type);
+ }
+ kfilm_convert->pass_indirect = buffer_params.get_pass_offset(pass_info.indirect_type);
+ kfilm_convert->pass_divide = buffer_params.get_pass_offset(pass_info.divide_type);
+
+ kfilm_convert->pass_combined = buffer_params.get_pass_offset(PASS_COMBINED);
+ kfilm_convert->pass_sample_count = buffer_params.get_pass_offset(PASS_SAMPLE_COUNT);
+ kfilm_convert->pass_adaptive_aux_buffer = buffer_params.get_pass_offset(
+ PASS_ADAPTIVE_AUX_BUFFER);
+ kfilm_convert->pass_motion_weight = buffer_params.get_pass_offset(PASS_MOTION_WEIGHT);
+ kfilm_convert->pass_shadow_catcher = buffer_params.get_pass_offset(PASS_SHADOW_CATCHER, mode);
+ kfilm_convert->pass_shadow_catcher_sample_count = buffer_params.get_pass_offset(
+ PASS_SHADOW_CATCHER_SAMPLE_COUNT);
+ kfilm_convert->pass_shadow_catcher_matte = buffer_params.get_pass_offset(
+ PASS_SHADOW_CATCHER_MATTE, mode);
+
+ /* Background is not denoised, so always use noisy pass. */
+ kfilm_convert->pass_background = buffer_params.get_pass_offset(PASS_BACKGROUND);
+
+ if (pass_info.use_filter) {
+ kfilm_convert->scale = num_samples_ != 0 ? 1.0f / num_samples_ : 0.0f;
+ }
+ else {
+ kfilm_convert->scale = 1.0f;
+ }
+
+ if (pass_info.use_exposure) {
+ kfilm_convert->exposure = exposure_;
+ }
+ else {
+ kfilm_convert->exposure = 1.0f;
+ }
+
+ kfilm_convert->scale_exposure = kfilm_convert->scale * kfilm_convert->exposure;
+
+ kfilm_convert->use_approximate_shadow_catcher = pass_access_info_.use_approximate_shadow_catcher;
+ kfilm_convert->use_approximate_shadow_catcher_background =
+ pass_access_info_.use_approximate_shadow_catcher_background;
+ kfilm_convert->show_active_pixels = pass_access_info_.show_active_pixels;
+
+ kfilm_convert->num_components = destination.num_components;
+ kfilm_convert->pixel_stride = destination.pixel_stride ? destination.pixel_stride :
+ destination.num_components;
+
+ kfilm_convert->is_denoised = (mode == PassMode::DENOISED);
+}
+
+bool PassAccessor::set_render_tile_pixels(RenderBuffers *render_buffers, const Source &source)
+{
+ if (render_buffers == nullptr || render_buffers->buffer.data() == nullptr) {
+ return false;
+ }
+
+ const PassInfo pass_info = Pass::get_info(pass_access_info_.type,
+ pass_access_info_.include_albedo);
+
+ const BufferParams &buffer_params = render_buffers->params;
+
+ float *buffer_data = render_buffers->buffer.data();
+ const int size = buffer_params.width * buffer_params.height;
+
+ const int out_stride = buffer_params.pass_stride;
+ const int in_stride = source.num_components;
+ const int num_components_to_copy = min(source.num_components, pass_info.num_components);
+
+ float *out = buffer_data + pass_access_info_.offset;
+ const float *in = source.pixels + source.offset * in_stride;
+
+ for (int i = 0; i < size; i++, out += out_stride, in += in_stride) {
+ memcpy(out, in, sizeof(float) * num_components_to_copy);
+ }
+
+ return true;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/pass_accessor.h b/intern/cycles/integrator/pass_accessor.h
new file mode 100644
index 00000000000..624bf7d0b2c
--- /dev/null
+++ b/intern/cycles/integrator/pass_accessor.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "render/pass.h"
+#include "util/util_half.h"
+#include "util/util_string.h"
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+class RenderBuffers;
+class BufferPass;
+class BufferParams;
+struct KernelFilmConvert;
+
+/* Helper class which provides access to pass data.
+ * It is designed to be created once when the pass data is known, and then pixels get
+ * progressively updated from various render buffers. */
+class PassAccessor {
+ public:
+ class PassAccessInfo {
+ public:
+ PassAccessInfo() = default;
+ explicit PassAccessInfo(const BufferPass &pass);
+
+ PassType type = PASS_NONE;
+ PassMode mode = PassMode::NOISY;
+ bool include_albedo = false;
+ int offset = -1;
+
+ /* For the shadow catcher matte pass: whether to approximate the shadow catcher pass into its
+ * matte pass, so that both artificial objects and shadows can be alpha-overed onto a backdrop. */
+ bool use_approximate_shadow_catcher = false;
+
+ /* When the approximate shadow catcher matte is used, alpha-over the result on top of the
+ * background. */
+ bool use_approximate_shadow_catcher_background = false;
+
+ bool show_active_pixels = false;
+ };
+
+ class Destination {
+ public:
+ Destination() = default;
+ Destination(float *pixels, int num_components);
+ Destination(const PassType pass_type, half4 *pixels);
+
+ /* Destination will be initialized with the number of components which is native for the given
+ * pass type. */
+ explicit Destination(const PassType pass_type);
+
+ /* CPU-side pointers. Only usable by the `PassAccessorCPU`. */
+ float *pixels = nullptr;
+ half4 *pixels_half_rgba = nullptr;
+
+ /* Device-side pointers. */
+ device_ptr d_pixels = 0;
+ device_ptr d_pixels_half_rgba = 0;
+
+ /* Number of components per pixel in the floating-point destination.
+ * Is ignored for half4 destination (where number of components is implied to be 4). */
+ int num_components = 0;
+
+ /* Offset in pixels from the beginning of the pixels storage.
+ * Allows writing pixels of the render buffer into a partial slice of the destination. */
+ int offset = 0;
+
+ /* Number of floats per pixel. When zero it is the same as `num_components`.
+ *
+ * NOTE: Is ignored for half4 destination, as the half4 pixels are always 4-component
+ * half-floats. */
+ int pixel_stride = 0;
+
+ /* Row stride in pixel elements:
+ * - For the float destination the stride is the number of floats per row.
+ * - For the half4 destination the stride is the number of half4 elements per row. */
+ };
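+
+ /* For example, reading a pass into a tightly packed float RGBA buffer could be set up as
+ *
+ *   PassAccessor::Destination destination(pixels, 4);
+ *
+ * while a half-float display buffer would use the `(pass_type, half4 *)` constructor instead. */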
+
+ class Source {
+ public:
+ Source() = default;
+ Source(const float *pixels, int num_components);
+
+ /* CPU-side pointers. Only usable by the `PassAccessorCPU`. */
+ const float *pixels = nullptr;
+ int num_components = 0;
+
+ /* Offset in pixels from the beginning of the pixels storage.
+ * Allows reading pixels from a partial slice of the source storage. */
+ int offset = 0;
+ };
+
+ PassAccessor(const PassAccessInfo &pass_access_info, float exposure, int num_samples);
+
+ virtual ~PassAccessor() = default;
+
+ /* Get pass data from the given render buffers, perform the needed filtering, and store the
+ * result into the pixels.
+ * The result is stored sequentially starting from the very beginning of the pixels memory. */
+ bool get_render_tile_pixels(const RenderBuffers *render_buffers,
+ const Destination &destination) const;
+ bool get_render_tile_pixels(const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination) const;
+ /* Set pass data for the given render buffers. Used for baking to read from passes. */
+ bool set_render_tile_pixels(RenderBuffers *render_buffers, const Source &source);
+
+ protected:
+ virtual void init_kernel_film_convert(KernelFilmConvert *kfilm_convert,
+ const BufferParams &buffer_params,
+ const Destination &destination) const;
+
+#define DECLARE_PASS_ACCESSOR(pass) \
+ virtual void get_pass_##pass(const RenderBuffers *render_buffers, \
+ const BufferParams &buffer_params, \
+ const Destination &destination) const = 0;
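+
+/* For example, `DECLARE_PASS_ACCESSOR(depth)` declares
+ *
+ *   virtual void get_pass_depth(const RenderBuffers *render_buffers,
+ *                               const BufferParams &buffer_params,
+ *                               const Destination &destination) const = 0;
+ */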
+
+ /* Float (scalar) passes. */
+ DECLARE_PASS_ACCESSOR(depth)
+ DECLARE_PASS_ACCESSOR(mist)
+ DECLARE_PASS_ACCESSOR(sample_count)
+ DECLARE_PASS_ACCESSOR(float)
+
+ /* Float3 passes. */
+ DECLARE_PASS_ACCESSOR(light_path)
+ DECLARE_PASS_ACCESSOR(shadow_catcher)
+ DECLARE_PASS_ACCESSOR(float3)
+
+ /* Float4 passes. */
+ DECLARE_PASS_ACCESSOR(motion)
+ DECLARE_PASS_ACCESSOR(cryptomatte)
+ DECLARE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow)
+ DECLARE_PASS_ACCESSOR(combined)
+ DECLARE_PASS_ACCESSOR(float4)
+
+#undef DECLARE_PASS_ACCESSOR
+
+ PassAccessInfo pass_access_info_;
+
+ float exposure_ = 0.0f;
+ int num_samples_ = 0;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/pass_accessor_cpu.cpp b/intern/cycles/integrator/pass_accessor_cpu.cpp
new file mode 100644
index 00000000000..3c6691f6d43
--- /dev/null
+++ b/intern/cycles/integrator/pass_accessor_cpu.cpp
@@ -0,0 +1,183 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/pass_accessor_cpu.h"
+
+#include "render/buffers.h"
+#include "util/util_logging.h"
+#include "util/util_tbb.h"
+
+// clang-format off
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/globals.h"
+#include "kernel/kernel_types.h"
+#include "kernel/kernel_film.h"
+// clang-format on
+
+CCL_NAMESPACE_BEGIN
+
+/* --------------------------------------------------------------------
+ * Kernel processing.
+ */
+
+template<typename Processor>
+inline void PassAccessorCPU::run_get_pass_kernel_processor(const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination,
+ const Processor &processor) const
+{
+ KernelFilmConvert kfilm_convert;
+ init_kernel_film_convert(&kfilm_convert, buffer_params, destination);
+
+ if (destination.pixels) {
+ /* NOTE: No overlays are applied since they are not used for final renders.
+ * Can be supported via some sort of specialization to avoid code duplication. */
+
+ run_get_pass_kernel_processor_float(
+ &kfilm_convert, render_buffers, buffer_params, destination, processor);
+ }
+
+ if (destination.pixels_half_rgba) {
+ /* TODO(sergey): Consider adding specialization to avoid per-pixel overlay check. */
+
+ if (destination.num_components == 1) {
+ run_get_pass_kernel_processor_half_rgba(&kfilm_convert,
+ render_buffers,
+ buffer_params,
+ destination,
+ [&processor](const KernelFilmConvert *kfilm_convert,
+ ccl_global const float *buffer,
+ float *pixel_rgba) {
+ float pixel;
+ processor(kfilm_convert, buffer, &pixel);
+
+ pixel_rgba[0] = pixel;
+ pixel_rgba[1] = pixel;
+ pixel_rgba[2] = pixel;
+ pixel_rgba[3] = 1.0f;
+ });
+ }
+ else if (destination.num_components == 3) {
+ run_get_pass_kernel_processor_half_rgba(&kfilm_convert,
+ render_buffers,
+ buffer_params,
+ destination,
+ [&processor](const KernelFilmConvert *kfilm_convert,
+ ccl_global const float *buffer,
+ float *pixel_rgba) {
+ processor(kfilm_convert, buffer, pixel_rgba);
+ pixel_rgba[3] = 1.0f;
+ });
+ }
+ else if (destination.num_components == 4) {
+ run_get_pass_kernel_processor_half_rgba(
+ &kfilm_convert, render_buffers, buffer_params, destination, processor);
+ }
+ }
+}
+
+template<typename Processor>
+inline void PassAccessorCPU::run_get_pass_kernel_processor_float(
+ const KernelFilmConvert *kfilm_convert,
+ const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination,
+ const Processor &processor) const
+{
+ DCHECK_EQ(destination.stride, 0) << "Custom stride for float destination is not implemented.";
+
+ const float *buffer_data = render_buffers->buffer.data();
+ const int pixel_stride = destination.pixel_stride ? destination.pixel_stride :
+ destination.num_components;
+
+ tbb::parallel_for(0, buffer_params.height, [&](int64_t y) {
+ int64_t pixel_index = y * buffer_params.width;
+ for (int64_t x = 0; x < buffer_params.width; ++x, ++pixel_index) {
+ const int64_t input_pixel_offset = pixel_index * buffer_params.pass_stride;
+ const float *buffer = buffer_data + input_pixel_offset;
+ float *pixel = destination.pixels + (pixel_index + destination.offset) * pixel_stride;
+
+ processor(kfilm_convert, buffer, pixel);
+ }
+ });
+}
+
+template<typename Processor>
+inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba(
+ const KernelFilmConvert *kfilm_convert,
+ const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination,
+ const Processor &processor) const
+{
+ const float *buffer_data = render_buffers->buffer.data();
+
+ half4 *dst_start = destination.pixels_half_rgba + destination.offset;
+ const int destination_stride = destination.stride != 0 ? destination.stride :
+ buffer_params.width;
+
+ tbb::parallel_for(0, buffer_params.height, [&](int64_t y) {
+ int64_t pixel_index = y * buffer_params.width;
+ half4 *dst_row_start = dst_start + y * destination_stride;
+ for (int64_t x = 0; x < buffer_params.width; ++x, ++pixel_index) {
+ const int64_t input_pixel_offset = pixel_index * buffer_params.pass_stride;
+ const float *buffer = buffer_data + input_pixel_offset;
+
+ float pixel[4];
+ processor(kfilm_convert, buffer, pixel);
+
+ film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel);
+
+ half4 *pixel_half_rgba = dst_row_start + x;
+ float4_store_half(&pixel_half_rgba->x, make_float4(pixel[0], pixel[1], pixel[2], pixel[3]));
+ }
+ });
+}
+
+/* --------------------------------------------------------------------
+ * Pass accessors.
+ */
+
+#define DEFINE_PASS_ACCESSOR(pass) \
+ void PassAccessorCPU::get_pass_##pass(const RenderBuffers *render_buffers, \
+ const BufferParams &buffer_params, \
+ const Destination &destination) const \
+ { \
+ run_get_pass_kernel_processor( \
+ render_buffers, buffer_params, destination, film_get_pass_pixel_##pass); \
+ }
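+
+/* For example, `DEFINE_PASS_ACCESSOR(depth)` generates `get_pass_depth()` which forwards to
+ * `run_get_pass_kernel_processor()` with `film_get_pass_pixel_depth` as the per-pixel processor. */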
+
+/* Float (scalar) passes. */
+DEFINE_PASS_ACCESSOR(depth)
+DEFINE_PASS_ACCESSOR(mist)
+DEFINE_PASS_ACCESSOR(sample_count)
+DEFINE_PASS_ACCESSOR(float)
+
+/* Float3 passes. */
+DEFINE_PASS_ACCESSOR(light_path)
+DEFINE_PASS_ACCESSOR(shadow_catcher)
+DEFINE_PASS_ACCESSOR(float3)
+
+/* Float4 passes. */
+DEFINE_PASS_ACCESSOR(motion)
+DEFINE_PASS_ACCESSOR(cryptomatte)
+DEFINE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow)
+DEFINE_PASS_ACCESSOR(combined)
+DEFINE_PASS_ACCESSOR(float4)
+
+#undef DEFINE_PASS_ACCESSOR
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/pass_accessor_cpu.h b/intern/cycles/integrator/pass_accessor_cpu.h
new file mode 100644
index 00000000000..0313dc5bb0d
--- /dev/null
+++ b/intern/cycles/integrator/pass_accessor_cpu.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/pass_accessor.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct KernelFilmConvert;
+
+/* Pass accessor implementation for CPU side. */
+class PassAccessorCPU : public PassAccessor {
+ public:
+ using PassAccessor::PassAccessor;
+
+ protected:
+ template<typename Processor>
+ inline void run_get_pass_kernel_processor(const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination,
+ const Processor &processor) const;
+
+ template<typename Processor>
+ inline void run_get_pass_kernel_processor_float(const KernelFilmConvert *kfilm_convert,
+ const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination,
+ const Processor &processor) const;
+
+ template<typename Processor>
+ inline void run_get_pass_kernel_processor_half_rgba(const KernelFilmConvert *kfilm_convert,
+ const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination,
+ const Processor &processor) const;
+
+#define DECLARE_PASS_ACCESSOR(pass) \
+ virtual void get_pass_##pass(const RenderBuffers *render_buffers, \
+ const BufferParams &buffer_params, \
+ const Destination &destination) const override;
+
+ /* Float (scalar) passes. */
+ DECLARE_PASS_ACCESSOR(depth)
+ DECLARE_PASS_ACCESSOR(mist)
+ DECLARE_PASS_ACCESSOR(sample_count)
+ DECLARE_PASS_ACCESSOR(float)
+
+ /* Float3 passes. */
+ DECLARE_PASS_ACCESSOR(light_path)
+ DECLARE_PASS_ACCESSOR(shadow_catcher)
+ DECLARE_PASS_ACCESSOR(float3)
+
+ /* Float4 passes. */
+ DECLARE_PASS_ACCESSOR(motion)
+ DECLARE_PASS_ACCESSOR(cryptomatte)
+ DECLARE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow)
+ DECLARE_PASS_ACCESSOR(combined)
+ DECLARE_PASS_ACCESSOR(float4)
+
+#undef DECLARE_PASS_ACCESSOR
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/pass_accessor_gpu.cpp b/intern/cycles/integrator/pass_accessor_gpu.cpp
new file mode 100644
index 00000000000..eb80ba99655
--- /dev/null
+++ b/intern/cycles/integrator/pass_accessor_gpu.cpp
@@ -0,0 +1,118 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/pass_accessor_gpu.h"
+
+#include "device/device_queue.h"
+#include "render/buffers.h"
+#include "util/util_logging.h"
+
+CCL_NAMESPACE_BEGIN
+
+PassAccessorGPU::PassAccessorGPU(DeviceQueue *queue,
+ const PassAccessInfo &pass_access_info,
+ float exposure,
+ int num_samples)
+ : PassAccessor(pass_access_info, exposure, num_samples), queue_(queue)
+
+{
+}
+
+/* --------------------------------------------------------------------
+ * Kernel execution.
+ */
+
+void PassAccessorGPU::run_film_convert_kernels(DeviceKernel kernel,
+ const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination) const
+{
+ KernelFilmConvert kfilm_convert;
+ init_kernel_film_convert(&kfilm_convert, buffer_params, destination);
+
+ const int work_size = buffer_params.width * buffer_params.height;
+
+ const int destination_stride = destination.stride != 0 ? destination.stride :
+ buffer_params.width;
+
+ if (destination.d_pixels) {
+ DCHECK_EQ(destination.stride, 0) << "Custom stride for float destination is not implemented.";
+
+ void *args[] = {const_cast<KernelFilmConvert *>(&kfilm_convert),
+ const_cast<device_ptr *>(&destination.d_pixels),
+ const_cast<device_ptr *>(&render_buffers->buffer.device_pointer),
+ const_cast<int *>(&work_size),
+ const_cast<int *>(&buffer_params.width),
+ const_cast<int *>(&buffer_params.offset),
+ const_cast<int *>(&buffer_params.stride),
+ const_cast<int *>(&destination.offset),
+ const_cast<int *>(&destination_stride)};
+
+ queue_->enqueue(kernel, work_size, args);
+ }
+ if (destination.d_pixels_half_rgba) {
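+ /* NOTE: This relies on the half-float variant of each film convert kernel being enumerated
+ * directly after its float counterpart in `DeviceKernel`. */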
+ const DeviceKernel kernel_half_float = static_cast<DeviceKernel>(kernel + 1);
+
+ void *args[] = {const_cast<KernelFilmConvert *>(&kfilm_convert),
+ const_cast<device_ptr *>(&destination.d_pixels_half_rgba),
+ const_cast<device_ptr *>(&render_buffers->buffer.device_pointer),
+ const_cast<int *>(&work_size),
+ const_cast<int *>(&buffer_params.width),
+ const_cast<int *>(&buffer_params.offset),
+ const_cast<int *>(&buffer_params.stride),
+ const_cast<int *>(&destination.offset),
+ const_cast<int *>(&destination_stride)};
+
+ queue_->enqueue(kernel_half_float, work_size, args);
+ }
+
+ queue_->synchronize();
+}
+
+/* --------------------------------------------------------------------
+ * Pass accessors.
+ */
+
+#define DEFINE_PASS_ACCESSOR(pass, kernel_pass) \
+ void PassAccessorGPU::get_pass_##pass(const RenderBuffers *render_buffers, \
+ const BufferParams &buffer_params, \
+ const Destination &destination) const \
+ { \
+ run_film_convert_kernels( \
+ DEVICE_KERNEL_FILM_CONVERT_##kernel_pass, render_buffers, buffer_params, destination); \
+ }
+
+/* Float (scalar) passes. */
+DEFINE_PASS_ACCESSOR(depth, DEPTH);
+DEFINE_PASS_ACCESSOR(mist, MIST);
+DEFINE_PASS_ACCESSOR(sample_count, SAMPLE_COUNT);
+DEFINE_PASS_ACCESSOR(float, FLOAT);
+
+/* Float3 passes. */
+DEFINE_PASS_ACCESSOR(light_path, LIGHT_PATH);
+DEFINE_PASS_ACCESSOR(float3, FLOAT3);
+
+/* Float4 passes. */
+DEFINE_PASS_ACCESSOR(motion, MOTION);
+DEFINE_PASS_ACCESSOR(cryptomatte, CRYPTOMATTE);
+DEFINE_PASS_ACCESSOR(shadow_catcher, SHADOW_CATCHER);
+DEFINE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow, SHADOW_CATCHER_MATTE_WITH_SHADOW);
+DEFINE_PASS_ACCESSOR(combined, COMBINED);
+DEFINE_PASS_ACCESSOR(float4, FLOAT4);
+
+#undef DEFINE_PASS_ACCESSOR
+
+CCL_NAMESPACE_END
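
The launch above marshals the film-convert kernel parameters as an array of untyped pointers, one per kernel argument in declaration order, and obtains the half-float variant of each kernel by taking the next entry in the DeviceKernel enumeration (`kernel + 1`). A minimal standalone sketch of that argument-marshaling pattern follows; the `enqueue` function and parameter names are illustrative stand-ins, not the Cycles DeviceQueue API.

    #include <cstdio>

    /* Illustrative stand-in for a queue launch: a real driver API would forward
     * the pointers to the device kernel in declaration order. */
    static void enqueue(const char *kernel, int work_size, void **args, int num_args)
    {
      std::printf("launch %s over %d items with %d arguments\n", kernel, work_size, num_args);
      (void)args;
    }

    int main()
    {
      struct KernelFilmConvert { float exposure; } kfilm_convert = {1.0f};
      unsigned long long d_pixels = 0; /* device pointer handle */
      int width = 1920, height = 1080;
      int work_size = width * height;

      /* One pointer per kernel parameter, in the order the kernel declares them. */
      void *args[] = {&kfilm_convert, &d_pixels, &work_size, &width};

      enqueue("film_convert_combined", work_size, args, 4);
      return 0;
    }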
diff --git a/intern/cycles/integrator/pass_accessor_gpu.h b/intern/cycles/integrator/pass_accessor_gpu.h
new file mode 100644
index 00000000000..bc37e4387f3
--- /dev/null
+++ b/intern/cycles/integrator/pass_accessor_gpu.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/pass_accessor.h"
+#include "kernel/kernel_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+class DeviceQueue;
+
+/* Pass accessor implementation for GPU side. */
+class PassAccessorGPU : public PassAccessor {
+ public:
+ PassAccessorGPU(DeviceQueue *queue,
+ const PassAccessInfo &pass_access_info,
+ float exposure,
+ int num_samples);
+
+ protected:
+ void run_film_convert_kernels(DeviceKernel kernel,
+ const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination) const;
+
+#define DECLARE_PASS_ACCESSOR(pass) \
+ virtual void get_pass_##pass(const RenderBuffers *render_buffers, \
+ const BufferParams &buffer_params, \
+ const Destination &destination) const override;
+
+ /* Float (scalar) passes. */
+ DECLARE_PASS_ACCESSOR(depth);
+ DECLARE_PASS_ACCESSOR(mist);
+ DECLARE_PASS_ACCESSOR(sample_count);
+ DECLARE_PASS_ACCESSOR(float);
+
+ /* Float3 passes. */
+ DECLARE_PASS_ACCESSOR(light_path);
+ DECLARE_PASS_ACCESSOR(float3);
+
+ /* Float4 passes. */
+ DECLARE_PASS_ACCESSOR(motion);
+ DECLARE_PASS_ACCESSOR(cryptomatte);
+ DECLARE_PASS_ACCESSOR(shadow_catcher);
+ DECLARE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow);
+ DECLARE_PASS_ACCESSOR(combined);
+ DECLARE_PASS_ACCESSOR(float4);
+
+#undef DECLARE_PASS_ACCESSOR
+
+ DeviceQueue *queue_;
+};
+
+CCL_NAMESPACE_END
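
The DECLARE_PASS_ACCESSOR / DEFINE_PASS_ACCESSOR macro pair above keeps each per-pass entry point down to a single line: the header stamps out the virtual overrides, and the implementation stamps out bodies that all forward to run_film_convert_kernels() with the matching kernel enum. A self-contained sketch of the same pattern, with illustrative names rather than the Cycles types:

    #include <cstdio>

    enum Kernel { KERNEL_DEPTH, KERNEL_COMBINED };

    struct PassAccessorBase {
      virtual ~PassAccessorBase() = default;
    #define DECLARE_ACCESSOR(pass) virtual void get_pass_##pass() const = 0;
      DECLARE_ACCESSOR(depth)
      DECLARE_ACCESSOR(combined)
    #undef DECLARE_ACCESSOR
    };

    struct PassAccessorDevice : public PassAccessorBase {
      void run_kernel(Kernel kernel) const
      {
        std::printf("dispatch kernel %d\n", int(kernel));
      }

    /* Every accessor expands to a one-line forward to the shared dispatcher. */
    #define DEFINE_ACCESSOR(pass, kernel) \
      void get_pass_##pass() const override { run_kernel(kernel); }
      DEFINE_ACCESSOR(depth, KERNEL_DEPTH)
      DEFINE_ACCESSOR(combined, KERNEL_COMBINED)
    #undef DEFINE_ACCESSOR
    };

    int main()
    {
      PassAccessorDevice accessor;
      accessor.get_pass_depth();    /* dispatches KERNEL_DEPTH */
      accessor.get_pass_combined(); /* dispatches KERNEL_COMBINED */
      return 0;
    }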
diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp
new file mode 100644
index 00000000000..b62a06aea43
--- /dev/null
+++ b/intern/cycles/integrator/path_trace.cpp
@@ -0,0 +1,1144 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/path_trace.h"
+
+#include "device/cpu/device.h"
+#include "device/device.h"
+#include "integrator/pass_accessor.h"
+#include "integrator/render_scheduler.h"
+#include "render/gpu_display.h"
+#include "render/pass.h"
+#include "render/scene.h"
+#include "render/tile.h"
+#include "util/util_algorithm.h"
+#include "util/util_logging.h"
+#include "util/util_progress.h"
+#include "util/util_tbb.h"
+#include "util/util_time.h"
+
+CCL_NAMESPACE_BEGIN
+
+PathTrace::PathTrace(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ RenderScheduler &render_scheduler,
+ TileManager &tile_manager)
+ : device_(device),
+ device_scene_(device_scene),
+ render_scheduler_(render_scheduler),
+ tile_manager_(tile_manager)
+{
+ DCHECK_NE(device_, nullptr);
+
+ {
+ vector<DeviceInfo> cpu_devices;
+ device_cpu_info(cpu_devices);
+
+ cpu_device_.reset(device_cpu_create(cpu_devices[0], device->stats, device->profiler));
+ }
+
+ /* Create path tracing work in advance, so that it can be reused by incremental sampling as much
+ * as possible. */
+ device_->foreach_device([&](Device *path_trace_device) {
+ path_trace_works_.emplace_back(PathTraceWork::create(
+ path_trace_device, film, device_scene, &render_cancel_.is_requested));
+ });
+
+ work_balance_infos_.resize(path_trace_works_.size());
+ work_balance_do_initial(work_balance_infos_);
+
+ render_scheduler.set_need_schedule_rebalance(path_trace_works_.size() > 1);
+}
+
+PathTrace::~PathTrace()
+{
+ /* Destroy any GPU resource which was used for graphics interop.
+ * Need to have access to the GPUDisplay as it is the only source of drawing context which is
+ * used for interop. */
+ if (gpu_display_) {
+ for (auto &&path_trace_work : path_trace_works_) {
+ path_trace_work->destroy_gpu_resources(gpu_display_.get());
+ }
+ }
+}
+
+void PathTrace::load_kernels()
+{
+ if (denoiser_) {
+ denoiser_->load_kernels(progress_);
+ }
+}
+
+void PathTrace::alloc_work_memory()
+{
+ for (auto &&path_trace_work : path_trace_works_) {
+ path_trace_work->alloc_work_memory();
+ }
+}
+
+bool PathTrace::ready_to_reset()
+{
+ /* The logic here is optimized for the best feedback in the viewport, which implies having a GPU
+ * display. If there is no such display, the logic here will break. */
+ DCHECK(gpu_display_);
+
+ /* The logic here tries to provide the most interactive feel to artists.
+ * The general idea is to be able to reset as quickly as possible, while still providing an
+ * interactive feel.
+ *
+ * If the render result was ever drawn after previous reset, consider that reset is now possible.
+ * This way camera navigation gives the quickest feedback of rendered pixels, regardless of
+ * whether CPU or GPU drawing pipeline is used.
+ *
+ * Consider a reset happening after a redraw "slow" enough to not clog anything. This is a bit
+ * arbitrary, but seems to work very well with viewport navigation in Blender. */
+
+ if (did_draw_after_reset_) {
+ return true;
+ }
+
+ return false;
+}
+
+void PathTrace::reset(const BufferParams &full_params, const BufferParams &big_tile_params)
+{
+ if (big_tile_params_.modified(big_tile_params)) {
+ big_tile_params_ = big_tile_params;
+ render_state_.need_reset_params = true;
+ }
+
+ full_params_ = full_params;
+
+ /* NOTE: GPU display checks for buffer modification and avoids unnecessary re-allocation.
+ * It is required to inform it about the reset whenever it happens, so that the redraw state
+ * tracking is properly updated. */
+ if (gpu_display_) {
+ gpu_display_->reset(full_params);
+ }
+
+ render_state_.has_denoised_result = false;
+ render_state_.tile_written = false;
+
+ did_draw_after_reset_ = false;
+}
+
+void PathTrace::device_free()
+{
+ /* Free render buffers used by the path trace work to reduce memory peak. */
+ BufferParams empty_params;
+ empty_params.pass_stride = 0;
+ empty_params.update_offset_stride();
+ for (auto &&path_trace_work : path_trace_works_) {
+ path_trace_work->get_render_buffers()->reset(empty_params);
+ }
+ render_state_.need_reset_params = true;
+}
+
+void PathTrace::set_progress(Progress *progress)
+{
+ progress_ = progress;
+}
+
+void PathTrace::render(const RenderWork &render_work)
+{
+ /* Indicate that rendering has started and that it can be requested to cancel. */
+ {
+ thread_scoped_lock lock(render_cancel_.mutex);
+ if (render_cancel_.is_requested) {
+ return;
+ }
+ render_cancel_.is_rendering = true;
+ }
+
+ render_pipeline(render_work);
+
+ /* Indicate that rendering has finished, making it so the thread which requested `cancel()` can
+ * carry on. */
+ {
+ thread_scoped_lock lock(render_cancel_.mutex);
+ render_cancel_.is_rendering = false;
+ render_cancel_.condition.notify_one();
+ }
+}
+
+void PathTrace::render_pipeline(RenderWork render_work)
+{
+ /* NOTE: Only check for "instant" cancel here. The user-requested cancel via progress is
+ * checked in Session, and in the event of such a cancel the work here is still to be finished. */
+
+ render_scheduler_.set_need_schedule_cryptomatte(device_scene_->data.film.cryptomatte_passes !=
+ 0);
+
+ render_init_kernel_execution();
+
+ render_scheduler_.report_work_begin(render_work);
+
+ init_render_buffers(render_work);
+
+ rebalance(render_work);
+
+ path_trace(render_work);
+ if (render_cancel_.is_requested) {
+ return;
+ }
+
+ adaptive_sample(render_work);
+ if (render_cancel_.is_requested) {
+ return;
+ }
+
+ cryptomatte_postprocess(render_work);
+ if (render_cancel_.is_requested) {
+ return;
+ }
+
+ denoise(render_work);
+ if (render_cancel_.is_requested) {
+ return;
+ }
+
+ write_tile_buffer(render_work);
+ update_display(render_work);
+
+ progress_update_if_needed(render_work);
+
+ finalize_full_buffer_on_disk(render_work);
+}
+
+void PathTrace::render_init_kernel_execution()
+{
+ for (auto &&path_trace_work : path_trace_works_) {
+ path_trace_work->init_execution();
+ }
+}
+
+/* TODO(sergey): Look into `std::function` rather than using a template. Should not have a
+ * measurable performance impact at runtime, but will make compilation faster and the binary
+ * somewhat smaller. */
+template<typename Callback>
+static void foreach_sliced_buffer_params(const vector<unique_ptr<PathTraceWork>> &path_trace_works,
+ const vector<WorkBalanceInfo> &work_balance_infos,
+ const BufferParams &buffer_params,
+ const Callback &callback)
+{
+ const int num_works = path_trace_works.size();
+ const int height = buffer_params.height;
+
+ int current_y = 0;
+ for (int i = 0; i < num_works; ++i) {
+ const double weight = work_balance_infos[i].weight;
+ const int slice_height = max(lround(height * weight), 1);
+
+ /* Disallow negative values to deal with situations when there are more compute devices than
+ * scanlines. */
+ const int remaining_height = max(0, height - current_y);
+
+ BufferParams slide_params = buffer_params;
+ slide_params.full_y = buffer_params.full_y + current_y;
+ if (i < num_works - 1) {
+ slide_params.height = min(slice_height, remaining_height);
+ }
+ else {
+ slide_params.height = remaining_height;
+ }
+
+ slide_params.update_offset_stride();
+
+ callback(path_trace_works[i].get(), slide_params);
+
+ current_y += slide_params.height;
+ }
+}
+
+void PathTrace::update_allocated_work_buffer_params()
+{
+ foreach_sliced_buffer_params(path_trace_works_,
+ work_balance_infos_,
+ big_tile_params_,
+ [](PathTraceWork *path_trace_work, const BufferParams &params) {
+ RenderBuffers *buffers = path_trace_work->get_render_buffers();
+ buffers->reset(params);
+ });
+}
+
+static BufferParams scale_buffer_params(const BufferParams &params, int resolution_divider)
+{
+ BufferParams scaled_params = params;
+
+ scaled_params.width = max(1, params.width / resolution_divider);
+ scaled_params.height = max(1, params.height / resolution_divider);
+ scaled_params.full_x = params.full_x / resolution_divider;
+ scaled_params.full_y = params.full_y / resolution_divider;
+ scaled_params.full_width = params.full_width / resolution_divider;
+ scaled_params.full_height = params.full_height / resolution_divider;
+
+ scaled_params.update_offset_stride();
+
+ return scaled_params;
+}
+
+void PathTrace::update_effective_work_buffer_params(const RenderWork &render_work)
+{
+ const int resolution_divider = render_work.resolution_divider;
+
+ const BufferParams scaled_full_params = scale_buffer_params(full_params_, resolution_divider);
+ const BufferParams scaled_big_tile_params = scale_buffer_params(big_tile_params_,
+ resolution_divider);
+
+ foreach_sliced_buffer_params(path_trace_works_,
+ work_balance_infos_,
+ scaled_big_tile_params,
+ [&](PathTraceWork *path_trace_work, const BufferParams params) {
+ path_trace_work->set_effective_buffer_params(
+ scaled_full_params, scaled_big_tile_params, params);
+ });
+
+ render_state_.effective_big_tile_params = scaled_big_tile_params;
+}
+
+void PathTrace::update_work_buffer_params_if_needed(const RenderWork &render_work)
+{
+ if (render_state_.need_reset_params) {
+ update_allocated_work_buffer_params();
+ }
+
+ if (render_state_.need_reset_params ||
+ render_state_.resolution_divider != render_work.resolution_divider) {
+ update_effective_work_buffer_params(render_work);
+ }
+
+ render_state_.resolution_divider = render_work.resolution_divider;
+ render_state_.need_reset_params = false;
+}
+
+void PathTrace::init_render_buffers(const RenderWork &render_work)
+{
+ update_work_buffer_params_if_needed(render_work);
+
+ /* Handle initialization scheduled by the render scheduler. */
+ if (render_work.init_render_buffers) {
+ tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+ path_trace_work->zero_render_buffers();
+ });
+
+ tile_buffer_read();
+ }
+}
+
+void PathTrace::path_trace(RenderWork &render_work)
+{
+ if (!render_work.path_trace.num_samples) {
+ return;
+ }
+
+ VLOG(3) << "Will path trace " << render_work.path_trace.num_samples
+ << " samples at the resolution divider " << render_work.resolution_divider;
+
+ const double start_time = time_dt();
+
+ const int num_works = path_trace_works_.size();
+
+ tbb::parallel_for(0, num_works, [&](int i) {
+ const double work_start_time = time_dt();
+ const int num_samples = render_work.path_trace.num_samples;
+
+ PathTraceWork *path_trace_work = path_trace_works_[i].get();
+
+ PathTraceWork::RenderStatistics statistics;
+ path_trace_work->render_samples(statistics, render_work.path_trace.start_sample, num_samples);
+
+ const double work_time = time_dt() - work_start_time;
+ work_balance_infos_[i].time_spent += work_time;
+ work_balance_infos_[i].occupancy = statistics.occupancy;
+
+ VLOG(3) << "Rendered " << num_samples << " samples in " << work_time << " seconds ("
+ << work_time / num_samples
+ << " seconds per sample), occupancy: " << statistics.occupancy;
+ });
+
+ float occupancy_accum = 0.0f;
+ for (const WorkBalanceInfo &balance_info : work_balance_infos_) {
+ occupancy_accum += balance_info.occupancy;
+ }
+ const float occupancy = occupancy_accum / num_works;
+ render_scheduler_.report_path_trace_occupancy(render_work, occupancy);
+
+ render_scheduler_.report_path_trace_time(
+ render_work, time_dt() - start_time, is_cancel_requested());
+}
+
+void PathTrace::adaptive_sample(RenderWork &render_work)
+{
+ if (!render_work.adaptive_sampling.filter) {
+ return;
+ }
+
+ bool did_reschedule_on_idle = false;
+
+ while (true) {
+ VLOG(3) << "Will filter adaptive stopping buffer, threshold "
+ << render_work.adaptive_sampling.threshold;
+ if (render_work.adaptive_sampling.reset) {
+ VLOG(3) << "Will re-calculate convergency flag for currently converged pixels.";
+ }
+
+ const double start_time = time_dt();
+
+ uint num_active_pixels = 0;
+ tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+ const uint num_active_pixels_in_work =
+ path_trace_work->adaptive_sampling_converge_filter_count_active(
+ render_work.adaptive_sampling.threshold, render_work.adaptive_sampling.reset);
+ if (num_active_pixels_in_work) {
+ atomic_add_and_fetch_u(&num_active_pixels, num_active_pixels_in_work);
+ }
+ });
+
+ render_scheduler_.report_adaptive_filter_time(
+ render_work, time_dt() - start_time, is_cancel_requested());
+
+ if (num_active_pixels == 0) {
+ VLOG(3) << "All pixels converged.";
+ if (!render_scheduler_.render_work_reschedule_on_converge(render_work)) {
+ break;
+ }
+ VLOG(3) << "Continuing with lower threshold.";
+ }
+ else if (did_reschedule_on_idle) {
+ break;
+ }
+ else if (num_active_pixels < 128 * 128) {
+ /* NOTE: The hardcoded value of 128^2 is more of an empirical value to keep the GPU busy, so
+ * that there is no performance loss from the progressive noise floor feature.
+ *
+ * A better heuristic is possible here: for example, use the maximum of 128^2 and a percentage
+ * of the final resolution. */
+ if (!render_scheduler_.render_work_reschedule_on_idle(render_work)) {
+ VLOG(3) << "Rescheduling is not possible: final threshold is reached.";
+ break;
+ }
+ VLOG(3) << "Rescheduling lower threshold.";
+ did_reschedule_on_idle = true;
+ }
+ else {
+ break;
+ }
+ }
+}
+
+void PathTrace::set_denoiser_params(const DenoiseParams &params)
+{
+ render_scheduler_.set_denoiser_params(params);
+
+ if (!params.use) {
+ denoiser_.reset();
+ return;
+ }
+
+ if (denoiser_) {
+ const DenoiseParams old_denoiser_params = denoiser_->get_params();
+ if (old_denoiser_params.type == params.type) {
+ denoiser_->set_params(params);
+ return;
+ }
+ }
+
+ denoiser_ = Denoiser::create(device_, params);
+ denoiser_->is_cancelled_cb = [this]() { return is_cancel_requested(); };
+}
+
+void PathTrace::set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling)
+{
+ render_scheduler_.set_adaptive_sampling(adaptive_sampling);
+}
+
+void PathTrace::cryptomatte_postprocess(const RenderWork &render_work)
+{
+ if (!render_work.cryptomatte.postprocess) {
+ return;
+ }
+ VLOG(3) << "Perform cryptomatte work.";
+
+ tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+ path_trace_work->cryptomatte_postproces();
+ });
+}
+
+void PathTrace::denoise(const RenderWork &render_work)
+{
+ if (!render_work.tile.denoise) {
+ return;
+ }
+
+ if (!denoiser_) {
+ /* Denoiser was not configured, so nothing to do here. */
+ return;
+ }
+
+ VLOG(3) << "Perform denoising work.";
+
+ const double start_time = time_dt();
+
+ RenderBuffers *buffer_to_denoise = nullptr;
+
+ unique_ptr<RenderBuffers> multi_device_buffers;
+ bool allow_inplace_modification = false;
+
+ if (path_trace_works_.size() == 1) {
+ buffer_to_denoise = path_trace_works_.front()->get_render_buffers();
+ }
+ else {
+ Device *denoiser_device = denoiser_->get_denoiser_device();
+ if (!denoiser_device) {
+ return;
+ }
+
+ multi_device_buffers = make_unique<RenderBuffers>(denoiser_device);
+ multi_device_buffers->reset(render_state_.effective_big_tile_params);
+
+ buffer_to_denoise = multi_device_buffers.get();
+
+ copy_to_render_buffers(multi_device_buffers.get());
+
+ allow_inplace_modification = true;
+ }
+
+ if (denoiser_->denoise_buffer(render_state_.effective_big_tile_params,
+ buffer_to_denoise,
+ get_num_samples_in_buffer(),
+ allow_inplace_modification)) {
+ render_state_.has_denoised_result = true;
+ }
+
+ if (multi_device_buffers) {
+ multi_device_buffers->copy_from_device();
+ tbb::parallel_for_each(
+ path_trace_works_, [&multi_device_buffers](unique_ptr<PathTraceWork> &path_trace_work) {
+ path_trace_work->copy_from_denoised_render_buffers(multi_device_buffers.get());
+ });
+ }
+
+ render_scheduler_.report_denoise_time(render_work, time_dt() - start_time);
+}
+
+void PathTrace::set_gpu_display(unique_ptr<GPUDisplay> gpu_display)
+{
+ gpu_display_ = move(gpu_display);
+}
+
+void PathTrace::clear_gpu_display()
+{
+ if (gpu_display_) {
+ gpu_display_->clear();
+ }
+}
+
+void PathTrace::draw()
+{
+ if (!gpu_display_) {
+ return;
+ }
+
+ did_draw_after_reset_ |= gpu_display_->draw();
+}
+
+void PathTrace::update_display(const RenderWork &render_work)
+{
+ if (!render_work.display.update) {
+ return;
+ }
+
+ if (!gpu_display_ && !tile_buffer_update_cb) {
+ VLOG(3) << "Ignore display update.";
+ return;
+ }
+
+ if (full_params_.width == 0 || full_params_.height == 0) {
+ VLOG(3) << "Skipping GPUDisplay update due to 0 size of the render buffer.";
+ return;
+ }
+
+ const double start_time = time_dt();
+
+ if (tile_buffer_update_cb) {
+ VLOG(3) << "Invoke buffer update callback.";
+
+ tile_buffer_update_cb();
+ }
+
+ if (gpu_display_) {
+ VLOG(3) << "Perform copy to GPUDisplay work.";
+
+ const int resolution_divider = render_work.resolution_divider;
+ const int texture_width = max(1, full_params_.width / resolution_divider);
+ const int texture_height = max(1, full_params_.height / resolution_divider);
+ if (!gpu_display_->update_begin(texture_width, texture_height)) {
+ LOG(ERROR) << "Error beginning GPUDisplay update.";
+ return;
+ }
+
+ const PassMode pass_mode = render_work.display.use_denoised_result &&
+ render_state_.has_denoised_result ?
+ PassMode::DENOISED :
+ PassMode::NOISY;
+
+ /* TODO(sergey): When using multi-device rendering map the GPUDisplay once and copy data from
+ * all works in parallel. */
+ const int num_samples = get_num_samples_in_buffer();
+ for (auto &&path_trace_work : path_trace_works_) {
+ path_trace_work->copy_to_gpu_display(gpu_display_.get(), pass_mode, num_samples);
+ }
+
+ gpu_display_->update_end();
+ }
+
+ render_scheduler_.report_display_update_time(render_work, time_dt() - start_time);
+}
+
+void PathTrace::rebalance(const RenderWork &render_work)
+{
+ static const int kLogLevel = 3;
+
+ if (!render_work.rebalance) {
+ return;
+ }
+
+ const int num_works = path_trace_works_.size();
+
+ if (num_works == 1) {
+ VLOG(kLogLevel) << "Ignoring rebalance work due to single device render.";
+ return;
+ }
+
+ const double start_time = time_dt();
+
+ if (VLOG_IS_ON(kLogLevel)) {
+ VLOG(kLogLevel) << "Perform rebalance work.";
+ VLOG(kLogLevel) << "Per-device path tracing time (seconds):";
+ for (int i = 0; i < num_works; ++i) {
+ VLOG(kLogLevel) << path_trace_works_[i]->get_device()->info.description << ": "
+ << work_balance_infos_[i].time_spent;
+ }
+ }
+
+ const bool did_rebalance = work_balance_do_rebalance(work_balance_infos_);
+
+ if (VLOG_IS_ON(kLogLevel)) {
+ VLOG(kLogLevel) << "Calculated per-device weights for works:";
+ for (int i = 0; i < num_works; ++i) {
+ VLOG(kLogLevel) << path_trace_works_[i]->get_device()->info.description << ": "
+ << work_balance_infos_[i].weight;
+ }
+ }
+
+ if (!did_rebalance) {
+ VLOG(kLogLevel) << "Balance in path trace works did not change.";
+ render_scheduler_.report_rebalance_time(render_work, time_dt() - start_time, false);
+ return;
+ }
+
+ RenderBuffers big_tile_cpu_buffers(cpu_device_.get());
+ big_tile_cpu_buffers.reset(render_state_.effective_big_tile_params);
+
+ copy_to_render_buffers(&big_tile_cpu_buffers);
+
+ render_state_.need_reset_params = true;
+ update_work_buffer_params_if_needed(render_work);
+
+ copy_from_render_buffers(&big_tile_cpu_buffers);
+
+ render_scheduler_.report_rebalance_time(render_work, time_dt() - start_time, true);
+}
+
+void PathTrace::write_tile_buffer(const RenderWork &render_work)
+{
+ if (!render_work.tile.write) {
+ return;
+ }
+
+ VLOG(3) << "Write tile result.";
+
+ render_state_.tile_written = true;
+
+ const bool has_multiple_tiles = tile_manager_.has_multiple_tiles();
+
+ /* Write render tile result, but only if not using tiled rendering.
+ *
+ * Tiles are written to a file during rendering, and written to the software at the end
+ * of rendering (either when all tiles are finished, or when rendering was requested to be
+ * canceled).
+ *
+ * The important thing is: the tile should be written to the software via the callback only once. */
+ if (!has_multiple_tiles) {
+ VLOG(3) << "Write tile result via buffer write callback.";
+ tile_buffer_write();
+ }
+
+ /* Write tile to disk, so that the render work's render buffer can be re-used for the next tile.
+ */
+ if (has_multiple_tiles) {
+ VLOG(3) << "Write tile result into .";
+ tile_buffer_write_to_disk();
+ }
+}
+
+void PathTrace::finalize_full_buffer_on_disk(const RenderWork &render_work)
+{
+ if (!render_work.full.write) {
+ return;
+ }
+
+ VLOG(3) << "Handle full-frame render buffer work.";
+
+ if (!tile_manager_.has_written_tiles()) {
+ VLOG(3) << "No tiles on disk.";
+ return;
+ }
+
+ /* Make sure writing to the file is fully finished.
+ * This will include writing all possible missing tiles, ensuring the validity of the file. */
+ tile_manager_.finish_write_tiles();
+
+ /* NOTE: The rest of full-frame post-processing (such as full-frame denoising) will be done after
+ * all scenes and layers are rendered by the Session (which happens after freeing Session memory,
+ * so that we never hold scene and full-frame buffer in memory at the same time). */
+}
+
+void PathTrace::cancel()
+{
+ thread_scoped_lock lock(render_cancel_.mutex);
+
+ render_cancel_.is_requested = true;
+
+ while (render_cancel_.is_rendering) {
+ render_cancel_.condition.wait(lock);
+ }
+
+ render_cancel_.is_requested = false;
+}
+
+int PathTrace::get_num_samples_in_buffer()
+{
+ return render_scheduler_.get_num_rendered_samples();
+}
+
+bool PathTrace::is_cancel_requested()
+{
+ if (render_cancel_.is_requested) {
+ return true;
+ }
+
+ if (progress_ != nullptr) {
+ if (progress_->get_cancel()) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+void PathTrace::tile_buffer_write()
+{
+ if (!tile_buffer_write_cb) {
+ return;
+ }
+
+ tile_buffer_write_cb();
+}
+
+void PathTrace::tile_buffer_read()
+{
+ if (!tile_buffer_read_cb) {
+ return;
+ }
+
+ if (tile_buffer_read_cb()) {
+ tbb::parallel_for_each(path_trace_works_, [](unique_ptr<PathTraceWork> &path_trace_work) {
+ path_trace_work->copy_render_buffers_to_device();
+ });
+ }
+}
+
+void PathTrace::tile_buffer_write_to_disk()
+{
+ /* Sample count pass is required to support per-tile partial results stored in the file. */
+ DCHECK_NE(big_tile_params_.get_pass_offset(PASS_SAMPLE_COUNT), PASS_UNUSED);
+
+ const int num_rendered_samples = render_scheduler_.get_num_rendered_samples();
+
+ if (num_rendered_samples == 0) {
+ /* The tile has zero samples, no need to write it. */
+ return;
+ }
+
+ /* Get access to the CPU-side render buffers of the current big tile. */
+ RenderBuffers *buffers;
+ RenderBuffers big_tile_cpu_buffers(cpu_device_.get());
+
+ if (path_trace_works_.size() == 1) {
+ path_trace_works_[0]->copy_render_buffers_from_device();
+ buffers = path_trace_works_[0]->get_render_buffers();
+ }
+ else {
+ big_tile_cpu_buffers.reset(render_state_.effective_big_tile_params);
+ copy_to_render_buffers(&big_tile_cpu_buffers);
+
+ buffers = &big_tile_cpu_buffers;
+ }
+
+ if (!tile_manager_.write_tile(*buffers)) {
+ LOG(ERROR) << "Error writing tile to file.";
+ }
+}
+
+void PathTrace::progress_update_if_needed(const RenderWork &render_work)
+{
+ if (progress_ != nullptr) {
+ const int2 tile_size = get_render_tile_size();
+ const int num_samples_added = tile_size.x * tile_size.y * render_work.path_trace.num_samples;
+ const int current_sample = render_work.path_trace.start_sample +
+ render_work.path_trace.num_samples;
+ progress_->add_samples(num_samples_added, current_sample);
+ }
+
+ if (progress_update_cb) {
+ progress_update_cb();
+ }
+}
+
+void PathTrace::progress_set_status(const string &status, const string &substatus)
+{
+ if (progress_ != nullptr) {
+ progress_->set_status(status, substatus);
+ }
+}
+
+void PathTrace::copy_to_render_buffers(RenderBuffers *render_buffers)
+{
+ tbb::parallel_for_each(path_trace_works_,
+ [&render_buffers](unique_ptr<PathTraceWork> &path_trace_work) {
+ path_trace_work->copy_to_render_buffers(render_buffers);
+ });
+ render_buffers->copy_to_device();
+}
+
+void PathTrace::copy_from_render_buffers(RenderBuffers *render_buffers)
+{
+ render_buffers->copy_from_device();
+ tbb::parallel_for_each(path_trace_works_,
+ [&render_buffers](unique_ptr<PathTraceWork> &path_trace_work) {
+ path_trace_work->copy_from_render_buffers(render_buffers);
+ });
+}
+
+bool PathTrace::copy_render_tile_from_device()
+{
+ if (full_frame_state_.render_buffers) {
+ /* Full-frame buffer is always allocated on CPU. */
+ return true;
+ }
+
+ bool success = true;
+
+ tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+ if (!success) {
+ return;
+ }
+ if (!path_trace_work->copy_render_buffers_from_device()) {
+ success = false;
+ }
+ });
+
+ return success;
+}
+
+static string get_layer_view_name(const RenderBuffers &buffers)
+{
+ string result;
+
+ if (buffers.params.layer.size()) {
+ result += string(buffers.params.layer);
+ }
+
+ if (buffers.params.view.size()) {
+ if (!result.empty()) {
+ result += ", ";
+ }
+ result += string(buffers.params.view);
+ }
+
+ return result;
+}
+
+void PathTrace::process_full_buffer_from_disk(string_view filename)
+{
+ VLOG(3) << "Processing full frame buffer file " << filename;
+
+ progress_set_status("Reading full buffer from disk");
+
+ RenderBuffers full_frame_buffers(cpu_device_.get());
+
+ DenoiseParams denoise_params;
+ if (!tile_manager_.read_full_buffer_from_disk(filename, &full_frame_buffers, &denoise_params)) {
+ LOG(ERROR) << "Error reading tiles from file.";
+ return;
+ }
+
+ const string layer_view_name = get_layer_view_name(full_frame_buffers);
+
+ render_state_.has_denoised_result = false;
+
+ if (denoise_params.use) {
+ progress_set_status(layer_view_name, "Denoising");
+
+ /* Re-use the denoiser as much as possible, avoiding possible device re-initialization.
+ *
+ * It will not conflict with the regular rendering as:
+ * - Rendering is supposed to be finished here.
+ * - The next rendering will go via Session's `run_update_for_next_iteration` which will
+ * ensure the proper denoiser is used. */
+ set_denoiser_params(denoise_params);
+
+ /* Number of samples doesn't matter too much, since the sample count pass will be used. */
+ denoiser_->denoise_buffer(full_frame_buffers.params, &full_frame_buffers, 0, false);
+
+ render_state_.has_denoised_result = true;
+ }
+
+ full_frame_state_.render_buffers = &full_frame_buffers;
+
+ progress_set_status(layer_view_name, "Finishing");
+
+ /* Write the full result pretending that there is a single tile.
+ * Requires some state change, but allows using the same communication API with the software. */
+ tile_buffer_write();
+
+ full_frame_state_.render_buffers = nullptr;
+}
+
+int PathTrace::get_num_render_tile_samples() const
+{
+ if (full_frame_state_.render_buffers) {
+ return full_frame_state_.render_buffers->params.samples;
+ }
+
+ return render_scheduler_.get_num_rendered_samples();
+}
+
+bool PathTrace::get_render_tile_pixels(const PassAccessor &pass_accessor,
+ const PassAccessor::Destination &destination)
+{
+ if (full_frame_state_.render_buffers) {
+ return pass_accessor.get_render_tile_pixels(full_frame_state_.render_buffers, destination);
+ }
+
+ bool success = true;
+
+ tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+ if (!success) {
+ return;
+ }
+ if (!path_trace_work->get_render_tile_pixels(pass_accessor, destination)) {
+ success = false;
+ }
+ });
+
+ return success;
+}
+
+bool PathTrace::set_render_tile_pixels(PassAccessor &pass_accessor,
+ const PassAccessor::Source &source)
+{
+ bool success = true;
+
+ tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+ if (!success) {
+ return;
+ }
+ if (!path_trace_work->set_render_tile_pixels(pass_accessor, source)) {
+ success = false;
+ }
+ });
+
+ return success;
+}
+
+int2 PathTrace::get_render_tile_size() const
+{
+ if (full_frame_state_.render_buffers) {
+ return make_int2(full_frame_state_.render_buffers->params.width,
+ full_frame_state_.render_buffers->params.height);
+ }
+
+ const Tile &tile = tile_manager_.get_current_tile();
+ return make_int2(tile.width, tile.height);
+}
+
+int2 PathTrace::get_render_tile_offset() const
+{
+ if (full_frame_state_.render_buffers) {
+ return make_int2(0, 0);
+ }
+
+ const Tile &tile = tile_manager_.get_current_tile();
+ return make_int2(tile.x, tile.y);
+}
+
+const BufferParams &PathTrace::get_render_tile_params() const
+{
+ if (full_frame_state_.render_buffers) {
+ return full_frame_state_.render_buffers->params;
+ }
+
+ return big_tile_params_;
+}
+
+bool PathTrace::has_denoised_result() const
+{
+ return render_state_.has_denoised_result;
+}
+
+/* --------------------------------------------------------------------
+ * Report generation.
+ */
+
+static const char *device_type_for_description(const DeviceType type)
+{
+ switch (type) {
+ case DEVICE_NONE:
+ return "None";
+
+ case DEVICE_CPU:
+ return "CPU";
+ case DEVICE_CUDA:
+ return "CUDA";
+ case DEVICE_OPTIX:
+ return "OptiX";
+ case DEVICE_DUMMY:
+ return "Dummy";
+ case DEVICE_MULTI:
+ return "Multi";
+ }
+
+ return "UNKNOWN";
+}
+
+/* Construct description of the device which will appear in the full report. */
+/* TODO(sergey): Consider making it a more reusable utility. */
+static string full_device_info_description(const DeviceInfo &device_info)
+{
+ string full_description = device_info.description;
+
+ full_description += " (" + string(device_type_for_description(device_info.type)) + ")";
+
+ if (device_info.display_device) {
+ full_description += " (display)";
+ }
+
+ if (device_info.type == DEVICE_CPU) {
+ full_description += " (" + to_string(device_info.cpu_threads) + " threads)";
+ }
+
+ full_description += " [" + device_info.id + "]";
+
+ return full_description;
+}
+
+/* Construct a string which will contain information about the devices, possibly multiple devices.
+ *
+ * In the simple case the result looks like:
+ *
+ * Message: Full Device Description
+ *
+ * If there are multiple devices then the result looks like:
+ *
+ * Message: Full First Device Description
+ * Full Second Device Description
+ *
+ * Note that the newlines are placed in a way so that the result can be easily concatenated to the
+ * full report. */
+static string device_info_list_report(const string &message, const DeviceInfo &device_info)
+{
+ string result = "\n" + message + ": ";
+ const string pad(message.length() + 2, ' ');
+
+ if (device_info.multi_devices.empty()) {
+ result += full_device_info_description(device_info) + "\n";
+ return result;
+ }
+
+ bool is_first = true;
+ for (const DeviceInfo &sub_device_info : device_info.multi_devices) {
+ if (!is_first) {
+ result += pad;
+ }
+
+ result += full_device_info_description(sub_device_info) + "\n";
+
+ is_first = false;
+ }
+
+ return result;
+}
+
+static string path_trace_devices_report(const vector<unique_ptr<PathTraceWork>> &path_trace_works)
+{
+ DeviceInfo device_info;
+ device_info.type = DEVICE_MULTI;
+
+ for (auto &&path_trace_work : path_trace_works) {
+ device_info.multi_devices.push_back(path_trace_work->get_device()->info);
+ }
+
+ return device_info_list_report("Path tracing on", device_info);
+}
+
+static string denoiser_device_report(const Denoiser *denoiser)
+{
+ if (!denoiser) {
+ return "";
+ }
+
+ if (!denoiser->get_params().use) {
+ return "";
+ }
+
+ const Device *denoiser_device = denoiser->get_denoiser_device();
+ if (!denoiser_device) {
+ return "";
+ }
+
+ return device_info_list_report("Denoising on", denoiser_device->info);
+}
+
+string PathTrace::full_report() const
+{
+ string result = "\nFull path tracing report\n";
+
+ result += path_trace_devices_report(path_trace_works_);
+ result += denoiser_device_report(denoiser_.get());
+
+ /* Report from the render scheduler, which includes:
+ * - Render mode (interactive, offline, headless)
+ * - Adaptive sampling and denoiser parameters
+ * - Breakdown of timing. */
+ result += render_scheduler_.full_report();
+
+ return result;
+}
+
+CCL_NAMESPACE_END
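
foreach_sliced_buffer_params() above divides the big tile into horizontal slices proportional to the per-device balance weights: every slice gets at least one scanline, intermediate slices are clamped to what remains, and the last work always receives the remainder so the whole tile is covered. A standalone sketch of that slicing arithmetic (plain C++, not the Cycles data structures):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main()
    {
      const int height = 100;
      const std::vector<double> weights = {0.7, 0.2, 0.1}; /* per-device balance weights */

      int current_y = 0;
      for (size_t i = 0; i < weights.size(); ++i) {
        /* At least one scanline per slice; never past the end of the tile. */
        const int slice_height = std::max(int(std::lround(height * weights[i])), 1);
        const int remaining = std::max(0, height - current_y);
        const int h = (i + 1 < weights.size()) ? std::min(slice_height, remaining) : remaining;

        std::printf("work %zu: full_y offset %d, height %d\n", i, current_y, h);
        current_y += h;
      }
      return 0;
    }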
diff --git a/intern/cycles/integrator/path_trace.h b/intern/cycles/integrator/path_trace.h
new file mode 100644
index 00000000000..fc7713e6df9
--- /dev/null
+++ b/intern/cycles/integrator/path_trace.h
@@ -0,0 +1,324 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/denoiser.h"
+#include "integrator/pass_accessor.h"
+#include "integrator/path_trace_work.h"
+#include "integrator/work_balancer.h"
+#include "render/buffers.h"
+#include "util/util_function.h"
+#include "util/util_thread.h"
+#include "util/util_unique_ptr.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class AdaptiveSampling;
+class Device;
+class DeviceScene;
+class Film;
+class RenderBuffers;
+class RenderScheduler;
+class RenderWork;
+class Progress;
+class GPUDisplay;
+class TileManager;
+
+/* The PathTrace class takes care of the kernel graph and scheduling on a (multi)device. It handles
+ * all the common steps of path tracing which are not device-specific. The list of tasks includes
+ * but is not limited to:
+ * - Kernel graph.
+ * - Scheduling logic.
+ * - Queues management.
+ * - Adaptive stopping. */
+class PathTrace {
+ public:
+ /* Render scheduler is used to report timing information and access things like start/finish
+ * sample. */
+ PathTrace(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ RenderScheduler &render_scheduler,
+ TileManager &tile_manager);
+ ~PathTrace();
+
+ /* Create devices and load kernels which are created on-demand (for example, denoising devices).
+ * The progress is reported to the currently configured progress object (via `set_progress`). */
+ void load_kernels();
+
+ /* Allocate working memory. This runs before allocating scene memory so that we can estimate
+ * more accurately which scene device memory may need to be allocated on the host. */
+ void alloc_work_memory();
+
+ /* Check whether now is a good time to reset rendering.
+ * Used to avoid overly frequent resets in the viewport, giving it a chance to draw an
+ * intermediate render result. */
+ bool ready_to_reset();
+
+ void reset(const BufferParams &full_params, const BufferParams &big_tile_params);
+
+ void device_free();
+
+ /* Set progress tracker.
+ * Used to communicate details about the progress to the outer world and to check whether
+ * rendering is to be canceled.
+ *
+ * The path tracer writes to this object, and then at a convenient moment runs
+ * progress_update_cb() callback. */
+ void set_progress(Progress *progress);
+
+ /* NOTE: This is a blocking call. Meaning, it will not return until the given number of samples is
+ * rendered (or until rendering is requested to be canceled). */
+ void render(const RenderWork &render_work);
+
+ /* TODO(sergey): Decide whether the denoiser is really a part of the path tracer. Currently it is
+ * convenient to have it here because then it is easy to access the render buffer. But the
+ * downside is that this adds too many entities which could live separately behind some clear API. */
+
+ /* Set denoiser parameters.
+ * Use this to configure the denoiser before rendering any samples. */
+ void set_denoiser_params(const DenoiseParams &params);
+
+ /* Set parameters used for adaptive sampling.
+ * Use this to configure the adaptive sampler before rendering any samples. */
+ void set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling);
+
+ /* Set GPU display which takes care of drawing the render result. */
+ void set_gpu_display(unique_ptr<GPUDisplay> gpu_display);
+
+ /* Clear the GPU display by filling it in with all zeroes. */
+ void clear_gpu_display();
+
+ /* Perform drawing of the current state of the GPUDisplay. */
+ void draw();
+
+ /* Cancel the rendering process as soon as possible, without waiting for a full tile to be
+ * sampled. Used in cases like a reset of the render session.
+ *
+ * This is a blocking call, which returns as soon as there is no running `render_samples()` call.
+ */
+ void cancel();
+
+ /* Copy an entire render buffer to/from the path trace. */
+
+ /* Copy happens via CPU side buffer: data will be copied from every device of the path trace, and
+ * the data will be copied to the device of the given render buffers. */
+ void copy_to_render_buffers(RenderBuffers *render_buffers);
+
+ /* Copy happens via CPU side buffer: data will be copied from the device of the given render
+ * buffers and will be copied to all devices of the path trace. */
+ void copy_from_render_buffers(RenderBuffers *render_buffers);
+
+ /* Copy render buffers of the big tile from the device to host.
+ * Return true if all copies are successful. */
+ bool copy_render_tile_from_device();
+
+ /* Read given full-frame file from disk, perform needed processing and write it to the software
+ * via the write callback. */
+ void process_full_buffer_from_disk(string_view filename);
+
+ /* Get number of samples in the current big tile render buffers. */
+ int get_num_render_tile_samples() const;
+
+ /* Get pass data of the entire big tile.
+ * This call puts pass render result from all devices into the final pixels storage.
+ *
+ * NOTE: Expects buffers to be copied to the host using `copy_render_tile_from_device()`.
+ *
+ * Returns false if any of the accessor's `get_render_tile_pixels()` returned false. */
+ bool get_render_tile_pixels(const PassAccessor &pass_accessor,
+ const PassAccessor::Destination &destination);
+
+ /* Set pass data for baking. */
+ bool set_render_tile_pixels(PassAccessor &pass_accessor, const PassAccessor::Source &source);
+
+ /* Check whether denoiser was run and denoised passes are available. */
+ bool has_denoised_result() const;
+
+ /* Get size and offset (relative to the buffer's full x/y) of the currently rendering tile.
+ * In the case of tiled rendering this will return the full frame after all tiles have been rendered.
+ *
+ * NOTE: If the full-frame buffer processing is in progress, returns parameters of the full-frame
+ * instead. */
+ int2 get_render_tile_size() const;
+ int2 get_render_tile_offset() const;
+
+ /* Get buffer parameters of the current tile.
+ *
+ * NOTE: If the full-frame buffer processing is in progress, returns parameters of the full-frame
+ * instead. */
+ const BufferParams &get_render_tile_params() const;
+
+ /* Generate full multi-line report of the rendering process, including rendering parameters,
+ * times, and so on. */
+ string full_report() const;
+
+ /* Callback which communicates an updated state of the render buffer of the current big tile.
+ * Is called during path tracing to communicate work-in-progress state of the final buffer. */
+ function<void(void)> tile_buffer_update_cb;
+
+ /* Callback which communicates final rendered buffer. Is called after path-tracing is done. */
+ function<void(void)> tile_buffer_write_cb;
+
+ /* Callback which initializes rendered buffer. Is called before path-tracing starts.
+ *
+ * This is used for baking. */
+ function<bool(void)> tile_buffer_read_cb;
+
+ /* Callback which is called to report current rendering progress.
+ *
+ * It is supposed to be cheaper than buffer update/write, hence can be called more often.
+ * Additionally, it might be called from the middle of a wavefront (meaning, it is not guaranteed
+ * that the buffer is "uniformly" sampled at the moment of this callback). */
+ function<void(void)> progress_update_cb;
+
+ protected:
+ /* Actual implementation of the rendering pipeline.
+ * Calls steps in order, checking for the cancel to be requested in between.
+ *
+ * Is separate from `render()` to simplify dealing with the early returns and keeping
+ * `render_cancel_` in a consistent state. */
+ void render_pipeline(RenderWork render_work);
+
+ /* Initialize kernel execution on all integrator queues. */
+ void render_init_kernel_execution();
+
+ /* Make sure both allocated and effective buffer parameters of path tracer works are up to date
+ * with the current big tile parameters, performance-dependent slicing, and resolution divider.
+ */
+ void update_work_buffer_params_if_needed(const RenderWork &render_work);
+ void update_allocated_work_buffer_params();
+ void update_effective_work_buffer_params(const RenderWork &render_work);
+
+ /* Perform various steps of the render work.
+ *
+ * Note that some steps might modify the work, forcing some steps to happen within this iteration
+ * of rendering. */
+ void init_render_buffers(const RenderWork &render_work);
+ void path_trace(RenderWork &render_work);
+ void adaptive_sample(RenderWork &render_work);
+ void denoise(const RenderWork &render_work);
+ void cryptomatte_postprocess(const RenderWork &render_work);
+ void update_display(const RenderWork &render_work);
+ void rebalance(const RenderWork &render_work);
+ void write_tile_buffer(const RenderWork &render_work);
+ void finalize_full_buffer_on_disk(const RenderWork &render_work);
+
+ /* Get number of samples in the current state of the render buffers. */
+ int get_num_samples_in_buffer();
+
+ /* Check whether the user requested to cancel rendering, so that path tracing is to be finished as
+ * soon as possible. */
+ bool is_cancel_requested();
+
+ /* Write the big tile render buffer via the write callback. */
+ void tile_buffer_write();
+
+ /* Read the big tile render buffer via the read callback. */
+ void tile_buffer_read();
+
+ /* Write current tile into the file on disk. */
+ void tile_buffer_write_to_disk();
+
+ /* Run the progress_update_cb callback if it is needed. */
+ void progress_update_if_needed(const RenderWork &render_work);
+
+ void progress_set_status(const string &status, const string &substatus = "");
+
+ /* Pointer to a device which is configured to be used for path tracing. If multiple devices
+ * are configured this is a `MultiDevice`. */
+ Device *device_ = nullptr;
+
+ /* CPU device for creating temporary render buffers on the CPU side. */
+ unique_ptr<Device> cpu_device_;
+
+ DeviceScene *device_scene_;
+
+ RenderScheduler &render_scheduler_;
+ TileManager &tile_manager_;
+
+ unique_ptr<GPUDisplay> gpu_display_;
+
+ /* Per-compute device descriptors of work which is responsible for path tracing on its configured
+ * device. */
+ vector<unique_ptr<PathTraceWork>> path_trace_works_;
+
+ /* Per-path trace work information needed for multi-device balancing. */
+ vector<WorkBalanceInfo> work_balance_infos_;
+
+ /* Render buffer parameters of the full frame and current big tile. */
+ BufferParams full_params_;
+ BufferParams big_tile_params_;
+
+ /* Denoiser which takes care of denoising the big tile. */
+ unique_ptr<Denoiser> denoiser_;
+
+ /* State which is common for all the steps of the render work.
+ * Is brought up to date in the `render()` call and is accessed from all the steps involved in
+ * rendering the work. */
+ struct {
+ /* Denotes whether render buffers parameters of path trace works are to be reset for the new
+ * value of the big tile parameters. */
+ bool need_reset_params = false;
+
+ /* Divider of the resolution for faster previews.
+ *
+ * Allows re-using the same render buffer, but with fewer pixels rendered into it. The way to
+ * think of the render buffer in this case is as an over-allocated array: the resolution divider
+ * affects both resolution and stride as visible by the integrator kernels. */
+ int resolution_divider = 0;
+
+ /* Parameters of the big tile with the current resolution divider applied. */
+ BufferParams effective_big_tile_params;
+
+ /* Denoiser was run and there are denoised versions of the passes in the render buffers. */
+ bool has_denoised_result = false;
+
+ /* Current tile has been written (either to disk or via the callback).
+ * Indicates that no more work will be done on this tile. */
+ bool tile_written = false;
+ } render_state_;
+
+ /* Progress object which is used to communicate sample progress. */
+ Progress *progress_;
+
+ /* Fields required for canceling render on demand, as quickly as possible. */
+ struct {
+ /* Indicates whether there is an on-going `render_samples()` call. */
+ bool is_rendering = false;
+
+ /* Indicates whether rendering is requested to be canceled by `cancel()`. */
+ bool is_requested = false;
+
+ /* Synchronization between thread which does `render_samples()` and thread which does
+ * `cancel()`. */
+ thread_mutex mutex;
+ thread_condition_variable condition;
+ } render_cancel_;
+
+ /* Indicates whether a render result was drawn after the latest session reset.
+ * Used by `ready_to_reset()` to implement logic which feels the most interactive. */
+ bool did_draw_after_reset_ = true;
+
+ /* State of the full frame processing and writing to the software. */
+ struct {
+ RenderBuffers *render_buffers = nullptr;
+ } full_frame_state_;
+};
+
+CCL_NAMESPACE_END
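
The `render_cancel_` structure above implements a small handshake: `render()` marks itself as running under the mutex and bails out early if a cancel was already requested, while `cancel()` sets the request flag and blocks on the condition variable until the in-flight render call has finished. A standalone sketch of that handshake using plain C++ threads (not the Cycles thread wrappers):

    #include <chrono>
    #include <condition_variable>
    #include <cstdio>
    #include <mutex>
    #include <thread>

    struct RenderCancel {
      bool is_rendering = false;
      bool is_requested = false;
      std::mutex mutex;
      std::condition_variable condition;
    } render_cancel;

    void render()
    {
      {
        std::unique_lock<std::mutex> lock(render_cancel.mutex);
        if (render_cancel.is_requested) {
          return; /* Cancel arrived before rendering started. */
        }
        render_cancel.is_rendering = true;
      }

      std::this_thread::sleep_for(std::chrono::milliseconds(100)); /* Path tracing work. */

      {
        std::unique_lock<std::mutex> lock(render_cancel.mutex);
        render_cancel.is_rendering = false;
        render_cancel.condition.notify_one();
      }
    }

    void cancel()
    {
      std::unique_lock<std::mutex> lock(render_cancel.mutex);
      render_cancel.is_requested = true;
      while (render_cancel.is_rendering) {
        render_cancel.condition.wait(lock);
      }
      std::printf("render finished, safe to reset\n");
    }

    int main()
    {
      std::thread render_thread(render);
      cancel();
      render_thread.join();
      return 0;
    }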
diff --git a/intern/cycles/integrator/path_trace_work.cpp b/intern/cycles/integrator/path_trace_work.cpp
new file mode 100644
index 00000000000..d9634acac10
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work.cpp
@@ -0,0 +1,203 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/device.h"
+
+#include "integrator/path_trace_work.h"
+#include "integrator/path_trace_work_cpu.h"
+#include "integrator/path_trace_work_gpu.h"
+#include "render/buffers.h"
+#include "render/film.h"
+#include "render/gpu_display.h"
+#include "render/scene.h"
+
+#include "kernel/kernel_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+unique_ptr<PathTraceWork> PathTraceWork::create(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag)
+{
+ if (device->info.type == DEVICE_CPU) {
+ return make_unique<PathTraceWorkCPU>(device, film, device_scene, cancel_requested_flag);
+ }
+
+ return make_unique<PathTraceWorkGPU>(device, film, device_scene, cancel_requested_flag);
+}
+
+PathTraceWork::PathTraceWork(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag)
+ : device_(device),
+ film_(film),
+ device_scene_(device_scene),
+ buffers_(make_unique<RenderBuffers>(device)),
+ effective_buffer_params_(buffers_->params),
+ cancel_requested_flag_(cancel_requested_flag)
+{
+}
+
+PathTraceWork::~PathTraceWork()
+{
+}
+
+RenderBuffers *PathTraceWork::get_render_buffers()
+{
+ return buffers_.get();
+}
+
+void PathTraceWork::set_effective_buffer_params(const BufferParams &effective_full_params,
+ const BufferParams &effective_big_tile_params,
+ const BufferParams &effective_buffer_params)
+{
+ effective_full_params_ = effective_full_params;
+ effective_big_tile_params_ = effective_big_tile_params;
+ effective_buffer_params_ = effective_buffer_params;
+}
+
+bool PathTraceWork::has_multiple_works() const
+{
+ /* Assume that if there are multiple works working on the same big tile, none of the works gets
+ * the entire big tile to work on. */
+ return !(effective_big_tile_params_.width == effective_buffer_params_.width &&
+ effective_big_tile_params_.height == effective_buffer_params_.height &&
+ effective_big_tile_params_.full_x == effective_buffer_params_.full_x &&
+ effective_big_tile_params_.full_y == effective_buffer_params_.full_y);
+}
+
+void PathTraceWork::copy_to_render_buffers(RenderBuffers *render_buffers)
+{
+ copy_render_buffers_from_device();
+
+ const int64_t width = effective_buffer_params_.width;
+ const int64_t height = effective_buffer_params_.height;
+ const int64_t pass_stride = effective_buffer_params_.pass_stride;
+ const int64_t row_stride = width * pass_stride;
+ const int64_t data_size = row_stride * height * sizeof(float);
+
+ const int64_t offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
+ const int64_t offset_in_floats = offset_y * row_stride;
+
+ const float *src = buffers_->buffer.data();
+ float *dst = render_buffers->buffer.data() + offset_in_floats;
+
+ memcpy(dst, src, data_size);
+}
+
+void PathTraceWork::copy_from_render_buffers(const RenderBuffers *render_buffers)
+{
+ const int64_t width = effective_buffer_params_.width;
+ const int64_t height = effective_buffer_params_.height;
+ const int64_t pass_stride = effective_buffer_params_.pass_stride;
+ const int64_t row_stride = width * pass_stride;
+ const int64_t data_size = row_stride * height * sizeof(float);
+
+ const int64_t offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
+ const int64_t offset_in_floats = offset_y * row_stride;
+
+ const float *src = render_buffers->buffer.data() + offset_in_floats;
+ float *dst = buffers_->buffer.data();
+
+ memcpy(dst, src, data_size);
+
+ copy_render_buffers_to_device();
+}
+
+void PathTraceWork::copy_from_denoised_render_buffers(const RenderBuffers *render_buffers)
+{
+ const int64_t width = effective_buffer_params_.width;
+ const int64_t offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
+ const int64_t offset = offset_y * width;
+
+ render_buffers_host_copy_denoised(
+ buffers_.get(), effective_buffer_params_, render_buffers, effective_buffer_params_, offset);
+
+ copy_render_buffers_to_device();
+}
+
+bool PathTraceWork::get_render_tile_pixels(const PassAccessor &pass_accessor,
+ const PassAccessor::Destination &destination)
+{
+ const int offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
+ const int width = effective_buffer_params_.width;
+
+ PassAccessor::Destination slice_destination = destination;
+ slice_destination.offset += offset_y * width;
+
+ return pass_accessor.get_render_tile_pixels(buffers_.get(), slice_destination);
+}
+
+bool PathTraceWork::set_render_tile_pixels(PassAccessor &pass_accessor,
+ const PassAccessor::Source &source)
+{
+ const int offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
+ const int width = effective_buffer_params_.width;
+
+ PassAccessor::Source slice_source = source;
+ slice_source.offset += offset_y * width;
+
+ return pass_accessor.set_render_tile_pixels(buffers_.get(), slice_source);
+}
+
+PassAccessor::PassAccessInfo PathTraceWork::get_display_pass_access_info(PassMode pass_mode) const
+{
+ const KernelFilm &kfilm = device_scene_->data.film;
+ const KernelBackground &kbackground = device_scene_->data.background;
+
+ const BufferParams &params = buffers_->params;
+
+ const BufferPass *display_pass = params.get_actual_display_pass(film_->get_display_pass());
+
+ PassAccessor::PassAccessInfo pass_access_info;
+ pass_access_info.type = display_pass->type;
+ pass_access_info.offset = PASS_UNUSED;
+
+ if (pass_mode == PassMode::DENOISED) {
+ pass_access_info.mode = PassMode::DENOISED;
+ pass_access_info.offset = params.get_pass_offset(pass_access_info.type, PassMode::DENOISED);
+ }
+
+ if (pass_access_info.offset == PASS_UNUSED) {
+ pass_access_info.mode = PassMode::NOISY;
+ pass_access_info.offset = params.get_pass_offset(pass_access_info.type);
+ }
+
+ pass_access_info.use_approximate_shadow_catcher = kfilm.use_approximate_shadow_catcher;
+ pass_access_info.use_approximate_shadow_catcher_background =
+ kfilm.use_approximate_shadow_catcher && !kbackground.transparent;
+
+ return pass_access_info;
+}
+
+PassAccessor::Destination PathTraceWork::get_gpu_display_destination_template(
+ const GPUDisplay *gpu_display) const
+{
+ PassAccessor::Destination destination(film_->get_display_pass());
+
+ const int2 display_texture_size = gpu_display->get_texture_size();
+ const int texture_x = effective_buffer_params_.full_x - effective_full_params_.full_x;
+ const int texture_y = effective_buffer_params_.full_y - effective_full_params_.full_y;
+
+ destination.offset = texture_y * display_texture_size.x + texture_x;
+ destination.stride = display_texture_size.x;
+
+ return destination;
+}
+
+CCL_NAMESPACE_END
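
copy_to_render_buffers() and copy_from_render_buffers() above rely on one piece of arithmetic: a work's slice starts `full_y - big_tile_full_y` scanlines into the big-tile buffer, and with `row_stride = width * pass_stride` floats per scanline that gives the element offset for a single memcpy of the whole slice. A standalone sketch of that offset math with made-up sizes (not the Cycles buffer types):

    #include <cstdio>
    #include <cstring>
    #include <vector>

    int main()
    {
      const int width = 4, pass_stride = 3; /* floats per pixel */
      const int big_tile_height = 8, big_tile_full_y = 0;
      const int slice_height = 2, slice_full_y = 4; /* this work renders rows 4..5 */

      const long row_stride = long(width) * pass_stride;
      std::vector<float> big_tile(row_stride * big_tile_height, 0.0f);
      std::vector<float> slice(row_stride * slice_height, 1.0f);

      /* Offset, in floats, of the first scanline owned by this work. */
      const long offset_y = slice_full_y - big_tile_full_y;
      const long offset_in_floats = offset_y * row_stride;

      std::memcpy(big_tile.data() + offset_in_floats,
                  slice.data(),
                  slice.size() * sizeof(float));

      std::printf("row 4 first value: %f, row 0 first value: %f\n",
                  big_tile[offset_in_floats], big_tile[0]);
      return 0;
    }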
diff --git a/intern/cycles/integrator/path_trace_work.h b/intern/cycles/integrator/path_trace_work.h
new file mode 100644
index 00000000000..8c9c8811199
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work.h
@@ -0,0 +1,194 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/pass_accessor.h"
+#include "render/buffers.h"
+#include "render/pass.h"
+#include "util/util_types.h"
+#include "util/util_unique_ptr.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BufferParams;
+class Device;
+class DeviceScene;
+class Film;
+class GPUDisplay;
+class RenderBuffers;
+
+class PathTraceWork {
+ public:
+ struct RenderStatistics {
+ float occupancy = 1.0f;
+ };
+
+ /* Create the path trace work which best fits the device.
+ *
+ * The cancel request flag is used for a cheap check of whether a cancel is to be performed as
+ * soon as possible. This could be, for example, a request to cancel rendering on camera
+ * navigation in the viewport. */
+ static unique_ptr<PathTraceWork> create(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag);
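+
+ /* Example (illustrative sketch, not part of this patch): the owning PathTrace is expected to
+ * drive a work roughly as follows; `cancel_requested`, `num_samples` and `statistics` are
+ * hypothetical local variables of the caller:
+ *
+ *   bool cancel_requested = false;
+ *   unique_ptr<PathTraceWork> work = PathTraceWork::create(
+ *       device, film, device_scene, &cancel_requested);
+ *   work->alloc_work_memory();
+ *   work->init_execution();
+ *   PathTraceWork::RenderStatistics statistics;
+ *   work->render_samples(statistics, 0, num_samples);
+ */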
+
+ virtual ~PathTraceWork();
+
+ /* Access the render buffers.
+ *
+ * Only supposed to be used by the PathTrace, to update buffer allocation and slicing so that
+ * they correspond to the big tile size and relative device performance. */
+ RenderBuffers *get_render_buffers();
+
+ /* Set effective parameters of the big tile and the work itself. */
+ void set_effective_buffer_params(const BufferParams &effective_full_params,
+ const BufferParams &effective_big_tile_params,
+ const BufferParams &effective_buffer_params);
+
+ /* Check whether the big tile is being worked on by multiple path trace works. */
+ bool has_multiple_works() const;
+
+ /* Allocate working memory for execution. Must be called before init_execution(). */
+ virtual void alloc_work_memory(){};
+
+ /* Initialize execution of kernels.
+ * Will ensure that all device queues are initialized for execution.
+ *
+ * This method is to be called after any change in the scene. It is not needed to call it
+ * prior to every call of `render_samples()`. */
+ virtual void init_execution() = 0;
+
+ /* Render given number of samples as a synchronous blocking call.
+ * The samples are added to the render buffer associated with this work. */
+ virtual void render_samples(RenderStatistics &statistics, int start_sample, int samples_num) = 0;
+
+ /* Copy the render result from this work to the corresponding place of the GPU display.
+ *
+ * The `pass_mode` indicates whether to access the denoised or the noisy version of the display
+ * pass. The noisy pass mode will be passed here when it is known that the buffer does not have
+ * denoised passes yet (because the denoiser did not run). If the denoised pass is requested and
+ * the denoiser is not used, then this function will fall back to the noisy pass instead. */
+ virtual void copy_to_gpu_display(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples) = 0;
+
+ virtual void destroy_gpu_resources(GPUDisplay *gpu_display) = 0;
+
+ /* Copy data from/to the given render buffers.
+ * Will copy pixels from the corresponding place (from a multi-device point of view) of the
+ * given render buffers, and copy the work's render buffers to the corresponding place of the
+ * destination. */
+
+ /* Notes:
+ * - Copies the work's render buffer from its device.
+ * - Copies into the CPU-side buffer of the given render buffers.
+ * - Does not copy the given buffer to its device. */
+ void copy_to_render_buffers(RenderBuffers *render_buffers);
+
+ /* Notes:
+ * - Does not copy the given render buffers from their device.
+ * - Copies the work's render buffer to its device. */
+ void copy_from_render_buffers(const RenderBuffers *render_buffers);
+
+ /* Special version of `copy_from_render_buffers()` which only copies denoised passes from the
+ * given render buffers, leaving the rest of the passes untouched.
+ *
+ * The same notes about device copying apply to this call as well. */
+ void copy_from_denoised_render_buffers(const RenderBuffers *render_buffers);
+
+ /* Copy render buffers to/from device using an appropriate device queue when needed so that
+ * things are executed in order with the `render_samples()`. */
+ virtual bool copy_render_buffers_from_device() = 0;
+ virtual bool copy_render_buffers_to_device() = 0;
+
+ /* Zero the render buffers, using an appropriate device queue when needed so that things are
+ * executed in order with `render_samples()`. */
+ virtual bool zero_render_buffers() = 0;
+
+ /* Access pixels rendered by this work and copy them to the corresponding location in the
+ * destination.
+ *
+ * NOTE: Does not perform a copy of buffers from the device. Use
+ * `copy_render_buffers_from_device()` to update host-side data. */
+ bool get_render_tile_pixels(const PassAccessor &pass_accessor,
+ const PassAccessor::Destination &destination);
+
+ /* Set pass data for baking. */
+ bool set_render_tile_pixels(PassAccessor &pass_accessor, const PassAccessor::Source &source);
+
+ /* Perform convergence test on the render buffer, and filter the convergence mask.
+ * Returns number of active pixels (the ones which did not converge yet). */
+ virtual int adaptive_sampling_converge_filter_count_active(float threshold, bool reset) = 0;
+
+ /* Run cryptomatte pass post-processing kernels. */
+ virtual void cryptomatte_postproces() = 0;
+
+ /* Cheap-ish check of whether a cancel is requested and rendering is to be stopped as soon as
+ * possible, without waiting for any samples to be finished. */
+ inline bool is_cancel_requested() const
+ {
+ /* NOTE: Rely on the fact that on x86 CPUs a scalar can be read without atomics even in a
+ * threaded environment. */
+ return *cancel_requested_flag_;
+ }
+
+ /* Access the device which this work is path traced on. */
+ Device *get_device() const
+ {
+ return device_;
+ }
+
+ protected:
+ PathTraceWork(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag);
+
+ PassAccessor::PassAccessInfo get_display_pass_access_info(PassMode pass_mode) const;
+
+ /* Get a destination whose offset and stride are configured so that writing to it will write to
+ * the proper location of the GPU display texture, taking the current tile and device slice into
+ * account. */
+ PassAccessor::Destination get_gpu_display_destination_template(
+ const GPUDisplay *gpu_display) const;
+
+ /* Device which will be used for path tracing.
+ * Note that it is an actual render device (and never a multi-device). */
+ Device *device_;
+
+ /* Film is used to access display pass configuration for GPU display update.
+ * Note that only fields which are not a part of kernel data can be accessed via the Film. */
+ Film *film_;
+
+ /* Device side scene storage, that may be used for integrator logic. */
+ DeviceScene *device_scene_;
+
+ /* Render buffers into which sampling is accumulated, allocated for the fraction of the big
+ * tile which is being rendered by this work.
+ * This also defines the possible subset of a big tile in the case of multi-device rendering. */
+ unique_ptr<RenderBuffers> buffers_;
+
+ /* Effective parameters of the full, big tile, and current work render buffer.
+ * The latter might be different from `buffers_->params` when there is a resolution divider
+ * involved. */
+ BufferParams effective_full_params_;
+ BufferParams effective_big_tile_params_;
+ BufferParams effective_buffer_params_;
+
+ bool *cancel_requested_flag_ = nullptr;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work_cpu.cpp b/intern/cycles/integrator/path_trace_work_cpu.cpp
new file mode 100644
index 00000000000..b9a33b64051
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work_cpu.cpp
@@ -0,0 +1,281 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/path_trace_work_cpu.h"
+
+#include "device/cpu/kernel.h"
+#include "device/device.h"
+
+#include "integrator/pass_accessor_cpu.h"
+
+#include "render/buffers.h"
+#include "render/gpu_display.h"
+#include "render/scene.h"
+
+#include "util/util_atomic.h"
+#include "util/util_logging.h"
+#include "util/util_tbb.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Create TBB arena for execution of path tracing and rendering tasks. */
+static inline tbb::task_arena local_tbb_arena_create(const Device *device)
+{
+ /* TODO: limit this to number of threads of CPU device, it may be smaller than
+ * the system number of threads when we reduce the number of CPU threads in
+ * CPU + GPU rendering to dedicate some cores to handling the GPU device. */
+ return tbb::task_arena(device->info.cpu_threads);
+}
+
+/* Get CPUKernelThreadGlobals for the current thread. */
+static inline CPUKernelThreadGlobals *kernel_thread_globals_get(
+ vector<CPUKernelThreadGlobals> &kernel_thread_globals)
+{
+ const int thread_index = tbb::this_task_arena::current_thread_index();
+ DCHECK_GE(thread_index, 0);
+ DCHECK_LT(thread_index, kernel_thread_globals.size());
+
+ return &kernel_thread_globals[thread_index];
+}
+
+PathTraceWorkCPU::PathTraceWorkCPU(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag)
+ : PathTraceWork(device, film, device_scene, cancel_requested_flag),
+ kernels_(*(device->get_cpu_kernels()))
+{
+ DCHECK_EQ(device->info.type, DEVICE_CPU);
+}
+
+void PathTraceWorkCPU::init_execution()
+{
+ /* Cache per-thread kernel globals. */
+ device_->get_cpu_kernel_thread_globals(kernel_thread_globals_);
+}
+
+void PathTraceWorkCPU::render_samples(RenderStatistics &statistics,
+ int start_sample,
+ int samples_num)
+{
+ const int64_t image_width = effective_buffer_params_.width;
+ const int64_t image_height = effective_buffer_params_.height;
+ const int64_t total_pixels_num = image_width * image_height;
+
+ for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) {
+ kernel_globals.start_profiling();
+ }
+
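+ /* Each `parallel_for` item below renders a single pixel for all `samples_num` samples. */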
+ tbb::task_arena local_arena = local_tbb_arena_create(device_);
+ local_arena.execute([&]() {
+ tbb::parallel_for(int64_t(0), total_pixels_num, [&](int64_t work_index) {
+ if (is_cancel_requested()) {
+ return;
+ }
+
+ const int y = work_index / image_width;
+ const int x = work_index - y * image_width;
+
+ KernelWorkTile work_tile;
+ work_tile.x = effective_buffer_params_.full_x + x;
+ work_tile.y = effective_buffer_params_.full_y + y;
+ work_tile.w = 1;
+ work_tile.h = 1;
+ work_tile.start_sample = start_sample;
+ work_tile.num_samples = 1;
+ work_tile.offset = effective_buffer_params_.offset;
+ work_tile.stride = effective_buffer_params_.stride;
+
+ CPUKernelThreadGlobals *kernel_globals = kernel_thread_globals_get(kernel_thread_globals_);
+
+ render_samples_full_pipeline(kernel_globals, work_tile, samples_num);
+ });
+ });
+
+ for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) {
+ kernel_globals.stop_profiling();
+ }
+
+ statistics.occupancy = 1.0f;
+}
+
+void PathTraceWorkCPU::render_samples_full_pipeline(KernelGlobals *kernel_globals,
+ const KernelWorkTile &work_tile,
+ const int samples_num)
+{
+ const bool has_shadow_catcher = device_scene_->data.integrator.has_shadow_catcher;
+ const bool has_bake = device_scene_->data.bake.use;
+
+ IntegratorStateCPU integrator_states[2] = {};
+
+ IntegratorStateCPU *state = &integrator_states[0];
+ IntegratorStateCPU *shadow_catcher_state = &integrator_states[1];
+
+ KernelWorkTile sample_work_tile = work_tile;
+ float *render_buffer = buffers_->buffer.data();
+
+ for (int sample = 0; sample < samples_num; ++sample) {
+ if (is_cancel_requested()) {
+ break;
+ }
+
+ if (has_bake) {
+ if (!kernels_.integrator_init_from_bake(
+ kernel_globals, state, &sample_work_tile, render_buffer)) {
+ break;
+ }
+ }
+ else {
+ if (!kernels_.integrator_init_from_camera(
+ kernel_globals, state, &sample_work_tile, render_buffer)) {
+ break;
+ }
+ }
+
+ kernels_.integrator_megakernel(kernel_globals, state, render_buffer);
+
+ if (has_shadow_catcher) {
+ kernels_.integrator_megakernel(kernel_globals, shadow_catcher_state, render_buffer);
+ }
+
+ ++sample_work_tile.start_sample;
+ }
+}
+
+void PathTraceWorkCPU::copy_to_gpu_display(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples)
+{
+ half4 *rgba_half = gpu_display->map_texture_buffer();
+ if (!rgba_half) {
+ /* TODO(sergey): Look into using copy_to_gpu_display() if mapping failed. Might be needed for
+ * some implementations of GPUDisplay which cannot map memory? */
+ return;
+ }
+
+ const KernelFilm &kfilm = device_scene_->data.film;
+
+ const PassAccessor::PassAccessInfo pass_access_info = get_display_pass_access_info(pass_mode);
+
+ const PassAccessorCPU pass_accessor(pass_access_info, kfilm.exposure, num_samples);
+
+ PassAccessor::Destination destination = get_gpu_display_destination_template(gpu_display);
+ destination.pixels_half_rgba = rgba_half;
+
+ tbb::task_arena local_arena = local_tbb_arena_create(device_);
+ local_arena.execute([&]() {
+ pass_accessor.get_render_tile_pixels(buffers_.get(), effective_buffer_params_, destination);
+ });
+
+ gpu_display->unmap_texture_buffer();
+}
+
+void PathTraceWorkCPU::destroy_gpu_resources(GPUDisplay * /*gpu_display*/)
+{
+}
+
+bool PathTraceWorkCPU::copy_render_buffers_from_device()
+{
+ return buffers_->copy_from_device();
+}
+
+bool PathTraceWorkCPU::copy_render_buffers_to_device()
+{
+ buffers_->buffer.copy_to_device();
+ return true;
+}
+
+bool PathTraceWorkCPU::zero_render_buffers()
+{
+ buffers_->zero();
+ return true;
+}
+
+int PathTraceWorkCPU::adaptive_sampling_converge_filter_count_active(float threshold, bool reset)
+{
+ const int full_x = effective_buffer_params_.full_x;
+ const int full_y = effective_buffer_params_.full_y;
+ const int width = effective_buffer_params_.width;
+ const int height = effective_buffer_params_.height;
+ const int offset = effective_buffer_params_.offset;
+ const int stride = effective_buffer_params_.stride;
+
+ float *render_buffer = buffers_->buffer.data();
+
+ uint num_active_pixels = 0;
+
+ tbb::task_arena local_arena = local_tbb_arena_create(device_);
+
+ /* Check convergence and do the X-filter in a single `parallel_for` to reduce threading overhead. */
+ local_arena.execute([&]() {
+ tbb::parallel_for(full_y, full_y + height, [&](int y) {
+ CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0];
+
+ bool row_converged = true;
+ uint num_row_pixels_active = 0;
+ for (int x = 0; x < width; ++x) {
+ if (!kernels_.adaptive_sampling_convergence_check(
+ kernel_globals, render_buffer, full_x + x, y, threshold, reset, offset, stride)) {
+ ++num_row_pixels_active;
+ row_converged = false;
+ }
+ }
+
+ atomic_fetch_and_add_uint32(&num_active_pixels, num_row_pixels_active);
+
+ if (!row_converged) {
+ kernels_.adaptive_sampling_filter_x(
+ kernel_globals, render_buffer, y, full_x, width, offset, stride);
+ }
+ });
+ });
+
+ if (num_active_pixels) {
+ local_arena.execute([&]() {
+ tbb::parallel_for(full_x, full_x + width, [&](int x) {
+ CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0];
+ kernels_.adaptive_sampling_filter_y(
+ kernel_globals, render_buffer, x, full_y, height, offset, stride);
+ });
+ });
+ }
+
+ return num_active_pixels;
+}
+
+void PathTraceWorkCPU::cryptomatte_postproces()
+{
+ const int width = effective_buffer_params_.width;
+ const int height = effective_buffer_params_.height;
+
+ float *render_buffer = buffers_->buffer.data();
+
+ tbb::task_arena local_arena = local_tbb_arena_create(device_);
+
+ /* Process rows in a single `parallel_for` to reduce threading overhead. */
+ local_arena.execute([&]() {
+ tbb::parallel_for(0, height, [&](int y) {
+ CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0];
+ int pixel_index = y * width;
+
+ for (int x = 0; x < width; ++x, ++pixel_index) {
+ kernels_.cryptomatte_postprocess(kernel_globals, render_buffer, pixel_index);
+ }
+ });
+ });
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work_cpu.h b/intern/cycles/integrator/path_trace_work_cpu.h
new file mode 100644
index 00000000000..ab729bbf879
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work_cpu.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/integrator/integrator_state.h"
+
+#include "device/cpu/kernel_thread_globals.h"
+#include "device/device_queue.h"
+
+#include "integrator/path_trace_work.h"
+
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct KernelWorkTile;
+struct KernelGlobals;
+
+class CPUKernels;
+
+/* Implementation of PathTraceWork which schedules work onto queues pixel-by-pixel,
+ * for CPU devices.
+ *
+ * NOTE: For CPU rendering there are assumptions about the TBB arena size and the number of
+ * concurrent queues on the render device which make this work only usable on the CPU. */
+class PathTraceWorkCPU : public PathTraceWork {
+ public:
+ PathTraceWorkCPU(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag);
+
+ virtual void init_execution() override;
+
+ virtual void render_samples(RenderStatistics &statistics,
+ int start_sample,
+ int samples_num) override;
+
+ virtual void copy_to_gpu_display(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples) override;
+ virtual void destroy_gpu_resources(GPUDisplay *gpu_display) override;
+
+ virtual bool copy_render_buffers_from_device() override;
+ virtual bool copy_render_buffers_to_device() override;
+ virtual bool zero_render_buffers() override;
+
+ virtual int adaptive_sampling_converge_filter_count_active(float threshold, bool reset) override;
+ virtual void cryptomatte_postproces() override;
+
+ protected:
+ /* Core path tracing routine. Renders the given work tile on the given queue. */
+ void render_samples_full_pipeline(KernelGlobals *kernel_globals,
+ const KernelWorkTile &work_tile,
+ const int samples_num);
+
+ /* CPU kernels. */
+ const CPUKernels &kernels_;
+
+ /* Copy of kernel globals which is suitable for concurrent access from multiple threads.
+ *
+ * More specifically, each entry of `kernel_thread_globals_` is local to a single thread and
+ * nobody else accesses it, but some "localization" is required to decouple it from the kernel
+ * globals stored at the device level. */
+ vector<CPUKernelThreadGlobals> kernel_thread_globals_;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
new file mode 100644
index 00000000000..135466becc6
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -0,0 +1,933 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/path_trace_work_gpu.h"
+
+#include "device/device.h"
+
+#include "integrator/pass_accessor_gpu.h"
+#include "render/buffers.h"
+#include "render/gpu_display.h"
+#include "render/scene.h"
+#include "util/util_logging.h"
+#include "util/util_tbb.h"
+#include "util/util_time.h"
+
+#include "kernel/kernel_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag)
+ : PathTraceWork(device, film, device_scene, cancel_requested_flag),
+ queue_(device->gpu_queue_create()),
+ integrator_state_soa_kernel_features_(0),
+ integrator_queue_counter_(device, "integrator_queue_counter", MEM_READ_WRITE),
+ integrator_shader_sort_counter_(device, "integrator_shader_sort_counter", MEM_READ_WRITE),
+ integrator_shader_raytrace_sort_counter_(
+ device, "integrator_shader_raytrace_sort_counter", MEM_READ_WRITE),
+ integrator_next_shadow_catcher_path_index_(
+ device, "integrator_next_shadow_catcher_path_index", MEM_READ_WRITE),
+ queued_paths_(device, "queued_paths", MEM_READ_WRITE),
+ num_queued_paths_(device, "num_queued_paths", MEM_READ_WRITE),
+ work_tiles_(device, "work_tiles", MEM_READ_WRITE),
+ gpu_display_rgba_half_(device, "display buffer half", MEM_READ_WRITE),
+ max_num_paths_(queue_->num_concurrent_states(sizeof(IntegratorStateCPU))),
+ min_num_active_paths_(queue_->num_concurrent_busy_states()),
+ max_active_path_index_(0)
+{
+ memset(&integrator_state_gpu_, 0, sizeof(integrator_state_gpu_));
+
+ /* Limit the number of active paths to half of the overall state pool. This is due to the logic
+ * in the path compaction, which relies on the fact that regeneration does not happen sooner than
+ * when half of the states are available again. */
+ min_num_active_paths_ = min(min_num_active_paths_, max_num_paths_ / 2);
+}
+
+void PathTraceWorkGPU::alloc_integrator_soa()
+{
+ /* IntegrateState allocated as structure of arrays. */
+
+ /* Check if we already allocated memory for the required features. */
+ const uint kernel_features = device_scene_->data.kernel_features;
+ if ((integrator_state_soa_kernel_features_ & kernel_features) == kernel_features) {
+ return;
+ }
+ integrator_state_soa_kernel_features_ = kernel_features;
+
+ /* Allocate a device-only memory buffer for each struct member, and then write the pointers
+ * into a struct that resides in constant memory.
+ *
+ * TODO: store float3 in separate XYZ arrays. */
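+
+ /* For illustration only (hypothetical member, not part of the state template in this patch):
+ * a declaration such as
+ *   KERNEL_STRUCT_MEMBER(path, uint32_t, flag, KERNEL_FEATURE_PATH_TRACING)
+ * expands to roughly the following, allocating one SoA array and publishing its device pointer
+ * into the constant-memory struct:
+ *
+ *   device_only_memory<uint32_t> *array = new device_only_memory<uint32_t>(
+ *       device_, "integrator_state_flag");
+ *   array->alloc_to_device(max_num_paths_);
+ *   integrator_state_soa_.emplace_back(array);
+ *   integrator_state_gpu_.path.flag = (uint32_t *)array->device_pointer;
+ */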
+#define KERNEL_STRUCT_BEGIN(name) for (int array_index = 0;; array_index++) {
+#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
+ if ((kernel_features & feature) && (integrator_state_gpu_.parent_struct.name == nullptr)) { \
+ device_only_memory<type> *array = new device_only_memory<type>(device_, \
+ "integrator_state_" #name); \
+ array->alloc_to_device(max_num_paths_); \
+ integrator_state_soa_.emplace_back(array); \
+ integrator_state_gpu_.parent_struct.name = (type *)array->device_pointer; \
+ }
+#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
+ if ((kernel_features & feature) && \
+ (integrator_state_gpu_.parent_struct[array_index].name == nullptr)) { \
+ device_only_memory<type> *array = new device_only_memory<type>(device_, \
+ "integrator_state_" #name); \
+ array->alloc_to_device(max_num_paths_); \
+ integrator_state_soa_.emplace_back(array); \
+ integrator_state_gpu_.parent_struct[array_index].name = (type *)array->device_pointer; \
+ }
+#define KERNEL_STRUCT_END(name) \
+ break; \
+ }
+#define KERNEL_STRUCT_END_ARRAY(name, array_size) \
+ if (array_index == array_size - 1) { \
+ break; \
+ } \
+ }
+#include "kernel/integrator/integrator_state_template.h"
+#undef KERNEL_STRUCT_BEGIN
+#undef KERNEL_STRUCT_MEMBER
+#undef KERNEL_STRUCT_ARRAY_MEMBER
+#undef KERNEL_STRUCT_END
+#undef KERNEL_STRUCT_END_ARRAY
+}
+
+void PathTraceWorkGPU::alloc_integrator_queue()
+{
+ if (integrator_queue_counter_.size() == 0) {
+ integrator_queue_counter_.alloc(1);
+ integrator_queue_counter_.zero_to_device();
+ integrator_queue_counter_.copy_from_device();
+ integrator_state_gpu_.queue_counter = (IntegratorQueueCounter *)
+ integrator_queue_counter_.device_pointer;
+ }
+
+ /* Allocate data for active path index arrays. */
+ if (num_queued_paths_.size() == 0) {
+ num_queued_paths_.alloc(1);
+ num_queued_paths_.zero_to_device();
+ }
+
+ if (queued_paths_.size() == 0) {
+ queued_paths_.alloc(max_num_paths_);
+ /* TODO: this could be skipped if we had a function to just allocate on the device. */
+ queued_paths_.zero_to_device();
+ }
+}
+
+void PathTraceWorkGPU::alloc_integrator_sorting()
+{
+ /* Allocate arrays for shader sorting. */
+ const int max_shaders = device_scene_->data.max_shaders;
+ if (integrator_shader_sort_counter_.size() < max_shaders) {
+ integrator_shader_sort_counter_.alloc(max_shaders);
+ integrator_shader_sort_counter_.zero_to_device();
+
+ integrator_shader_raytrace_sort_counter_.alloc(max_shaders);
+ integrator_shader_raytrace_sort_counter_.zero_to_device();
+
+ integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
+ (int *)integrator_shader_sort_counter_.device_pointer;
+ integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
+ (int *)integrator_shader_raytrace_sort_counter_.device_pointer;
+ }
+}
+
+void PathTraceWorkGPU::alloc_integrator_path_split()
+{
+ if (integrator_next_shadow_catcher_path_index_.size() != 0) {
+ return;
+ }
+
+ integrator_next_shadow_catcher_path_index_.alloc(1);
+ /* TODO(sergey): Use queue? */
+ integrator_next_shadow_catcher_path_index_.zero_to_device();
+
+ integrator_state_gpu_.next_shadow_catcher_path_index =
+ (int *)integrator_next_shadow_catcher_path_index_.device_pointer;
+}
+
+void PathTraceWorkGPU::alloc_work_memory()
+{
+ alloc_integrator_soa();
+ alloc_integrator_queue();
+ alloc_integrator_sorting();
+ alloc_integrator_path_split();
+}
+
+void PathTraceWorkGPU::init_execution()
+{
+ queue_->init_execution();
+
+ /* Copy to device side struct in constant memory. */
+ device_->const_copy_to(
+ "__integrator_state", &integrator_state_gpu_, sizeof(integrator_state_gpu_));
+}
+
+void PathTraceWorkGPU::render_samples(RenderStatistics &statistics,
+ int start_sample,
+ int samples_num)
+{
+ /* Limit the number of states per tile and rely on greedy scheduling of tiles. This allows more
+ * work to be added (because tiles are smaller, so there is a higher chance that more paths will
+ * become busy after adding new tiles). This is especially important for the shadow catcher,
+ * which schedules work in halves of the available number of paths. */
+ work_tile_scheduler_.set_max_num_path_states(max_num_paths_ / 8);
+
+ work_tile_scheduler_.reset(effective_buffer_params_, start_sample, samples_num);
+
+ enqueue_reset();
+
+ int num_iterations = 0;
+ uint64_t num_busy_accum = 0;
+
+ /* TODO: set a hard limit in case of undetected kernel failures? */
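+ /* Main wavefront loop: alternate between feeding new tiles to the device and executing the
+ * most-queued path iteration kernel, until the scheduler runs out of work and all paths have
+ * terminated. */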
+ while (true) {
+ /* Enqueue work from the scheduler, on start or when there are not enough
+ * paths to keep the device occupied. */
+ bool finished;
+ if (enqueue_work_tiles(finished)) {
+ /* Copy stats from the device. */
+ queue_->copy_from_device(integrator_queue_counter_);
+
+ if (!queue_->synchronize()) {
+ break; /* Stop on error. */
+ }
+ }
+
+ if (is_cancel_requested()) {
+ break;
+ }
+
+ /* Stop if no more work remaining. */
+ if (finished) {
+ break;
+ }
+
+ /* Enqueue one of the path iteration kernels. */
+ if (enqueue_path_iteration()) {
+ /* Copy stats from the device. */
+ queue_->copy_from_device(integrator_queue_counter_);
+
+ if (!queue_->synchronize()) {
+ break; /* Stop on error. */
+ }
+ }
+
+ if (is_cancel_requested()) {
+ break;
+ }
+
+ num_busy_accum += get_num_active_paths();
+ ++num_iterations;
+ }
+
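+ /* Occupancy is the average number of busy states per iteration, normalized by the total size
+ * of the state pool. */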
+ statistics.occupancy = static_cast<float>(num_busy_accum) / num_iterations / max_num_paths_;
+}
+
+DeviceKernel PathTraceWorkGPU::get_most_queued_kernel() const
+{
+ const IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
+
+ int max_num_queued = 0;
+ DeviceKernel kernel = DEVICE_KERNEL_NUM;
+
+ for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
+ if (queue_counter->num_queued[i] > max_num_queued) {
+ kernel = (DeviceKernel)i;
+ max_num_queued = queue_counter->num_queued[i];
+ }
+ }
+
+ return kernel;
+}
+
+void PathTraceWorkGPU::enqueue_reset()
+{
+ void *args[] = {&max_num_paths_};
+ queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_RESET, max_num_paths_, args);
+ queue_->zero_to_device(integrator_queue_counter_);
+ queue_->zero_to_device(integrator_shader_sort_counter_);
+ queue_->zero_to_device(integrator_shader_raytrace_sort_counter_);
+
+ /* Enqueueing tiles needs to know the number of active paths, which is based on this counter.
+ * Zero the counter on the host side because `zero_to_device()` does not do it. */
+ if (integrator_queue_counter_.host_pointer) {
+ memset(integrator_queue_counter_.data(), 0, integrator_queue_counter_.memory_size());
+ }
+}
+
+bool PathTraceWorkGPU::enqueue_path_iteration()
+{
+ /* Find kernel to execute, with max number of queued paths. */
+ const IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
+
+ int num_active_paths = 0;
+ for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
+ num_active_paths += queue_counter->num_queued[i];
+ }
+
+ if (num_active_paths == 0) {
+ return false;
+ }
+
+ /* Find kernel to execute, with max number of queued paths. */
+ const DeviceKernel kernel = get_most_queued_kernel();
+ if (kernel == DEVICE_KERNEL_NUM) {
+ return false;
+ }
+
+ /* Finish shadows before potentially adding more shadow rays. We can only
+ * store one shadow ray in the integrator state. */
+ if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME) {
+ if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW]) {
+ enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
+ return true;
+ }
+ else if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW]) {
+ enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
+ return true;
+ }
+ }
+
+ /* Schedule kernel with maximum number of queued items. */
+ enqueue_path_iteration(kernel);
+ return true;
+}
+
+void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
+{
+ void *d_path_index = (void *)NULL;
+
+ /* Create array of path indices for which this kernel is queued to be executed. */
+ int work_size = max_active_path_index_;
+
+ IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
+ int num_queued = queue_counter->num_queued[kernel];
+
+ if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
+ /* Compute array of active paths, sorted by shader. */
+ work_size = num_queued;
+ d_path_index = (void *)queued_paths_.device_pointer;
+
+ compute_sorted_queued_paths(DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, kernel);
+ }
+ else if (num_queued < work_size) {
+ work_size = num_queued;
+ d_path_index = (void *)queued_paths_.device_pointer;
+
+ if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW) {
+ /* Compute array of active shadow paths for specific kernel. */
+ compute_queued_paths(DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY, kernel);
+ }
+ else {
+ /* Compute array of active paths for specific kernel. */
+ compute_queued_paths(DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY, kernel);
+ }
+ }
+
+ DCHECK_LE(work_size, max_num_paths_);
+
+ switch (kernel) {
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: {
+ /* Ray intersection kernels with integrator state. */
+ void *args[] = {&d_path_index, const_cast<int *>(&work_size)};
+
+ queue_->enqueue(kernel, work_size, args);
+ break;
+ }
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME: {
+ /* Shading kernels with integrator state and render buffer. */
+ void *d_render_buffer = (void *)buffers_->buffer.device_pointer;
+ void *args[] = {&d_path_index, &d_render_buffer, const_cast<int *>(&work_size)};
+
+ queue_->enqueue(kernel, work_size, args);
+ break;
+ }
+
+ default:
+ LOG(FATAL) << "Unhandled kernel " << device_kernel_as_string(kernel)
+ << " used for path iteration, should never happen.";
+ break;
+ }
+}
+
+void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel)
+{
+ int d_queued_kernel = queued_kernel;
+ void *d_counter = integrator_state_gpu_.sort_key_counter[d_queued_kernel];
+ assert(d_counter != nullptr);
+
+ /* Compute prefix sum of number of active paths with each shader. */
+ {
+ const int work_size = 1;
+ int max_shaders = device_scene_->data.max_shaders;
+ void *args[] = {&d_counter, &max_shaders};
+ queue_->enqueue(DEVICE_KERNEL_PREFIX_SUM, work_size, args);
+ }
+
+ queue_->zero_to_device(num_queued_paths_);
+
+ /* Launch kernel to fill the active paths arrays. */
+ {
+ /* TODO: this could be smaller for terminated paths based on amount of work we want
+ * to schedule. */
+ const int work_size = max_active_path_index_;
+
+ void *d_queued_paths = (void *)queued_paths_.device_pointer;
+ void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
+ void *args[] = {const_cast<int *>(&work_size),
+ &d_queued_paths,
+ &d_num_queued_paths,
+ &d_counter,
+ &d_queued_kernel};
+
+ queue_->enqueue(kernel, work_size, args);
+ }
+
+ if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE) {
+ queue_->zero_to_device(integrator_shader_sort_counter_);
+ }
+ else if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
+ queue_->zero_to_device(integrator_shader_raytrace_sort_counter_);
+ }
+ else {
+ assert(0);
+ }
+}
+
+void PathTraceWorkGPU::compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel)
+{
+ int d_queued_kernel = queued_kernel;
+
+ /* Launch kernel to fill the active paths arrays. */
+ const int work_size = max_active_path_index_;
+ void *d_queued_paths = (void *)queued_paths_.device_pointer;
+ void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
+ void *args[] = {
+ const_cast<int *>(&work_size), &d_queued_paths, &d_num_queued_paths, &d_queued_kernel};
+
+ queue_->zero_to_device(num_queued_paths_);
+ queue_->enqueue(kernel, work_size, args);
+}
+
+void PathTraceWorkGPU::compact_states(const int num_active_paths)
+{
+ if (num_active_paths == 0) {
+ max_active_path_index_ = 0;
+ }
+
+ /* Compact fragmented path states into the start of the array, moving any paths
+ * with index higher than the number of active paths into the gaps. */
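+ /* Illustrative example (not from this patch): with `num_active_paths` = 3 and active states at
+ * indices {0, 4, 7}, the states at indices 4 and 7 are moved into the free slots (1 and 2)
+ * inside [0, 3), after which only indices [0, 3) need to be considered active. */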
+ if (max_active_path_index_ == num_active_paths) {
+ return;
+ }
+
+ void *d_compact_paths = (void *)queued_paths_.device_pointer;
+ void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
+
+ /* Create array with terminated paths that we can write to. */
+ {
+ /* TODO: can the work size be reduced here? */
+ int offset = num_active_paths;
+ int work_size = num_active_paths;
+ void *args[] = {&work_size, &d_compact_paths, &d_num_queued_paths, &offset};
+ queue_->zero_to_device(num_queued_paths_);
+ queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY, work_size, args);
+ }
+
+ /* Create array of paths that we need to compact, where the path index is bigger
+ * than the number of active paths. */
+ {
+ int work_size = max_active_path_index_;
+ void *args[] = {
+ &work_size, &d_compact_paths, &d_num_queued_paths, const_cast<int *>(&num_active_paths)};
+ queue_->zero_to_device(num_queued_paths_);
+ queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY, work_size, args);
+ }
+
+ queue_->copy_from_device(num_queued_paths_);
+ queue_->synchronize();
+
+ int num_compact_paths = num_queued_paths_.data()[0];
+
+ /* Move paths into gaps. */
+ if (num_compact_paths > 0) {
+ int work_size = num_compact_paths;
+ int active_states_offset = 0;
+ int terminated_states_offset = num_active_paths;
+ void *args[] = {
+ &d_compact_paths, &active_states_offset, &terminated_states_offset, &work_size};
+ queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES, work_size, args);
+ }
+
+ queue_->synchronize();
+
+ /* Adjust the max active path index now that we know which part of the array is actually used. */
+ max_active_path_index_ = num_active_paths;
+}
+
+bool PathTraceWorkGPU::enqueue_work_tiles(bool &finished)
+{
+ /* If there are existing paths, wait for them to reach the intersect closest kernel, which will
+ * align the wavefront of the existing and newly added paths. */
+ /* TODO: Check whether counting new intersection kernels here will have a positive effect on
+ * performance. */
+ const DeviceKernel kernel = get_most_queued_kernel();
+ if (kernel != DEVICE_KERNEL_NUM && kernel != DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST) {
+ return false;
+ }
+
+ int num_active_paths = get_num_active_paths();
+
+ /* Don't schedule more work if canceling. */
+ if (is_cancel_requested()) {
+ if (num_active_paths == 0) {
+ finished = true;
+ }
+ return false;
+ }
+
+ finished = false;
+
+ vector<KernelWorkTile> work_tiles;
+
+ int max_num_camera_paths = max_num_paths_;
+ int num_predicted_splits = 0;
+
+ if (has_shadow_catcher()) {
+ /* When there are shadow catchers in the scene, a bounce from them will split the state. So we
+ * make sure there is enough space in the path states array to fit the split states.
+ *
+ * Basically, when adding N new paths we ensure that there are 2*N available path states, so
+ * that all the new paths can be split.
+ *
+ * Note that it is possible that some of the current states can still split, so we need to make
+ * sure there is enough space for them as well. */
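+
+ /* Illustrative numbers (not from this patch): with max_num_paths_ = 1024, num_active_paths =
+ * 256 and no in-flight splits, there are 768 available states; half of them (384) may be added
+ * as new camera paths, leaving the other 384 states free for the splits those paths may cause. */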
+
+ /* Number of currently in-flight states which can still split. */
+ const int num_scheduled_possible_split = shadow_catcher_count_possible_splits();
+
+ const int num_available_paths = max_num_paths_ - num_active_paths;
+ const int num_new_paths = num_available_paths / 2;
+ max_num_camera_paths = max(num_active_paths,
+ num_active_paths + num_new_paths - num_scheduled_possible_split);
+ num_predicted_splits += num_scheduled_possible_split + num_new_paths;
+ }
+
+ /* Schedule when we're out of paths or there are too few paths to keep the
+ * device occupied. */
+ int num_paths = num_active_paths;
+ if (num_paths == 0 || num_paths < min_num_active_paths_) {
+ /* Get work tiles until the maximum number of path is reached. */
+ while (num_paths < max_num_camera_paths) {
+ KernelWorkTile work_tile;
+ if (work_tile_scheduler_.get_work(&work_tile, max_num_camera_paths - num_paths)) {
+ work_tiles.push_back(work_tile);
+ num_paths += work_tile.w * work_tile.h * work_tile.num_samples;
+ }
+ else {
+ break;
+ }
+ }
+
+ /* If we couldn't get any more tiles, we're done. */
+ if (work_tiles.size() == 0 && num_paths == 0) {
+ finished = true;
+ return false;
+ }
+ }
+
+ /* Initialize paths from work tiles. */
+ if (work_tiles.size() == 0) {
+ return false;
+ }
+
+ /* Compact the state array when the number of paths becomes small relative to the known maximum
+ * path index, since a large maximum makes computing the active index arrays slow. */
+ compact_states(num_active_paths);
+
+ if (has_shadow_catcher()) {
+ integrator_next_shadow_catcher_path_index_.data()[0] = num_paths;
+ queue_->copy_to_device(integrator_next_shadow_catcher_path_index_);
+ }
+
+ enqueue_work_tiles((device_scene_->data.bake.use) ? DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE :
+ DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA,
+ work_tiles.data(),
+ work_tiles.size(),
+ num_active_paths,
+ num_predicted_splits);
+
+ return true;
+}
+
+void PathTraceWorkGPU::enqueue_work_tiles(DeviceKernel kernel,
+ const KernelWorkTile work_tiles[],
+ const int num_work_tiles,
+ const int num_active_paths,
+ const int num_predicted_splits)
+{
+ /* Copy work tiles to device. */
+ if (work_tiles_.size() < num_work_tiles) {
+ work_tiles_.alloc(num_work_tiles);
+ }
+
+ int path_index_offset = num_active_paths;
+ int max_tile_work_size = 0;
+ for (int i = 0; i < num_work_tiles; i++) {
+ KernelWorkTile &work_tile = work_tiles_.data()[i];
+ work_tile = work_tiles[i];
+
+ const int tile_work_size = work_tile.w * work_tile.h * work_tile.num_samples;
+
+ work_tile.path_index_offset = path_index_offset;
+ work_tile.work_size = tile_work_size;
+
+ path_index_offset += tile_work_size;
+
+ max_tile_work_size = max(max_tile_work_size, tile_work_size);
+ }
+
+ queue_->copy_to_device(work_tiles_);
+
+ void *d_work_tiles = (void *)work_tiles_.device_pointer;
+ void *d_render_buffer = (void *)buffers_->buffer.device_pointer;
+
+ /* Launch kernel. */
+ void *args[] = {&d_work_tiles,
+ const_cast<int *>(&num_work_tiles),
+ &d_render_buffer,
+ const_cast<int *>(&max_tile_work_size)};
+
+ queue_->enqueue(kernel, max_tile_work_size * num_work_tiles, args);
+
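+ /* New paths were appended after the currently active ones, and any predicted shadow catcher
+ * splits will land after those, so the maximum active index grows by both amounts. */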
+ max_active_path_index_ = path_index_offset + num_predicted_splits;
+}
+
+int PathTraceWorkGPU::get_num_active_paths()
+{
+ /* TODO: this is wrong, does not account for duplicates with shadow! */
+ IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
+
+ int num_paths = 0;
+ for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
+ DCHECK_GE(queue_counter->num_queued[i], 0)
+ << "Invalid number of queued states for kernel "
+ << device_kernel_as_string(static_cast<DeviceKernel>(i));
+ num_paths += queue_counter->num_queued[i];
+ }
+
+ return num_paths;
+}
+
+bool PathTraceWorkGPU::should_use_graphics_interop()
+{
+ /* There are a few complications with graphics interop when using multiple devices, caused by
+ * the fact that the GPUDisplay has a single texture:
+ *
+ * CUDA will return `CUDA_ERROR_NOT_SUPPORTED` from `cuGraphicsGLRegisterBuffer()` when
+ * attempting to register an OpenGL PBO which has already been mapped. Which makes sense,
+ * because otherwise one would run into a conflict about where the source of truth is. */
+ if (has_multiple_works()) {
+ return false;
+ }
+
+ if (!interop_use_checked_) {
+ Device *device = queue_->device;
+ interop_use_ = device->should_use_graphics_interop();
+
+ if (interop_use_) {
+ VLOG(2) << "Will be using graphics interop GPU display update.";
+ }
+ else {
+ VLOG(2) << "Will be using naive GPU display update.";
+ }
+
+ interop_use_checked_ = true;
+ }
+
+ return interop_use_;
+}
+
+void PathTraceWorkGPU::copy_to_gpu_display(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples)
+{
+ if (device_->have_error()) {
+ /* Don't attempt to update the GPU display if the device has errors: the error state would lead
+ * to wrong decisions about interop, causing more chained bugs. */
+ return;
+ }
+
+ if (!buffers_->buffer.device_pointer) {
+ LOG(WARNING) << "Request for GPU display update without allocated render buffers.";
+ return;
+ }
+
+ if (should_use_graphics_interop()) {
+ if (copy_to_gpu_display_interop(gpu_display, pass_mode, num_samples)) {
+ return;
+ }
+
+ /* If an error happens when trying to use graphics interop, fall back to the native
+ * implementation and don't attempt to use interop for further updates. */
+ interop_use_ = false;
+ }
+
+ copy_to_gpu_display_naive(gpu_display, pass_mode, num_samples);
+}
+
+void PathTraceWorkGPU::copy_to_gpu_display_naive(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples)
+{
+ const int full_x = effective_buffer_params_.full_x;
+ const int full_y = effective_buffer_params_.full_y;
+ const int width = effective_buffer_params_.width;
+ const int height = effective_buffer_params_.height;
+ const int final_width = buffers_->params.width;
+ const int final_height = buffers_->params.height;
+
+ const int texture_x = full_x - effective_full_params_.full_x;
+ const int texture_y = full_y - effective_full_params_.full_y;
+
+ /* Re-allocate display memory if needed, and make sure the device pointer is allocated.
+ *
+ * NOTE: allocation happens at the final resolution so that no re-allocation happens on every
+ * change of the resolution divider. However, if the display becomes smaller, shrink the
+ * allocated memory as well. */
+ if (gpu_display_rgba_half_.data_width != final_width ||
+ gpu_display_rgba_half_.data_height != final_height) {
+ gpu_display_rgba_half_.alloc(final_width, final_height);
+ /* TODO(sergey): There should be a way to make sure device-side memory is allocated without
+ * transferring zeroes to the device. */
+ queue_->zero_to_device(gpu_display_rgba_half_);
+ }
+
+ PassAccessor::Destination destination(film_->get_display_pass());
+ destination.d_pixels_half_rgba = gpu_display_rgba_half_.device_pointer;
+
+ get_render_tile_film_pixels(destination, pass_mode, num_samples);
+
+ gpu_display_rgba_half_.copy_from_device();
+
+ gpu_display->copy_pixels_to_texture(
+ gpu_display_rgba_half_.data(), texture_x, texture_y, width, height);
+}
+
+bool PathTraceWorkGPU::copy_to_gpu_display_interop(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples)
+{
+ if (!device_graphics_interop_) {
+ device_graphics_interop_ = queue_->graphics_interop_create();
+ }
+
+ const DeviceGraphicsInteropDestination graphics_interop_dst =
+ gpu_display->graphics_interop_get();
+ device_graphics_interop_->set_destination(graphics_interop_dst);
+
+ const device_ptr d_rgba_half = device_graphics_interop_->map();
+ if (!d_rgba_half) {
+ return false;
+ }
+
+ PassAccessor::Destination destination = get_gpu_display_destination_template(gpu_display);
+ destination.d_pixels_half_rgba = d_rgba_half;
+
+ get_render_tile_film_pixels(destination, pass_mode, num_samples);
+
+ device_graphics_interop_->unmap();
+
+ return true;
+}
+
+void PathTraceWorkGPU::destroy_gpu_resources(GPUDisplay *gpu_display)
+{
+ if (!device_graphics_interop_) {
+ return;
+ }
+ gpu_display->graphics_interop_activate();
+ device_graphics_interop_ = nullptr;
+ gpu_display->graphics_interop_deactivate();
+}
+
+void PathTraceWorkGPU::get_render_tile_film_pixels(const PassAccessor::Destination &destination,
+ PassMode pass_mode,
+ int num_samples)
+{
+ const KernelFilm &kfilm = device_scene_->data.film;
+
+ const PassAccessor::PassAccessInfo pass_access_info = get_display_pass_access_info(pass_mode);
+ const PassAccessorGPU pass_accessor(queue_.get(), pass_access_info, kfilm.exposure, num_samples);
+
+ pass_accessor.get_render_tile_pixels(buffers_.get(), effective_buffer_params_, destination);
+}
+
+int PathTraceWorkGPU::adaptive_sampling_converge_filter_count_active(float threshold, bool reset)
+{
+ const int num_active_pixels = adaptive_sampling_convergence_check_count_active(threshold, reset);
+
+ if (num_active_pixels) {
+ enqueue_adaptive_sampling_filter_x();
+ enqueue_adaptive_sampling_filter_y();
+ queue_->synchronize();
+ }
+
+ return num_active_pixels;
+}
+
+int PathTraceWorkGPU::adaptive_sampling_convergence_check_count_active(float threshold, bool reset)
+{
+ device_vector<uint> num_active_pixels(device_, "num_active_pixels", MEM_READ_WRITE);
+ num_active_pixels.alloc(1);
+
+ queue_->zero_to_device(num_active_pixels);
+
+ const int work_size = effective_buffer_params_.width * effective_buffer_params_.height;
+
+ void *args[] = {&buffers_->buffer.device_pointer,
+ const_cast<int *>(&effective_buffer_params_.full_x),
+ const_cast<int *>(&effective_buffer_params_.full_y),
+ const_cast<int *>(&effective_buffer_params_.width),
+ const_cast<int *>(&effective_buffer_params_.height),
+ &threshold,
+ &reset,
+ &effective_buffer_params_.offset,
+ &effective_buffer_params_.stride,
+ &num_active_pixels.device_pointer};
+
+ queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK, work_size, args);
+
+ queue_->copy_from_device(num_active_pixels);
+ queue_->synchronize();
+
+ return num_active_pixels.data()[0];
+}
+
+void PathTraceWorkGPU::enqueue_adaptive_sampling_filter_x()
+{
+ const int work_size = effective_buffer_params_.height;
+
+ void *args[] = {&buffers_->buffer.device_pointer,
+ &effective_buffer_params_.full_x,
+ &effective_buffer_params_.full_y,
+ &effective_buffer_params_.width,
+ &effective_buffer_params_.height,
+ &effective_buffer_params_.offset,
+ &effective_buffer_params_.stride};
+
+ queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X, work_size, args);
+}
+
+void PathTraceWorkGPU::enqueue_adaptive_sampling_filter_y()
+{
+ const int work_size = effective_buffer_params_.width;
+
+ void *args[] = {&buffers_->buffer.device_pointer,
+ &effective_buffer_params_.full_x,
+ &effective_buffer_params_.full_y,
+ &effective_buffer_params_.width,
+ &effective_buffer_params_.height,
+ &effective_buffer_params_.offset,
+ &effective_buffer_params_.stride};
+
+ queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y, work_size, args);
+}
+
+void PathTraceWorkGPU::cryptomatte_postproces()
+{
+ const int work_size = effective_buffer_params_.width * effective_buffer_params_.height;
+
+ void *args[] = {&buffers_->buffer.device_pointer,
+ const_cast<int *>(&work_size),
+ &effective_buffer_params_.offset,
+ &effective_buffer_params_.stride};
+
+ queue_->enqueue(DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS, work_size, args);
+}
+
+bool PathTraceWorkGPU::copy_render_buffers_from_device()
+{
+ queue_->copy_from_device(buffers_->buffer);
+
+ /* Synchronize so that the CPU-side buffer is available at the exit of this function. */
+ return queue_->synchronize();
+}
+
+bool PathTraceWorkGPU::copy_render_buffers_to_device()
+{
+ queue_->copy_to_device(buffers_->buffer);
+
+ /* NOTE: Direct device access to the buffers only happens within this path trace work. The rest
+ * of the communication happens via API calls which involve `copy_render_buffers_from_device()`,
+ * which will perform synchronization as needed. */
+
+ return true;
+}
+
+bool PathTraceWorkGPU::zero_render_buffers()
+{
+ queue_->zero_to_device(buffers_->buffer);
+
+ return true;
+}
+
+bool PathTraceWorkGPU::has_shadow_catcher() const
+{
+ return device_scene_->data.integrator.has_shadow_catcher;
+}
+
+int PathTraceWorkGPU::shadow_catcher_count_possible_splits()
+{
+ if (max_active_path_index_ == 0) {
+ return 0;
+ }
+
+ if (!has_shadow_catcher()) {
+ return 0;
+ }
+
+ queue_->zero_to_device(num_queued_paths_);
+
+ const int work_size = max_active_path_index_;
+ void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
+ void *args[] = {const_cast<int *>(&work_size), &d_num_queued_paths};
+
+ queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS, work_size, args);
+ queue_->copy_from_device(num_queued_paths_);
+ queue_->synchronize();
+
+ return num_queued_paths_.data()[0];
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h
new file mode 100644
index 00000000000..38788122b0d
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -0,0 +1,165 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/integrator/integrator_state.h"
+
+#include "device/device_graphics_interop.h"
+#include "device/device_memory.h"
+#include "device/device_queue.h"
+
+#include "integrator/path_trace_work.h"
+#include "integrator/work_tile_scheduler.h"
+
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct KernelWorkTile;
+
+/* Implementation of PathTraceWork which schedules work to the device in tiles which are sized
+ * to match the device queue's number of path states.
+ * This implementation best suits devices which have a lot of integrator states, such as GPUs. */
+class PathTraceWorkGPU : public PathTraceWork {
+ public:
+ PathTraceWorkGPU(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag);
+
+ virtual void alloc_work_memory() override;
+ virtual void init_execution() override;
+
+ virtual void render_samples(RenderStatistics &statistics,
+ int start_sample,
+ int samples_num) override;
+
+ virtual void copy_to_gpu_display(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples) override;
+ virtual void destroy_gpu_resources(GPUDisplay *gpu_display) override;
+
+ virtual bool copy_render_buffers_from_device() override;
+ virtual bool copy_render_buffers_to_device() override;
+ virtual bool zero_render_buffers() override;
+
+ virtual int adaptive_sampling_converge_filter_count_active(float threshold, bool reset) override;
+ virtual void cryptomatte_postproces() override;
+
+ protected:
+ void alloc_integrator_soa();
+ void alloc_integrator_queue();
+ void alloc_integrator_sorting();
+ void alloc_integrator_path_split();
+
+ /* Returns DEVICE_KERNEL_NUM if there are no scheduled kernels. */
+ DeviceKernel get_most_queued_kernel() const;
+
+ void enqueue_reset();
+
+ bool enqueue_work_tiles(bool &finished);
+ void enqueue_work_tiles(DeviceKernel kernel,
+ const KernelWorkTile work_tiles[],
+ const int num_work_tiles,
+ const int num_active_paths,
+ const int num_predicted_splits);
+
+ bool enqueue_path_iteration();
+ void enqueue_path_iteration(DeviceKernel kernel);
+
+ void compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
+ void compute_sorted_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
+
+ void compact_states(const int num_active_paths);
+
+ int get_num_active_paths();
+
+ /* Check whether graphics interop can be used for the GPUDisplay update. */
+ bool should_use_graphics_interop();
+
+ /* Naive implementation of the `copy_to_gpu_display()` which performs film conversion on the
+ * device, then copies pixels to the host and pushes them to the `gpu_display`. */
+ void copy_to_gpu_display_naive(GPUDisplay *gpu_display, PassMode pass_mode, int num_samples);
+
+ /* Implementation of `copy_to_gpu_display()` which uses driver's OpenGL/GPU interoperability
+ * functionality, avoiding copy of pixels to the host. */
+ bool copy_to_gpu_display_interop(GPUDisplay *gpu_display, PassMode pass_mode, int num_samples);
+
+ /* Synchronously run film conversion kernel and store display result in the given destination. */
+ void get_render_tile_film_pixels(const PassAccessor::Destination &destination,
+ PassMode pass_mode,
+ int num_samples);
+
+ int adaptive_sampling_convergence_check_count_active(float threshold, bool reset);
+ void enqueue_adaptive_sampling_filter_x();
+ void enqueue_adaptive_sampling_filter_y();
+
+ bool has_shadow_catcher() const;
+
+ /* Count how many currently scheduled paths can still split. */
+ int shadow_catcher_count_possible_splits();
+
+ /* Integrator queue. */
+ unique_ptr<DeviceQueue> queue_;
+
+ /* Scheduler which gives work to path tracing threads. */
+ WorkTileScheduler work_tile_scheduler_;
+
+ /* Integrator state for paths. */
+ IntegratorStateGPU integrator_state_gpu_;
+ /* SoA arrays for integrator state. */
+ vector<unique_ptr<device_memory>> integrator_state_soa_;
+ uint integrator_state_soa_kernel_features_;
+ /* Keep track of number of queued kernels. */
+ device_vector<IntegratorQueueCounter> integrator_queue_counter_;
+ /* Shader sorting. */
+ device_vector<int> integrator_shader_sort_counter_;
+ device_vector<int> integrator_shader_raytrace_sort_counter_;
+ /* Path split. */
+ device_vector<int> integrator_next_shadow_catcher_path_index_;
+
+ /* Temporary buffer to get an array of queued path for a particular kernel. */
+ device_vector<int> queued_paths_;
+ device_vector<int> num_queued_paths_;
+
+ /* Temporary buffer for passing work tiles to kernel. */
+ device_vector<KernelWorkTile> work_tiles_;
+
+ /* Temporary buffer used by `copy_to_gpu_display()` whenever graphics interoperability is not
+ * available. Allocated on demand. */
+ device_vector<half4> gpu_display_rgba_half_;
+
+ unique_ptr<DeviceGraphicsInterop> device_graphics_interop_;
+
+ /* Cached result of device->should_use_graphics_interop(). */
+ bool interop_use_checked_ = false;
+ bool interop_use_ = false;
+
+ /* Maximum number of concurrent integrator states. */
+ int max_num_paths_;
+
+ /* Minimum number of paths which keeps the device busy. If the actual number of paths falls below
+ * this value, more work will be scheduled. */
+ int min_num_active_paths_;
+
+ /* Maximum path index. The effective number of paths used may be smaller than
+ * the size of the integrator_state_ buffer, so iteration can skip the unused
+ * tail of the buffer. */
+ int max_active_path_index_;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/render_scheduler.cpp b/intern/cycles/integrator/render_scheduler.cpp
new file mode 100644
index 00000000000..3e5b3417a6a
--- /dev/null
+++ b/intern/cycles/integrator/render_scheduler.cpp
@@ -0,0 +1,1187 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/render_scheduler.h"
+
+#include "render/session.h"
+#include "render/tile.h"
+#include "util/util_logging.h"
+#include "util/util_math.h"
+#include "util/util_time.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* --------------------------------------------------------------------
+ * Render scheduler.
+ */
+
+RenderScheduler::RenderScheduler(TileManager &tile_manager, const SessionParams &params)
+ : headless_(params.headless),
+ background_(params.background),
+ pixel_size_(params.pixel_size),
+ tile_manager_(tile_manager),
+ default_start_resolution_divider_(pixel_size_ * 8)
+{
+ use_progressive_noise_floor_ = !background_;
+}
+
+void RenderScheduler::set_need_schedule_cryptomatte(bool need_schedule_cryptomatte)
+{
+ need_schedule_cryptomatte_ = need_schedule_cryptomatte;
+}
+
+void RenderScheduler::set_need_schedule_rebalance(bool need_schedule_rebalance)
+{
+ need_schedule_rebalance_works_ = need_schedule_rebalance;
+}
+
+bool RenderScheduler::is_background() const
+{
+ return background_;
+}
+
+void RenderScheduler::set_denoiser_params(const DenoiseParams &params)
+{
+ denoiser_params_ = params;
+}
+
+void RenderScheduler::set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling)
+{
+ adaptive_sampling_ = adaptive_sampling;
+}
+
+bool RenderScheduler::is_adaptive_sampling_used() const
+{
+ return adaptive_sampling_.use;
+}
+
+void RenderScheduler::set_start_sample(int start_sample)
+{
+ start_sample_ = start_sample;
+}
+
+int RenderScheduler::get_start_sample() const
+{
+ return start_sample_;
+}
+
+void RenderScheduler::set_num_samples(int num_samples)
+{
+ num_samples_ = num_samples;
+}
+
+int RenderScheduler::get_num_samples() const
+{
+ return num_samples_;
+}
+
+void RenderScheduler::set_time_limit(double time_limit)
+{
+ time_limit_ = time_limit;
+}
+
+double RenderScheduler::get_time_limit() const
+{
+ return time_limit_;
+}
+
+int RenderScheduler::get_rendered_sample() const
+{
+ DCHECK_GT(get_num_rendered_samples(), 0);
+
+ return start_sample_ + get_num_rendered_samples() - 1;
+}
+
+int RenderScheduler::get_num_rendered_samples() const
+{
+ return state_.num_rendered_samples;
+}
+
+void RenderScheduler::reset(const BufferParams &buffer_params, int num_samples)
+{
+ buffer_params_ = buffer_params;
+
+ update_start_resolution_divider();
+
+ set_num_samples(num_samples);
+
+ /* In background mode never do lower resolution render preview, as it is not really supported
+ * by the software. */
+ if (background_) {
+ state_.resolution_divider = 1;
+ }
+ else {
+ /* NOTE: Divide by 2 because of the way scheduling works: it advances the resolution divider
+ * first and then initializes the render work. */
+ state_.resolution_divider = start_resolution_divider_ * 2;
+ }
+
+ state_.num_rendered_samples = 0;
+ state_.last_display_update_time = 0.0;
+ state_.last_display_update_sample = -1;
+
+ state_.last_rebalance_time = 0.0;
+ state_.num_rebalance_requested = 0;
+ state_.num_rebalance_changes = 0;
+ state_.last_rebalance_changed = false;
+ state_.need_rebalance_at_next_work = false;
+
+ /* TODO(sergey): Choose better initial value. */
+ /* NOTE: The adaptive sampling settings might not be available here yet. */
+ state_.adaptive_sampling_threshold = 0.4f;
+
+ state_.last_work_tile_was_denoised = false;
+ state_.tile_result_was_written = false;
+ state_.postprocess_work_scheduled = false;
+ state_.full_frame_work_scheduled = false;
+ state_.full_frame_was_written = false;
+
+ state_.path_trace_finished = false;
+
+ state_.start_render_time = 0.0;
+ state_.end_render_time = 0.0;
+ state_.time_limit_reached = false;
+
+ state_.occupancy_num_samples = 0;
+ state_.occupancy = 1.0f;
+
+ first_render_time_.path_trace_per_sample = 0.0;
+ first_render_time_.denoise_time = 0.0;
+ first_render_time_.display_update_time = 0.0;
+
+ path_trace_time_.reset();
+ denoise_time_.reset();
+ adaptive_filter_time_.reset();
+ display_update_time_.reset();
+ rebalance_time_.reset();
+}
+
+void RenderScheduler::reset_for_next_tile()
+{
+ reset(buffer_params_, num_samples_);
+}
+
+bool RenderScheduler::render_work_reschedule_on_converge(RenderWork &render_work)
+{
+ /* Move to the next resolution divider. Assume adaptive filtering is not needed during
+ * navigation. */
+ if (state_.resolution_divider != pixel_size_) {
+ return false;
+ }
+
+ if (render_work_reschedule_on_idle(render_work)) {
+ return true;
+ }
+
+ state_.path_trace_finished = true;
+
+ bool denoiser_delayed, denoiser_ready_to_display;
+ render_work.tile.denoise = work_need_denoise(denoiser_delayed, denoiser_ready_to_display);
+
+ render_work.display.update = work_need_update_display(denoiser_delayed);
+ render_work.display.use_denoised_result = denoiser_ready_to_display;
+
+ return false;
+}
+
+bool RenderScheduler::render_work_reschedule_on_idle(RenderWork &render_work)
+{
+ if (!use_progressive_noise_floor_) {
+ return false;
+ }
+
+ /* Move to the next resolution divider. Assume adaptive filtering is not needed during
+ * navigation. */
+ if (state_.resolution_divider != pixel_size_) {
+ return false;
+ }
+
+ if (adaptive_sampling_.use) {
+ if (state_.adaptive_sampling_threshold > adaptive_sampling_.threshold) {
+ state_.adaptive_sampling_threshold = max(state_.adaptive_sampling_threshold / 2,
+ adaptive_sampling_.threshold);
+
+ render_work.adaptive_sampling.threshold = state_.adaptive_sampling_threshold;
+ render_work.adaptive_sampling.reset = true;
+
+ return true;
+ }
+ }
+
+ return false;
+}
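For illustration only (numbers assumed, not from the patch): with the initial threshold of 0.4 set in reset() and a user threshold of 0.05, each idle reschedule halves the progressive noise floor until the user value is reached:

  0.4 -> max(0.4 / 2, 0.05) = 0.2
  0.2 -> max(0.2 / 2, 0.05) = 0.1
  0.1 -> max(0.1 / 2, 0.05) = 0.05   (final threshold; the next call returns false)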
+
+void RenderScheduler::render_work_reschedule_on_cancel(RenderWork &render_work)
+{
+ VLOG(3) << "Schedule work for cancel.";
+
+ /* Un-schedule samples: they will not be rendered and should not be counted. */
+ state_.num_rendered_samples -= render_work.path_trace.num_samples;
+
+ const bool has_rendered_samples = get_num_rendered_samples() != 0;
+
+ /* Reset all fields of the previous work, canceling things like adaptive sampling filtering and
+ * denoising.
+ * However, the write requests need to be preserved, since they can not be recovered and the
+ * writes are only to happen once. */
+ const bool tile_write = render_work.tile.write;
+ const bool full_write = render_work.full.write;
+
+ render_work = RenderWork();
+
+ render_work.tile.write = tile_write;
+ render_work.full.write = full_write;
+
+ /* Do not write the tile if it has zero samples in it; treat it similarly to all other tiles
+ * which got canceled. */
+ if (!state_.tile_result_was_written && has_rendered_samples) {
+ render_work.tile.write = true;
+ }
+
+ if (!state_.full_frame_was_written) {
+ render_work.full.write = true;
+ }
+
+ /* Update the current tile, but only if any sample was rendered.
+ * This allows the latest state of the tile to stay visible while the full buffer is processed.
+ *
+ * Note that if there are no samples in the current tile, its render buffer might still contain
+ * pixels from a previous state.
+ *
+ * If the full result was written, then there is no way any updates were made to the render
+ * buffers. And the buffers might have been freed from the device, so display update is not
+ * possible. */
+ if (has_rendered_samples && !state_.full_frame_was_written) {
+ render_work.display.update = true;
+ }
+}
+
+bool RenderScheduler::done() const
+{
+ if (state_.resolution_divider != pixel_size_) {
+ return false;
+ }
+
+ if (state_.path_trace_finished || state_.time_limit_reached) {
+ return true;
+ }
+
+ return get_num_rendered_samples() >= num_samples_;
+}
+
+RenderWork RenderScheduler::get_render_work()
+{
+ check_time_limit_reached();
+
+ const double time_now = time_dt();
+
+ if (done()) {
+ RenderWork render_work;
+ render_work.resolution_divider = state_.resolution_divider;
+
+ if (!set_postprocess_render_work(&render_work)) {
+ set_full_frame_render_work(&render_work);
+ }
+
+ if (!render_work) {
+ state_.end_render_time = time_now;
+ }
+
+ update_state_for_render_work(render_work);
+
+ return render_work;
+ }
+
+ RenderWork render_work;
+
+ if (state_.resolution_divider != pixel_size_) {
+ state_.resolution_divider = max(state_.resolution_divider / 2, pixel_size_);
+ state_.num_rendered_samples = 0;
+ state_.last_display_update_sample = -1;
+ }
+
+ render_work.resolution_divider = state_.resolution_divider;
+
+ render_work.path_trace.start_sample = get_start_sample_to_path_trace();
+ render_work.path_trace.num_samples = get_num_samples_to_path_trace();
+
+ render_work.init_render_buffers = (render_work.path_trace.start_sample == get_start_sample());
+
+ /* NOTE: Rebalance scheduler requires current number of samples to not be advanced forward. */
+ render_work.rebalance = work_need_rebalance();
+
+ /* NOTE: Advance number of samples now, so that filter and denoising check can see that all the
+ * samples are rendered. */
+ state_.num_rendered_samples += render_work.path_trace.num_samples;
+
+ render_work.adaptive_sampling.filter = work_need_adaptive_filter();
+ render_work.adaptive_sampling.threshold = work_adaptive_threshold();
+ render_work.adaptive_sampling.reset = false;
+
+ bool denoiser_delayed, denoiser_ready_to_display;
+ render_work.tile.denoise = work_need_denoise(denoiser_delayed, denoiser_ready_to_display);
+
+ render_work.tile.write = done();
+
+ render_work.display.update = work_need_update_display(denoiser_delayed);
+ render_work.display.use_denoised_result = denoiser_ready_to_display;
+
+ if (done()) {
+ set_postprocess_render_work(&render_work);
+ }
+
+ update_state_for_render_work(render_work);
+
+ return render_work;
+}
+
+void RenderScheduler::update_state_for_render_work(const RenderWork &render_work)
+{
+ const double time_now = time_dt();
+
+ if (render_work.rebalance) {
+ state_.last_rebalance_time = time_now;
+ ++state_.num_rebalance_requested;
+ }
+
+ /* A fallback display update time, for the case the display update fails or there is no
+ * display at all. */
+ if (render_work.display.update) {
+ state_.last_display_update_time = time_now;
+ state_.last_display_update_sample = state_.num_rendered_samples;
+ }
+
+ state_.last_work_tile_was_denoised = render_work.tile.denoise;
+ state_.tile_result_was_written |= render_work.tile.write;
+ state_.full_frame_was_written |= render_work.full.write;
+}
+
+bool RenderScheduler::set_postprocess_render_work(RenderWork *render_work)
+{
+ if (state_.postprocess_work_scheduled) {
+ return false;
+ }
+ state_.postprocess_work_scheduled = true;
+
+ bool any_scheduled = false;
+
+ if (need_schedule_cryptomatte_) {
+ render_work->cryptomatte.postprocess = true;
+ any_scheduled = true;
+ }
+
+ if (denoiser_params_.use && !state_.last_work_tile_was_denoised) {
+ render_work->tile.denoise = true;
+ any_scheduled = true;
+ }
+
+ if (!state_.tile_result_was_written) {
+ render_work->tile.write = true;
+ any_scheduled = true;
+ }
+
+ if (any_scheduled) {
+ render_work->display.update = true;
+ }
+
+ return any_scheduled;
+}
+
+void RenderScheduler::set_full_frame_render_work(RenderWork *render_work)
+{
+ if (state_.full_frame_work_scheduled) {
+ return;
+ }
+
+ if (!tile_manager_.has_multiple_tiles()) {
+ /* There is only a single tile, so all work has been performed already. */
+ return;
+ }
+
+ if (!tile_manager_.done()) {
+ /* There are still tiles to be rendered. */
+ return;
+ }
+
+ if (state_.full_frame_was_written) {
+ return;
+ }
+
+ state_.full_frame_work_scheduled = true;
+
+ render_work->full.write = true;
+}
+
+/* Given the time it took to complete a task at the current resolution divider, approximate how
+ * long it would have taken to complete it at the final resolution. */
+static double approximate_final_time(const RenderWork &render_work, double time)
+{
+ if (render_work.resolution_divider == 1) {
+ return time;
+ }
+
+ const double resolution_divider_sq = render_work.resolution_divider *
+ render_work.resolution_divider;
+ return time * resolution_divider_sq;
+}
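As a quick sanity check with assumed numbers: a work that took 0.05 s at resolution_divider = 4 rendered only 1/16 of the final pixel count, so the approximated final-resolution time is 0.05 * 4 * 4 = 0.8 s. A divider of 1 returns the measured time unchanged.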
+
+void RenderScheduler::report_work_begin(const RenderWork &render_work)
+{
+ /* Start counting render time when rendering samples at their final resolution.
+ *
+ /* NOTE: The work might have an all-zero path trace part: this happens when post-processing
+ * work is scheduled after the path tracing. Checking only the start sample does not work here
+ * because it might be 0 in that case as well. Instead, check whether path tracing is actually
+ * happening, as it is expected to happen in the first work. */
+ if (render_work.resolution_divider == pixel_size_ && render_work.path_trace.num_samples != 0 &&
+ render_work.path_trace.start_sample == get_start_sample()) {
+ state_.start_render_time = time_dt();
+ }
+}
+
+void RenderScheduler::report_path_trace_time(const RenderWork &render_work,
+ double time,
+ bool is_cancelled)
+{
+ path_trace_time_.add_wall(time);
+
+ if (is_cancelled) {
+ return;
+ }
+
+ const double final_time_approx = approximate_final_time(render_work, time);
+
+ if (work_is_usable_for_first_render_estimation(render_work)) {
+ first_render_time_.path_trace_per_sample = final_time_approx /
+ render_work.path_trace.num_samples;
+ }
+
+ if (work_report_reset_average(render_work)) {
+ path_trace_time_.reset_average();
+ }
+
+ path_trace_time_.add_average(final_time_approx, render_work.path_trace.num_samples);
+
+ VLOG(4) << "Average path tracing time: " << path_trace_time_.get_average() << " seconds.";
+}
+
+void RenderScheduler::report_path_trace_occupancy(const RenderWork &render_work, float occupancy)
+{
+ state_.occupancy_num_samples = render_work.path_trace.num_samples;
+ state_.occupancy = occupancy;
+ VLOG(4) << "Measured path tracing occupancy: " << occupancy;
+}
+
+void RenderScheduler::report_adaptive_filter_time(const RenderWork &render_work,
+ double time,
+ bool is_cancelled)
+{
+ adaptive_filter_time_.add_wall(time);
+
+ if (is_cancelled) {
+ return;
+ }
+
+ const double final_time_approx = approximate_final_time(render_work, time);
+
+ if (work_report_reset_average(render_work)) {
+ adaptive_filter_time_.reset_average();
+ }
+
+ adaptive_filter_time_.add_average(final_time_approx, render_work.path_trace.num_samples);
+
+ VLOG(4) << "Average adaptive sampling filter time: " << adaptive_filter_time_.get_average()
+ << " seconds.";
+}
+
+void RenderScheduler::report_denoise_time(const RenderWork &render_work, double time)
+{
+ denoise_time_.add_wall(time);
+
+ const double final_time_approx = approximate_final_time(render_work, time);
+
+ if (work_is_usable_for_first_render_estimation(render_work)) {
+ first_render_time_.denoise_time = final_time_approx;
+ }
+
+ if (work_report_reset_average(render_work)) {
+ denoise_time_.reset_average();
+ }
+
+ denoise_time_.add_average(final_time_approx);
+
+ VLOG(4) << "Average denoising time: " << denoise_time_.get_average() << " seconds.";
+}
+
+void RenderScheduler::report_display_update_time(const RenderWork &render_work, double time)
+{
+ display_update_time_.add_wall(time);
+
+ const double final_time_approx = approximate_final_time(render_work, time);
+
+ if (work_is_usable_for_first_render_estimation(render_work)) {
+ first_render_time_.display_update_time = final_time_approx;
+ }
+
+ if (work_report_reset_average(render_work)) {
+ display_update_time_.reset_average();
+ }
+
+ display_update_time_.add_average(final_time_approx);
+
+ VLOG(4) << "Average display update time: " << display_update_time_.get_average() << " seconds.";
+
+ /* Move the display update moment further in time, so that the logic which checks when the last
+ * update happened has a more reliable point in time (one without the path tracing and denoising
+ * parts of the render work). */
+ state_.last_display_update_time = time_dt();
+}
+
+void RenderScheduler::report_rebalance_time(const RenderWork &render_work,
+ double time,
+ bool balance_changed)
+{
+ rebalance_time_.add_wall(time);
+
+ if (work_report_reset_average(render_work)) {
+ rebalance_time_.reset_average();
+ }
+
+ rebalance_time_.add_average(time);
+
+ if (balance_changed) {
+ ++state_.num_rebalance_changes;
+ }
+
+ state_.last_rebalance_changed = balance_changed;
+
+ VLOG(4) << "Average rebalance time: " << rebalance_time_.get_average() << " seconds.";
+}
+
+string RenderScheduler::full_report() const
+{
+ const double render_wall_time = state_.end_render_time - state_.start_render_time;
+ const int num_rendered_samples = get_num_rendered_samples();
+
+ string result = "\nRender Scheduler Summary\n\n";
+
+ {
+ string mode;
+ if (headless_) {
+ mode = "Headless";
+ }
+ else if (background_) {
+ mode = "Background";
+ }
+ else {
+ mode = "Interactive";
+ }
+ result += "Mode: " + mode + "\n";
+ }
+
+ result += "Resolution: " + to_string(buffer_params_.width) + "x" +
+ to_string(buffer_params_.height) + "\n";
+
+ result += "\nAdaptive sampling:\n";
+ result += " Use: " + string_from_bool(adaptive_sampling_.use) + "\n";
+ if (adaptive_sampling_.use) {
+ result += " Step: " + to_string(adaptive_sampling_.adaptive_step) + "\n";
+ result += " Min Samples: " + to_string(adaptive_sampling_.min_samples) + "\n";
+ result += " Threshold: " + to_string(adaptive_sampling_.threshold) + "\n";
+ }
+
+ result += "\nDenoiser:\n";
+ result += " Use: " + string_from_bool(denoiser_params_.use) + "\n";
+ if (denoiser_params_.use) {
+ result += " Type: " + string(denoiserTypeToHumanReadable(denoiser_params_.type)) + "\n";
+ result += " Start Sample: " + to_string(denoiser_params_.start_sample) + "\n";
+
+ string passes = "Color";
+ if (denoiser_params_.use_pass_albedo) {
+ passes += ", Albedo";
+ }
+ if (denoiser_params_.use_pass_normal) {
+ passes += ", Normal";
+ }
+
+ result += " Passes: " + passes + "\n";
+ }
+
+ if (state_.num_rebalance_requested) {
+ result += "\nRebalancer:\n";
+ result += " Number of requested rebalances: " + to_string(state_.num_rebalance_requested) +
+ "\n";
+ result += " Number of performed rebalances: " + to_string(state_.num_rebalance_changes) +
+ "\n";
+ }
+
+ result += "\nTime (in seconds):\n";
+ result += string_printf(" %20s %20s %20s\n", "", "Wall", "Average");
+ result += string_printf(" %20s %20f %20f\n",
+ "Path Tracing",
+ path_trace_time_.get_wall(),
+ path_trace_time_.get_average());
+
+ if (adaptive_sampling_.use) {
+ result += string_printf(" %20s %20f %20f\n",
+ "Adaptive Filter",
+ adaptive_filter_time_.get_wall(),
+ adaptive_filter_time_.get_average());
+ }
+
+ if (denoiser_params_.use) {
+ result += string_printf(
+ " %20s %20f %20f\n", "Denoiser", denoise_time_.get_wall(), denoise_time_.get_average());
+ }
+
+ result += string_printf(" %20s %20f %20f\n",
+ "Display Update",
+ display_update_time_.get_wall(),
+ display_update_time_.get_average());
+
+ if (state_.num_rebalance_requested) {
+ result += string_printf(" %20s %20f %20f\n",
+ "Rebalance",
+ rebalance_time_.get_wall(),
+ rebalance_time_.get_average());
+ }
+
+ const double total_time = path_trace_time_.get_wall() + adaptive_filter_time_.get_wall() +
+ denoise_time_.get_wall() + display_update_time_.get_wall();
+ result += "\n Total: " + to_string(total_time) + "\n";
+
+ result += string_printf(
+ "\nRendered %d samples in %f seconds\n", num_rendered_samples, render_wall_time);
+
+ /* When adaptive sampling is used the average time becomes meaningless, because different samples
+ * will likely render a different number of pixels. */
+ if (!adaptive_sampling_.use) {
+ result += string_printf("Average time per sample: %f seconds\n",
+ render_wall_time / num_rendered_samples);
+ }
+
+ return result;
+}
+
+double RenderScheduler::guess_display_update_interval_in_seconds() const
+{
+ return guess_display_update_interval_in_seconds_for_num_samples(state_.num_rendered_samples);
+}
+
+double RenderScheduler::guess_display_update_interval_in_seconds_for_num_samples(
+ int num_rendered_samples) const
+{
+ double update_interval = guess_display_update_interval_in_seconds_for_num_samples_no_limit(
+ num_rendered_samples);
+
+ if (time_limit_ != 0.0 && state_.start_render_time != 0.0) {
+ const double remaining_render_time = max(0.0,
+ time_limit_ - (time_dt() - state_.start_render_time));
+
+ update_interval = min(update_interval, remaining_render_time);
+ }
+
+ return update_interval;
+}
+
+/* TODO(sergey): This is just a quick implementation, exact values might need to be tweaked based
+ * on more careful experiments with viewport rendering. */
+double RenderScheduler::guess_display_update_interval_in_seconds_for_num_samples_no_limit(
+ int num_rendered_samples) const
+{
+ /* TODO(sergey): Need a decision on whether this should be using number of samples rendered
+ * within the current render session, or use absolute number of samples with the start sample
+ * taken into account. It will depend on whether the start sample offset clears the render
+ * buffer. */
+
+ if (state_.need_rebalance_at_next_work) {
+ return 0.1;
+ }
+ if (state_.last_rebalance_changed) {
+ return 0.2;
+ }
+
+ if (headless_) {
+ /* In headless mode do rare updates, so that the device occupancy is high, but there are still
+ * progress messages printed to the logs. */
+ return 30.0;
+ }
+
+ if (background_) {
+ if (num_rendered_samples < 32) {
+ return 1.0;
+ }
+ return 2.0;
+ }
+
+ /* Render time and number of samples rendered are used to figure out the display update interval.
+ * Render time is used to allow for fast display updates in the first few seconds of rendering
+ * on fast devices. Number of samples rendered is used to allow for potentially quicker display
+ * updates on slow devices during the first few samples. */
+ const double render_time = path_trace_time_.get_wall();
+ if (render_time < 1) {
+ return 0.1;
+ }
+ if (render_time < 2) {
+ return 0.25;
+ }
+ if (render_time < 4) {
+ return 0.5;
+ }
+ if (render_time < 8 || num_rendered_samples < 32) {
+ return 1.0;
+ }
+ return 2.0;
+}
+
+int RenderScheduler::calculate_num_samples_per_update() const
+{
+ const double time_per_sample_average = path_trace_time_.get_average();
+ const double num_samples_in_second = pixel_size_ * pixel_size_ / time_per_sample_average;
+
+ const double update_interval_in_seconds = guess_display_update_interval_in_seconds();
+
+ return max(int(num_samples_in_second * update_interval_in_seconds), 1);
+}
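Worked example with assumed timings (pixel_size_ = 1, an average of 0.2 s per sample, and a guessed update interval of 1.0 s):

  num_samples_in_second = 1 * 1 / 0.2 = 5
  samples per update    = max(int(5 * 1.0), 1) = 5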
+
+int RenderScheduler::get_start_sample_to_path_trace() const
+{
+ return start_sample_ + state_.num_rendered_samples;
+}
+
+/* Round the number of samples to the closest power of two.
+ * Rounding may go to the higher or the lower value depending on which one is closer. This keeps
+ * the number of samples a power of two without diverging too much from the planned number of
+ * samples. */
+static inline uint round_num_samples_to_power_of_2(const uint num_samples)
+{
+ if (num_samples == 1) {
+ return 1;
+ }
+
+ if (is_power_of_two(num_samples)) {
+ return num_samples;
+ }
+
+ const uint num_samples_up = next_power_of_two(num_samples);
+ const uint num_samples_down = num_samples_up - (num_samples_up >> 1);
+
+ const uint delta_up = num_samples_up - num_samples;
+ const uint delta_down = num_samples - num_samples_down;
+
+ if (delta_up <= delta_down) {
+ return num_samples_up;
+ }
+
+ return num_samples_down;
+}
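A few spot checks of the rounding behavior of the function above (values derived from its definition, for illustration):

  round_num_samples_to_power_of_2(5)  == 4    (4 is closer than 8)
  round_num_samples_to_power_of_2(6)  == 8    (tie between 4 and 8; rounding up wins)
  round_num_samples_to_power_of_2(11) == 8
  round_num_samples_to_power_of_2(12) == 16   (tie between 8 and 16; rounding up wins)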
+
+int RenderScheduler::get_num_samples_to_path_trace() const
+{
+ if (state_.resolution_divider != pixel_size_) {
+ return get_num_samples_during_navigation(state_.resolution_divider);
+ }
+
+ /* Always start the full resolution render with a single sample. This gives more immediate
+ * feedback to artists, and gathers timing information for subsequent path tracing works. Do it
+ * in headless mode as well, to get an estimate of how long samples are taking. */
+ if (state_.num_rendered_samples == 0) {
+ return 1;
+ }
+
+ const int num_samples_per_update = calculate_num_samples_per_update();
+ const int path_trace_start_sample = get_start_sample_to_path_trace();
+
+ /* Round the number of samples to a power of two, so that the division of path states into tiles
+ * works out to integer values.
+ * This might make updates happen more rarely due to rounding up. In the test scenes this is not
+ * a huge deal, because more than 8 samples are not seen to be rendered between updates. If that
+ * becomes a problem, extra rules can be added, such as never rounding up by more than N
+ * samples. */
+ const int num_samples_pot = round_num_samples_to_power_of_2(num_samples_per_update);
+
+ const int max_num_samples_to_render = start_sample_ + num_samples_ - path_trace_start_sample;
+
+ int num_samples_to_render = min(num_samples_pot, max_num_samples_to_render);
+
+ /* When enough statistics are available and an offline render is being done, prefer to keep the
+ * device occupied. */
+ if (state_.occupancy_num_samples && (background_ || headless_)) {
+ /* Keep occupancy at about 0.5 (this is more of an empirical figure which seems to match scenes
+ * with good performance without forcing occupancy to be higher). */
+ int num_samples_to_occupy = state_.occupancy_num_samples;
+ if (state_.occupancy < 0.5f) {
+ num_samples_to_occupy = lround(state_.occupancy_num_samples * 0.7f / state_.occupancy);
+ }
+
+ num_samples_to_render = max(num_samples_to_render,
+ min(num_samples_to_occupy, max_num_samples_to_render));
+ }
+
+ /* If adaptive sampling is not used, render as many samples per update as possible, keeping the
+ * device fully occupied, without much display update overhead. */
+ if (!adaptive_sampling_.use) {
+ return num_samples_to_render;
+ }
+
+ /* TODO(sergey): Add extra "clamping" here so that none of the filtering points is missing. This
+ * is to ensure that the final render is pixel-matched regardless of how many samples per second
+ * compute device can do. */
+
+ return adaptive_sampling_.align_samples(path_trace_start_sample, num_samples_to_render);
+}
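Illustrative numbers for the occupancy correction above (assumed, not from the patch): if the previous work rendered state_.occupancy_num_samples = 4 samples at a measured occupancy of 0.35, then num_samples_to_occupy = lround(4 * 0.7 / 0.35) = 8, which is clamped to the samples still left to render and used as a lower bound for the next work.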
+
+int RenderScheduler::get_num_samples_during_navigation(int resolution_divider) const
+{
+ /* Special trick for fast navigation: schedule multiple samples during fast navigation
+ * (which will prefer to use lower resolution to keep up with refresh rate). This gives more
+ * usable visual feedback for artists. There are a couple of tricks though. */
+
+ if (is_denoise_active_during_update()) {
+ /* When denoising is used during navigation prefer using a higher resolution with fewer samples
+ * (scheduling fewer samples here makes the resolution_divider calculation use a lower value for
+ * the divider). This is because both OpenImageDenoise and the OptiX denoiser give visually
+ * better results on a higher resolution image with fewer samples. */
+ return 1;
+ }
+
+ if (resolution_divider <= pixel_size_) {
+ /* When the resolution divider is at or below the pixel size, schedule one sample. This doesn't
+ * affect the sample count at this resolution division, but instead assists in the calculation
+ * of the resolution divider. */
+ return 1;
+ }
+
+ if (resolution_divider == pixel_size_ * 2) {
+ /* When resolution divider is the previous step to the final resolution, schedule two samples.
+ * This is so that rendering on lower resolution does not exceed time that it takes to render
+ * first sample at the full resolution. */
+ return 2;
+ }
+
+ /* Always render 4 samples, even if the scene is configured for fewer.
+ * The idea here is to have enough information on the screen. A resolution divider of 2 gives 4
+ * times as many samples for the same cost, so the overall worst case timing is the same as one
+ * sample at the final resolution. */
+ return 4;
+}
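In summary, assuming pixel_size_ = 1: with denoising active during updates the function always returns 1; otherwise it returns 1 at divider 1, 2 at divider 2, and 4 at divider 4 or higher.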
+
+bool RenderScheduler::work_need_adaptive_filter() const
+{
+ return adaptive_sampling_.need_filter(get_rendered_sample());
+}
+
+float RenderScheduler::work_adaptive_threshold() const
+{
+ if (!use_progressive_noise_floor_) {
+ return adaptive_sampling_.threshold;
+ }
+
+ return max(state_.adaptive_sampling_threshold, adaptive_sampling_.threshold);
+}
+
+bool RenderScheduler::work_need_denoise(bool &delayed, bool &ready_to_display)
+{
+ delayed = false;
+ ready_to_display = true;
+
+ if (!denoiser_params_.use) {
+ /* Denoising is disabled, no need to schedule work for it. */
+ return false;
+ }
+
+ if (done()) {
+ /* Always denoise at the last sample. */
+ return true;
+ }
+
+ if (background_) {
+ /* Background render, only denoise when rendering the last sample. */
+ /* TODO(sergey): Follow similar logic to viewport, giving an overview of how final denoised
+ * image looks like even for the background rendering. */
+ return false;
+ }
+
+ /* Viewport render. */
+
+ /* Navigation might render multiple samples at a lower resolution. Those are not to be counted as
+ * final samples. */
+ const int num_samples_finished = state_.resolution_divider == pixel_size_ ?
+ state_.num_rendered_samples :
+ 1;
+
+ /* Immediately denoise when we reach the start sample or last sample. */
+ if (num_samples_finished == denoiser_params_.start_sample ||
+ num_samples_finished == num_samples_) {
+ return true;
+ }
+
+ /* Do not denoise until the sample at which denoising should start is reached. */
+ if (num_samples_finished < denoiser_params_.start_sample) {
+ ready_to_display = false;
+ return false;
+ }
+
+ /* Avoid excessive denoising in viewport after reaching a certain sample count and render time.
+ */
+ /* TODO(sergey): Consider making time interval and sample configurable. */
+ delayed = (path_trace_time_.get_wall() > 4 && num_samples_finished >= 20 &&
+ (time_dt() - state_.last_display_update_time) < 1.0);
+
+ return !delayed;
+}
+
+bool RenderScheduler::work_need_update_display(const bool denoiser_delayed)
+{
+ if (headless_) {
+ /* Force-disable display updates in headless mode. There is nothing to display the in-progress
+ * result on. */
+ return false;
+ }
+
+ if (denoiser_delayed) {
+ /* If denoiser has been delayed the display can not be updated as it will not contain
+ * up-to-date state of the render result. */
+ return false;
+ }
+
+ if (!adaptive_sampling_.use) {
+ /* When adaptive sampling is not used the work is scheduled in a way that keeps the render
+ * device busy for long enough, so that the display update can happen right after the
+ * rendering. */
+ return true;
+ }
+
+ if (done() || state_.last_display_update_sample == -1) {
+ /* Make sure the initial and final results of adaptive sampling are communicated to the
+ * display. */
+ return true;
+ }
+
+ /* For development purposes of adaptive sampling it might be very useful to see all updates of
+ * active pixels after the convergence check. However, it would cause a slowdown for regular
+ * users. Possibly, make it a debug panel option to allow rapid updates to ease development
+ * without the need to re-compile. */
+ // if (work_need_adaptive_filter()) {
+ // return true;
+ // }
+
+ /* When adaptive sampling is used, it is possible that only a handful of samples of a very simple
+ * scene will be scheduled to a powerful device (in order to not "miss" any of the filtering
+ * points). We take care of skipping updates here based on when the previous display update
+ * happened. */
+ const double update_interval = guess_display_update_interval_in_seconds_for_num_samples(
+ state_.last_display_update_sample);
+ return (time_dt() - state_.last_display_update_time) > update_interval;
+}
+
+bool RenderScheduler::work_need_rebalance()
+{
+ /* This is the minimum interval, as the rebalancing can not happen more often than the path trace
+ * work. */
+ static const double kRebalanceIntervalInSeconds = 1;
+
+ if (!need_schedule_rebalance_works_) {
+ return false;
+ }
+
+ if (state_.resolution_divider != pixel_size_) {
+ /* Don't rebalance at a non-final resolution divider. Some reasons for this:
+ * - It will introduce unnecessary overhead during navigation.
+ * - Per-render device timing information is not very reliable yet. */
+ return false;
+ }
+
+ if (state_.num_rendered_samples == 0) {
+ state_.need_rebalance_at_next_work = true;
+ return false;
+ }
+
+ if (state_.need_rebalance_at_next_work) {
+ state_.need_rebalance_at_next_work = false;
+ return true;
+ }
+
+ if (state_.last_rebalance_changed) {
+ return true;
+ }
+
+ return (time_dt() - state_.last_rebalance_time) > kRebalanceIntervalInSeconds;
+}
+
+void RenderScheduler::update_start_resolution_divider()
+{
+ if (start_resolution_divider_ == 0) {
+ /* The resolution divider has never been calculated before: use the default divider, so that the
+ * initial behavior is reasonable, giving a chance to collect real timing numbers. */
+ start_resolution_divider_ = default_start_resolution_divider_;
+ VLOG(3) << "Initial resolution divider is " << start_resolution_divider_;
+ return;
+ }
+
+ if (first_render_time_.path_trace_per_sample == 0.0) {
+ /* Not enough information to calculate better resolution, keep the existing one. */
+ return;
+ }
+
+ const double desired_update_interval_in_seconds =
+ guess_viewport_navigation_update_interval_in_seconds();
+
+ const double actual_time_per_update = first_render_time_.path_trace_per_sample +
+ first_render_time_.denoise_time +
+ first_render_time_.display_update_time;
+
+ /* Allow some percentage of tolerance, so that if the render time is close enough we prefer the
+ * higher resolution instead of going to a much lower resolution with a time well below the
+ * desired one. */
+ const int resolution_divider_for_update = calculate_resolution_divider_for_time(
+ desired_update_interval_in_seconds * 1.4, actual_time_per_update);
+
+ /* TODO(sergey): Need to add hysteresis to avoid resolution divider bouncing around when actual
+ * render time is somewhere on a boundary between two resolutions. */
+
+ /* Never increase the resolution beyond what the pixel size allows (which could otherwise happen
+ * if the scene is simple and the compute device is fast). */
+ start_resolution_divider_ = max(resolution_divider_for_update, pixel_size_);
+
+ VLOG(3) << "Calculated resolution divider is " << start_resolution_divider_;
+}
+
+double RenderScheduler::guess_viewport_navigation_update_interval_in_seconds() const
+{
+ if (is_denoise_active_during_update()) {
+ /* Use a lower value than in the non-denoised case, to allow more pixels to reconstruct the
+ * image from. With the faster updates and the extra compute required, the resolution otherwise
+ * becomes too low to give usable feedback. */
+ /* NOTE: Based on performance of OpenImageDenoiser on CPU. For OptiX denoiser or other denoiser
+ * on GPU the value might need to become lower for faster navigation. */
+ return 1.0 / 12.0;
+ }
+
+ /* For the best match with Blender's viewport the refresh rate should be 60fps, which avoids
+ * "jelly" effects. However, on non-trivial scenes this can only be achieved with high values of
+ * the resolution divider, which does not give very pleasant updates during navigation.
+ * Choose less frequent updates to allow more noise-free and higher resolution updates. */
+
+ /* TODO(sergey): Could look into a heuristic which allows 60fps if the resolution divider is not
+ * too high. Alternatively, synchronize Blender's overlay updates to Cycles updates. */
+
+ return 1.0 / 30.0;
+}
+
+bool RenderScheduler::is_denoise_active_during_update() const
+{
+ if (!denoiser_params_.use) {
+ return false;
+ }
+
+ if (denoiser_params_.start_sample > 1) {
+ return false;
+ }
+
+ return true;
+}
+
+bool RenderScheduler::work_is_usable_for_first_render_estimation(const RenderWork &render_work)
+{
+ return render_work.resolution_divider == pixel_size_ &&
+ render_work.path_trace.start_sample == start_sample_;
+}
+
+bool RenderScheduler::work_report_reset_average(const RenderWork &render_work)
+{
+ /* When rendering at a non-final resolution divider the time average is not very useful, because
+ * it will either bias the average down (due to lower render times on the smaller images) or give
+ * an incorrect result when trying to estimate the time which would have been spent at the final
+ * resolution.
+ *
+ * So the average is only accumulated for the latest resolution divider which was rendered. */
+ return render_work.resolution_divider != pixel_size_;
+}
+
+void RenderScheduler::check_time_limit_reached()
+{
+ if (time_limit_ == 0.0) {
+ /* No limit is enforced. */
+ return;
+ }
+
+ if (state_.start_render_time == 0.0) {
+ /* Rendering did not start yet. */
+ return;
+ }
+
+ const double current_time = time_dt();
+
+ if (current_time - state_.start_render_time < time_limit_) {
+ /* Time limit is not reached yet. */
+ return;
+ }
+
+ state_.time_limit_reached = true;
+ state_.end_render_time = current_time;
+}
+
+/* --------------------------------------------------------------------
+ * Utility functions.
+ */
+
+int RenderScheduler::calculate_resolution_divider_for_time(double desired_time, double actual_time)
+{
+ /* TODO(sergey): There should be a non-iterative analytical formula here. */
+
+ int resolution_divider = 1;
+
+ /* This algorithm iterates through resolution dividers until a divider is found that achieves
+ * the desired render time. A limit of default_start_resolution_divider_ is put in place as the
+ * maximum resolution divider to avoid an unreadable viewport due to a low resolution.
+ * pre_resolution_division_samples and post_resolution_division_samples are used in this
+ * calculation to better predict the performance impact of changing resolution divisions as
+ * the sample count can also change between resolution divisions. */
+ while (actual_time > desired_time && resolution_divider < default_start_resolution_divider_) {
+ int pre_resolution_division_samples = get_num_samples_during_navigation(resolution_divider);
+ resolution_divider = resolution_divider * 2;
+ int post_resolution_division_samples = get_num_samples_during_navigation(resolution_divider);
+ actual_time /= 4.0 * pre_resolution_division_samples / post_resolution_division_samples;
+ }
+
+ return resolution_divider;
+}
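Worked example with assumed inputs (pixel_size_ = 1, default_start_resolution_divider_ = 8, no denoising during navigation), a desired time of about 0.047 s and an actual full-resolution update time of 0.5 s:

  divider 1 -> 2: 0.5   / (4 * 1 / 2) = 0.25
  divider 2 -> 4: 0.25  / (4 * 2 / 4) = 0.125
  divider 4 -> 8: 0.125 / (4 * 4 / 4) = 0.031   (below 0.047, loop ends)

so the function returns a divider of 8.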
+
+int calculate_resolution_divider_for_resolution(int width, int height, int resolution)
+{
+ if (resolution == INT_MAX) {
+ return 1;
+ }
+
+ int resolution_divider = 1;
+ while (width * height > resolution * resolution) {
+ width = max(1, width / 2);
+ height = max(1, height / 2);
+
+ resolution_divider <<= 1;
+ }
+
+ return resolution_divider;
+}
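For example, calculate_resolution_divider_for_resolution(1920, 1080, 512) compares 1920 * 1080 = 2,073,600 against 512 * 512 = 262,144: it halves to 960x540 (divider 2), then to 480x270 (divider 4), which fits, so 4 is returned.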
+
+int calculate_resolution_for_divider(int width, int height, int resolution_divider)
+{
+ const int pixel_area = width * height;
+ const int resolution = lround(sqrt(pixel_area));
+
+ return resolution / resolution_divider;
+}
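For example, for a 1920x1080 frame the pixel area is 2,073,600 and its square root is exactly 1440, so the effective resolution reported is 1440 at divider 1, 720 at divider 2, and 360 at divider 4.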
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/render_scheduler.h b/intern/cycles/integrator/render_scheduler.h
new file mode 100644
index 00000000000..b7b598fb10c
--- /dev/null
+++ b/intern/cycles/integrator/render_scheduler.h
@@ -0,0 +1,466 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/adaptive_sampling.h"
+#include "integrator/denoiser.h" /* For DenoiseParams. */
+#include "render/buffers.h"
+#include "util/util_string.h"
+
+CCL_NAMESPACE_BEGIN
+
+class SessionParams;
+class TileManager;
+
+class RenderWork {
+ public:
+ int resolution_divider = 1;
+
+ /* Initialize render buffers.
+ * Includes steps like zero-ing the buffer on the device, and optional reading of pixels from the
+ * baking target. */
+ bool init_render_buffers = false;
+
+ /* Path tracing samples information. */
+ struct {
+ int start_sample = 0;
+ int num_samples = 0;
+ } path_trace;
+
+ struct {
+ /* Check for convergence and filter the mask. */
+ bool filter = false;
+
+ float threshold = 0.0f;
+
+ /* Reset the convergence flag when filtering, forcing a re-check of whether the pixel converged. */
+ bool reset = false;
+ } adaptive_sampling;
+
+ struct {
+ bool postprocess = false;
+ } cryptomatte;
+
+ /* Work related to the current tile. */
+ struct {
+ /* Write render buffers of the current tile.
+ *
+ * It is up to the path trace to decide whether writing should happen via user-provided
+ * callback into the rendering software, or via tile manager into a partial file. */
+ bool write = false;
+
+ bool denoise = false;
+ } tile;
+
+ /* Work related to the full-frame render buffer. */
+ struct {
+ /* Write full render result.
+ * Implies reading the partial file from disk. */
+ bool write = false;
+ } full;
+
+ /* Display which is used to visualize render result. */
+ struct {
+ /* Display needs to be updated for the new render. */
+ bool update = false;
+
+ /* Display can use denoised result if available. */
+ bool use_denoised_result = true;
+ } display;
+
+ /* Re-balance multi-device scheduling after rendering this work.
+ * Note that the scheduler does not know anything about devices, so if there is only a single
+ * device used, then it is up to the PathTracer to ignore the balancing. */
+ bool rebalance = false;
+
+ /* Conversion to bool, to simplify checks about whether there is anything to be done for this
+ * work. */
+ inline operator bool() const
+ {
+ return path_trace.num_samples || adaptive_sampling.filter || display.update || tile.denoise ||
+ tile.write || full.write;
+ }
+};
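A minimal sketch (not part of the patch) of how a caller could drain the scheduler declared below; render_samples_somehow() is a hypothetical stand-in for the path tracing side, and tile_manager, session_params and buffer_params are assumed to exist:

  RenderScheduler scheduler(tile_manager, session_params);
  scheduler.reset(buffer_params, 128);
  while (true) {
    RenderWork work = scheduler.get_render_work();
    if (!work) {
      /* operator bool() above: no path tracing, filtering, denoising or writes left. */
      break;
    }
    render_samples_somehow(work); /* Hypothetical: honor work.path_trace, work.tile, etc. */
  }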
+
+class RenderScheduler {
+ public:
+ RenderScheduler(TileManager &tile_manager, const SessionParams &params);
+
+ /* Specify whether cryptomatte-related works are to be scheduled. */
+ void set_need_schedule_cryptomatte(bool need_schedule_cryptomatte);
+
+ /* Allows disabling of the work re-balancing works, so that as much as possible is scheduled to
+ * a single device. */
+ void set_need_schedule_rebalance(bool need_schedule_rebalance);
+
+ bool is_background() const;
+
+ void set_denoiser_params(const DenoiseParams &params);
+ void set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling);
+
+ bool is_adaptive_sampling_used() const;
+
+ /* Start sample for path tracing.
+ * The scheduler will schedule work using this sample as the first one. */
+ void set_start_sample(int start_sample);
+ int get_start_sample() const;
+
+ /* Number of samples to render, starting from start sample.
+ * The scheduler will schedule work in the range of
+ * [start_sample, start_sample + num_samples - 1], inclusively. */
+ void set_num_samples(int num_samples);
+ int get_num_samples() const;
+
+ /* Time limit for the path tracing tasks, in seconds.
+ * Zero disables the limit. */
+ void set_time_limit(double time_limit);
+ double get_time_limit() const;
+
+ /* Get sample up to which rendering has been done.
+ * This is an absolute 0-based value.
+ *
+ * For example, if the start sample is 10 and 5 samples were rendered, then this call will
+ * return 14.
+ *
+ * If there were no samples rendered, then the behavior is undefined. */
+ int get_rendered_sample() const;
+
+ /* Get number of samples rendered within the current scheduling session.
+ *
+ * For example, if the start sample is 10 and 5 samples were rendered, then this call will
+ * return 5.
+ *
+ * Note that this is based on the scheduling information: once work has been handed out for
+ * rendering, the scheduler considers that work done. */
+ int get_num_rendered_samples() const;
+
+ /* Reset scheduler, indicating that rendering will happen from scratch.
+ * Resets current rendered state, as well as scheduling information. */
+ void reset(const BufferParams &buffer_params, int num_samples);
+
+ /* Reset the scheduler upon switching to the next tile.
+ * Will keep the same number of samples and full-frame render parameters, but will reset progress
+ * and allow render works to be scheduled from the beginning of the new tile. */
+ void reset_for_next_tile();
+
+ /* Reschedule adaptive sampling work when all pixels have converged.
+ * If there is nothing else to be done for the adaptive sampling (pixels converged to the final
+ * threshold) then false is returned and the render scheduler will stop scheduling path tracing
+ * works. Otherwise the work's adaptive sampling settings are modified to continue with a lower
+ * threshold. */
+ bool render_work_reschedule_on_converge(RenderWork &render_work);
+
+ /* Reschedule adaptive sampling work when the device is mostly idle, but not all pixels have
+ * converged yet.
+ * If re-scheduling is not possible (adaptive sampling is already happening with the final
+ * threshold, and the path tracer is to finish the current pixels) then false is returned. */
+ bool render_work_reschedule_on_idle(RenderWork &render_work);
+
+ /* Reschedule work when rendering has been requested to cancel.
+ *
+ * Will skip all work which is not needed anymore because no more samples will be added (for
+ * example, adaptive sampling filtering and convergence check will be skipped).
+ * Will enable all work needed to make sure all passes are communicated to the software.
+ *
+ * NOTE: Should be used before passing work to `PathTrace::render_samples()`. */
+ void render_work_reschedule_on_cancel(RenderWork &render_work);
+
+ RenderWork get_render_work();
+
+ /* Report that the path tracer started to work, after scene update and loading kernels. */
+ void report_work_begin(const RenderWork &render_work);
+
+ /* Report time (in seconds) which corresponding part of work took. */
+ void report_path_trace_time(const RenderWork &render_work, double time, bool is_cancelled);
+ void report_path_trace_occupancy(const RenderWork &render_work, float occupancy);
+ void report_adaptive_filter_time(const RenderWork &render_work, double time, bool is_cancelled);
+ void report_denoise_time(const RenderWork &render_work, double time);
+ void report_display_update_time(const RenderWork &render_work, double time);
+ void report_rebalance_time(const RenderWork &render_work, double time, bool balance_changed);
+
+ /* Generate full multi-line report of the rendering process, including rendering parameters,
+ * times, and so on. */
+ string full_report() const;
+
+ protected:
+ /* Check whether all work has been scheduled, or the time limit was exceeded.
+ *
+ * NOTE: Tricky bit: if the time limit was reached done() is considered to be true, but some
+ * extra work still needs to be scheduled to denoise and write the final result. */
+ bool done() const;
+
+ /* Update scheduling state for newly scheduled work.
+ * Takes care of things like tracking whether the work was ever denoised, whether the tile was
+ * written, and similar state. */
+ void update_state_for_render_work(const RenderWork &render_work);
+
+ /* Returns true if any work was scheduled. */
+ bool set_postprocess_render_work(RenderWork *render_work);
+
+ /* Set work which is to be performed after all tiles have been rendered. */
+ void set_full_frame_render_work(RenderWork *render_work);
+
+ /* Update the start resolution divider based on the accumulated timing information, preserving a
+ * pleasant navigation feel. */
+ void update_start_resolution_divider();
+
+ /* Calculate desired update interval in seconds based on the current timings and settings.
+ * Will give an interval which provides good feeling updates during viewport navigation. */
+ double guess_viewport_navigation_update_interval_in_seconds() const;
+
+ /* Check whether denoising is active during interactive update while resolution divider is not
+ * unit. */
+ bool is_denoise_active_during_update() const;
+
+ /* Heuristic which aims to give a perceptually pleasant display update interval: at lower sample
+ * counts and near the beginning of rendering, updates happen more often, while at higher sample
+ * counts and later in the render, updates happen less often and device occupancy goes up. */
+ double guess_display_update_interval_in_seconds() const;
+ double guess_display_update_interval_in_seconds_for_num_samples(int num_rendered_samples) const;
+ double guess_display_update_interval_in_seconds_for_num_samples_no_limit(
+ int num_rendered_samples) const;
+
+ /* Calculate the number of samples which can be rendered within the current desired update
+ * interval, as calculated by `guess_display_update_interval_in_seconds()`. */
+ int calculate_num_samples_per_update() const;
+
+ /* Get the start sample and the number of samples which are to be path traced in the current
+ * work. */
+ int get_start_sample_to_path_trace() const;
+ int get_num_samples_to_path_trace() const;
+
+ /* Calculate how many samples are to be rendered during navigation, for the given resolution
+ * divider. */
+ int get_num_samples_during_navigation(int resolution_divider) const;
+
+ /* Whether adaptive sampling convergence check and filter is to happen. */
+ bool work_need_adaptive_filter() const;
+
+ /* Calculate threshold for adaptive sampling. */
+ float work_adaptive_threshold() const;
+
+ /* Check whether current work needs denoising.
+ * Denoising is not needed if the denoiser is not configured, or when denoising is happening too
+ * often.
+ *
+ * The `delayed` argument will be set to true when the denoiser is configured for use, but
+ * denoising was delayed to a later sample to reduce overhead.
+ *
+ * ready_to_display will be false if we may have a denoised result that is outdated due to
+ * increased samples. */
+ bool work_need_denoise(bool &delayed, bool &ready_to_display);
+
+ /* Check whether the current work needs to update the display.
+ *
+ * The `denoiser_delayed` is what `work_need_denoise()` returned as delayed denoiser flag. */
+ bool work_need_update_display(const bool denoiser_delayed);
+
+ /* Check whether it is time to perform rebalancing for the render work. */
+ bool work_need_rebalance();
+
+ /* Check whether the timings of the given work are usable for storing in `first_render_time_`
+ * for the resolution divider calculation. */
+ bool work_is_usable_for_first_render_estimation(const RenderWork &render_work);
+
+ /* Check whether a timing report about the given work needs to reset the accumulated average
+ * time. */
+ bool work_report_reset_average(const RenderWork &render_work);
+
+ /* Check whether the render time limit has been reached (or exceeded), and if so store related
+ * information in the state, so that rendering is considered finished and it is possible to
+ * report average render time information. */
+ void check_time_limit_reached();
+
+ /* Helper class to keep track of task timing.
+ *
+ * Contains two parts: wall time and average. The wall time is the actual wall time it took to
+ * complete all tasks of a type. It is always advanced when the PathTracer reports a time update.
+ *
+ * The average time is used for scheduling purposes. It is an estimate of how long it takes to
+ * perform the task at the final resolution. */
+ class TimeWithAverage {
+ public:
+ inline void reset()
+ {
+ total_wall_time_ = 0.0;
+
+ average_time_accumulator_ = 0.0;
+ num_average_times_ = 0;
+ }
+
+ inline void add_wall(double time)
+ {
+ total_wall_time_ += time;
+ }
+
+ inline void add_average(double time, int num_measurements = 1)
+ {
+ average_time_accumulator_ += time;
+ num_average_times_ += num_measurements;
+ }
+
+ inline double get_wall() const
+ {
+ return total_wall_time_;
+ }
+
+ inline double get_average() const
+ {
+ if (num_average_times_ == 0) {
+ return 0;
+ }
+ return average_time_accumulator_ / num_average_times_;
+ }
+
+ inline void reset_average()
+ {
+ average_time_accumulator_ = 0.0;
+ num_average_times_ = 0;
+ }
+
+ protected:
+ double total_wall_time_ = 0.0;
+
+ double average_time_accumulator_ = 0.0;
+ int num_average_times_ = 0;
+ };
+
+ struct {
+ int resolution_divider = 1;
+
+ /* Number of rendered samples on top of the start sample. */
+ int num_rendered_samples = 0;
+
+ /* Point in time the latest GPUDisplay work has been scheduled. */
+ double last_display_update_time = 0.0;
+ /* Value of -1 means display was never updated. */
+ int last_display_update_sample = -1;
+
+ /* Point in time at which last rebalance has been performed. */
+ double last_rebalance_time = 0.0;
+
+ /* Number of rebalance works which have been requested to be performed.
+ * The path tracer might ignore the work if only a single device is rendering. */
+ int num_rebalance_requested = 0;
+
+ /* Number of handled rebalance works which actually changed the balance across devices. */
+ int num_rebalance_changes = 0;
+
+ bool need_rebalance_at_next_work = false;
+
+ /* Denotes whether the latest performed rebalance work caused an actual rebalance of work across
+ * devices. */
+ bool last_rebalance_changed = false;
+
+ /* Threshold for adaptive sampling which will be used for scheduled works when the progressive
+ * noise floor is in use. */
+ float adaptive_sampling_threshold = 0.0f;
+
+ bool last_work_tile_was_denoised = false;
+ bool tile_result_was_written = false;
+ bool postprocess_work_scheduled = false;
+ bool full_frame_work_scheduled = false;
+ bool full_frame_was_written = false;
+
+ bool path_trace_finished = false;
+ bool time_limit_reached = false;
+
+ /* Time at which rendering started and finished. */
+ double start_render_time = 0.0;
+ double end_render_time = 0.0;
+
+ /* Occupancy of the render devices, normalized to the number of samples.
+ *
+ * In a way it is "trailing": when scheduling new work this occupancy was measured while the
+ * previous work was being rendered. */
+ int occupancy_num_samples = 0;
+ float occupancy = 1.0f;
+ } state_;
+
+ /* Timing of tasks which were performed at the very first render work at 100% of the
+ * resolution. This timing information is used to estimate the resolution divider for fast
+ * navigation. */
+ struct {
+ double path_trace_per_sample;
+ double denoise_time;
+ double display_update_time;
+ } first_render_time_;
+
+ TimeWithAverage path_trace_time_;
+ TimeWithAverage adaptive_filter_time_;
+ TimeWithAverage denoise_time_;
+ TimeWithAverage display_update_time_;
+ TimeWithAverage rebalance_time_;
+
+ /* Whether cryptomatte-related work will be scheduled. */
+ bool need_schedule_cryptomatte_ = false;
+
+ /* Whether to schedule device load rebalance works.
+ * Rebalancing requires some special treatment for update intervals and such, so if it is known
+ * that the rebalance will be ignored (e.g. due to single-device rendering) it is better to skip
+ * the rebalancing logic entirely. */
+ bool need_schedule_rebalance_works_ = false;
+
+ /* Path tracing work will be scheduled for samples from within
+ * [start_sample_, start_sample_ + num_samples_ - 1] range, inclusively. */
+ int start_sample_ = 0;
+ int num_samples_ = 0;
+
+ /* Limit in seconds for how long path tracing is allowed to happen.
+ * Zero means no limit is applied. */
+ double time_limit_ = 0.0;
+
+ /* Headless rendering without interface. */
+ bool headless_;
+
+ /* Background (offline) rendering. */
+ bool background_;
+
+ /* Pixel size is used to force a lower resolution render for the final pass. Useful for retina
+ * or other types of hi-dpi displays. */
+ int pixel_size_ = 1;
+
+ TileManager &tile_manager_;
+
+ BufferParams buffer_params_;
+ DenoiseParams denoiser_params_;
+
+ AdaptiveSampling adaptive_sampling_;
+
+ /* Progressively lower adaptive sampling threshold level, keeping the image at a uniform noise
+ * level. */
+ bool use_progressive_noise_floor_ = false;
+
+ /* Default value for the resolution divider which will be used when there is no render time
+ * information available yet.
+ * It is also what defines the upper limit of the automatically calculated resolution divider. */
+ int default_start_resolution_divider_ = 1;
+
+ /* Initial resolution divider which will be used on render scheduler reset. */
+ int start_resolution_divider_ = 0;
+
+ /* Calculate the smallest resolution divider which will bring the actual rendering time below the
+ * desired one. This call assumes render time depends linearly on the number of pixels
+ * (quadratically on the resolution divider): a resolution divider of 2 brings render time down
+ * by a factor of 4. */
+ int calculate_resolution_divider_for_time(double desired_time, double actual_time);
+};
+
+int calculate_resolution_divider_for_resolution(int width, int height, int resolution);
+
+int calculate_resolution_for_divider(int width, int height, int resolution_divider);
+
+CCL_NAMESPACE_END
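
The quadratic assumption documented for calculate_resolution_divider_for_time() above (render time scales linearly with pixel count, hence quadratically with the divider) can be illustrated with a small standalone sketch. This is an editorial example under that stated assumption, not the RenderScheduler implementation; the function name and power-of-two stepping are hypothetical.

/* Sketch: pick the smallest power-of-two divider that brings an actual render time,
 * measured at divider 1, below the desired time, assuming time ~ 1 / (divider * divider). */
static int sketch_resolution_divider_for_time(double desired_time, double actual_time)
{
  int divider = 1;
  while (desired_time > 0.0 && actual_time / (double(divider) * double(divider)) > desired_time) {
    divider *= 2; /* Powers of two keep the sketch simple; the real heuristic may differ. */
  }
  return divider;
}
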
diff --git a/intern/cycles/integrator/shader_eval.cpp b/intern/cycles/integrator/shader_eval.cpp
new file mode 100644
index 00000000000..d35ff4cd03f
--- /dev/null
+++ b/intern/cycles/integrator/shader_eval.cpp
@@ -0,0 +1,173 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/shader_eval.h"
+
+#include "device/device.h"
+#include "device/device_queue.h"
+
+#include "device/cpu/kernel.h"
+#include "device/cpu/kernel_thread_globals.h"
+
+#include "util/util_logging.h"
+#include "util/util_progress.h"
+#include "util/util_tbb.h"
+
+CCL_NAMESPACE_BEGIN
+
+ShaderEval::ShaderEval(Device *device, Progress &progress) : device_(device), progress_(progress)
+{
+ DCHECK_NE(device_, nullptr);
+}
+
+bool ShaderEval::eval(const ShaderEvalType type,
+ const int max_num_points,
+ const function<int(device_vector<KernelShaderEvalInput> &)> &fill_input,
+ const function<void(device_vector<float4> &)> &read_output)
+{
+ bool first_device = true;
+ bool success = true;
+
+ device_->foreach_device([&](Device *device) {
+ if (!first_device) {
+ LOG(ERROR) << "Multi-devices are not yet fully implemented, will evaluate shader on a "
+ "single device.";
+ return;
+ }
+ first_device = false;
+
+ device_vector<KernelShaderEvalInput> input(device, "ShaderEval input", MEM_READ_ONLY);
+ device_vector<float4> output(device, "ShaderEval output", MEM_READ_WRITE);
+
+ /* Allocate and copy device buffers. */
+ DCHECK_EQ(input.device, device);
+ DCHECK_EQ(output.device, device);
+ DCHECK_LE(output.size(), input.size());
+
+ input.alloc(max_num_points);
+ int num_points = fill_input(input);
+ if (num_points == 0) {
+ return;
+ }
+
+ input.copy_to_device();
+ output.alloc(num_points);
+ output.zero_to_device();
+
+ /* Evaluate on CPU or GPU. */
+ success = (device->info.type == DEVICE_CPU) ? eval_cpu(device, type, input, output) :
+ eval_gpu(device, type, input, output);
+
+ /* Copy data back from device if not canceled. */
+ if (success) {
+ output.copy_from_device(0, 1, output.size());
+ read_output(output);
+ }
+
+ input.free();
+ output.free();
+ });
+
+ return success;
+}
+
+bool ShaderEval::eval_cpu(Device *device,
+ const ShaderEvalType type,
+ device_vector<KernelShaderEvalInput> &input,
+ device_vector<float4> &output)
+{
+ vector<CPUKernelThreadGlobals> kernel_thread_globals;
+ device->get_cpu_kernel_thread_globals(kernel_thread_globals);
+
+ /* Find required kernel function. */
+ const CPUKernels &kernels = *(device->get_cpu_kernels());
+
+ /* Simple parallel_for over all work items. */
+ const int64_t work_size = output.size();
+ KernelShaderEvalInput *input_data = input.data();
+ float4 *output_data = output.data();
+ bool success = true;
+
+ tbb::task_arena local_arena(device->info.cpu_threads);
+ local_arena.execute([&]() {
+ tbb::parallel_for(int64_t(0), work_size, [&](int64_t work_index) {
+ /* TODO: is this fast enough? */
+ if (progress_.get_cancel()) {
+ success = false;
+ return;
+ }
+
+ const int thread_index = tbb::this_task_arena::current_thread_index();
+ KernelGlobals *kg = &kernel_thread_globals[thread_index];
+
+ switch (type) {
+ case SHADER_EVAL_DISPLACE:
+ kernels.shader_eval_displace(kg, input_data, output_data, work_index);
+ break;
+ case SHADER_EVAL_BACKGROUND:
+ kernels.shader_eval_background(kg, input_data, output_data, work_index);
+ break;
+ }
+ });
+ });
+
+ return success;
+}
+
+bool ShaderEval::eval_gpu(Device *device,
+ const ShaderEvalType type,
+ device_vector<KernelShaderEvalInput> &input,
+ device_vector<float4> &output)
+{
+ /* Find required kernel function. */
+ DeviceKernel kernel;
+ switch (type) {
+ case SHADER_EVAL_DISPLACE:
+ kernel = DEVICE_KERNEL_SHADER_EVAL_DISPLACE;
+ break;
+ case SHADER_EVAL_BACKGROUND:
+ kernel = DEVICE_KERNEL_SHADER_EVAL_BACKGROUND;
+ break;
+ };
+
+ /* Create device queue. */
+ unique_ptr<DeviceQueue> queue = device->gpu_queue_create();
+ queue->init_execution();
+
+  /* Execute work on GPU in chunks, so we can cancel.
+   * TODO: query appropriate chunk size from the device. */
+ const int chunk_size = 65536;
+
+ const int work_size = output.size();
+ void *d_input = (void *)input.device_pointer;
+ void *d_output = (void *)output.device_pointer;
+
+ for (int d_offset = 0; d_offset < work_size; d_offset += chunk_size) {
+ int d_work_size = min(chunk_size, work_size - d_offset);
+ void *args[] = {&d_input, &d_output, &d_offset, &d_work_size};
+
+ queue->enqueue(kernel, d_work_size, args);
+ queue->synchronize();
+
+ if (progress_.get_cancel()) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/shader_eval.h b/intern/cycles/integrator/shader_eval.h
new file mode 100644
index 00000000000..7dbf334b8d7
--- /dev/null
+++ b/intern/cycles/integrator/shader_eval.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "device/device_memory.h"
+
+#include "kernel/kernel_types.h"
+
+#include "util/util_function.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class Progress;
+
+enum ShaderEvalType {
+ SHADER_EVAL_DISPLACE,
+ SHADER_EVAL_BACKGROUND,
+};
+
+/* ShaderEval class performs shader evaluation for background light and displacement. */
+class ShaderEval {
+ public:
+ ShaderEval(Device *device, Progress &progress);
+
+ /* Evaluate shader at points specified by KernelShaderEvalInput and write out
+ * RGBA colors to output. */
+ bool eval(const ShaderEvalType type,
+ const int max_num_points,
+ const function<int(device_vector<KernelShaderEvalInput> &)> &fill_input,
+ const function<void(device_vector<float4> &)> &read_output);
+
+ protected:
+ bool eval_cpu(Device *device,
+ const ShaderEvalType type,
+ device_vector<KernelShaderEvalInput> &input,
+ device_vector<float4> &output);
+ bool eval_gpu(Device *device,
+ const ShaderEvalType type,
+ device_vector<KernelShaderEvalInput> &input,
+ device_vector<float4> &output);
+
+ Device *device_;
+ Progress &progress_;
+};
+
+CCL_NAMESPACE_END
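
A caller drives ShaderEval::eval() declared above entirely through the two callbacks: fill_input populates the device_vector of KernelShaderEvalInput and returns how many points it wrote, and read_output consumes the evaluated float4 colors. A hedged usage sketch follows; the surrounding `device`, `progress` and `num_points` variables and the two per-point helpers are hypothetical, not part of this patch.

ShaderEval shader_eval(device, progress);

const bool ok = shader_eval.eval(
    SHADER_EVAL_BACKGROUND,
    num_points,
    [&](device_vector<KernelShaderEvalInput> &input) {
      /* Fill one input record per point to be shaded; return the number of points written. */
      KernelShaderEvalInput *data = input.data();
      for (int i = 0; i < num_points; i++) {
        data[i] = make_input_for_point(i); /* hypothetical helper */
      }
      return num_points;
    },
    [&](device_vector<float4> &output) {
      /* Copy the evaluated RGBA colors back into the caller's own storage. */
      const float4 *colors = output.data();
      for (int i = 0; i < num_points; i++) {
        store_result_for_point(i, colors[i]); /* hypothetical helper */
      }
    });
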
diff --git a/intern/cycles/integrator/tile.cpp b/intern/cycles/integrator/tile.cpp
new file mode 100644
index 00000000000..3387b7bedf1
--- /dev/null
+++ b/intern/cycles/integrator/tile.cpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/tile.h"
+
+#include "util/util_logging.h"
+#include "util/util_math.h"
+
+CCL_NAMESPACE_BEGIN
+
+std::ostream &operator<<(std::ostream &os, const TileSize &tile_size)
+{
+ os << "size: (" << tile_size.width << ", " << tile_size.height << ")";
+ os << ", num_samples: " << tile_size.num_samples;
+ return os;
+}
+
+ccl_device_inline uint round_down_to_power_of_two(uint x)
+{
+ if (is_power_of_two(x)) {
+ return x;
+ }
+
+ return prev_power_of_two(x);
+}
+
+ccl_device_inline uint round_up_to_power_of_two(uint x)
+{
+ if (is_power_of_two(x)) {
+ return x;
+ }
+
+ return next_power_of_two(x);
+}
+
+TileSize tile_calculate_best_size(const int2 &image_size,
+ const int num_samples,
+ const int max_num_path_states)
+{
+ if (max_num_path_states == 1) {
+ /* Simple case: avoid any calculation, which could cause rounding issues. */
+ return TileSize(1, 1, 1);
+ }
+
+ const int64_t num_pixels = image_size.x * image_size.y;
+ const int64_t num_pixel_samples = num_pixels * num_samples;
+
+ if (max_num_path_states >= num_pixel_samples) {
+ /* Image fully fits into the state (could be border render, for example). */
+ return TileSize(image_size.x, image_size.y, num_samples);
+ }
+
+  /* The idea here is to keep the number of samples per tile as high as possible, to improve
+   * coherency across threads.
+   *
+   * Some general ideas:
+   * - Prefer smaller tiles with more samples, which improves spatial coherency of paths.
+   * - Keep values a power of two, so that tiles fit an integer number of times into the maximum
+   *   number of paths. */
+
+ TileSize tile_size;
+
+  /* Calculate tile size as if each tile covers the entire range of samples.
+   * The idea here is to keep tiles as small as possible, and to keep the device occupied by
+   * scheduling multiple tiles with the same coordinates rendering different samples. */
+ const int num_path_states_per_sample = max_num_path_states / num_samples;
+ if (num_path_states_per_sample != 0) {
+ tile_size.width = round_down_to_power_of_two(lround(sqrt(num_path_states_per_sample)));
+ tile_size.height = tile_size.width;
+ }
+ else {
+ tile_size.width = tile_size.height = 1;
+ }
+
+ if (num_samples == 1) {
+ tile_size.num_samples = 1;
+ }
+ else {
+    /* The heuristic here is to have a more uniform division of the sample range: for example,
+     * prefer [32 <38 times>, 8] over [1024, 200]. This allows us to greedily add more tiles
+     * early on. */
+ tile_size.num_samples = min(round_up_to_power_of_two(lround(sqrt(num_samples / 2))),
+ static_cast<uint>(num_samples));
+
+    const int tile_area = tile_size.width * tile_size.height;
+ tile_size.num_samples = min(tile_size.num_samples, max_num_path_states / tile_area);
+ }
+
+ DCHECK_GE(tile_size.width, 1);
+ DCHECK_GE(tile_size.height, 1);
+ DCHECK_GE(tile_size.num_samples, 1);
+ DCHECK_LE(tile_size.width * tile_size.height * tile_size.num_samples, max_num_path_states);
+
+ return tile_size;
+}
+
+CCL_NAMESPACE_END
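
To make the tile-size heuristic above concrete, here is a worked example with assumed inputs (a 2^20 path-state budget, 1024 samples and an image larger than the budget). The numbers are illustrative and mirror the arithmetic of tile_calculate_best_size(); they are not taken from the patch.

#include <algorithm>
#include <cassert>

int main()
{
  const int max_num_path_states = 1 << 20; /* 1048576 */

  /* Tile is sized as if it covers the whole sample range:
   * 1048576 / 1024 samples = 1024 path states per sample -> a 32x32 tile. */
  const int width = 32, height = 32;

  /* More uniform split of the sample range: round_up_to_power_of_two(lround(sqrt(1024 / 2))) = 32,
   * clamped so the tile still fits into the available path states. */
  const int tile_samples = std::min(32, max_num_path_states / (width * height));

  /* One scheduled tile occupies 32 * 32 * 32 = 32768 path states, so up to 32 such tiles
   * with the same coordinates can be in flight at the same time. */
  assert(width * height * tile_samples == 32768);
  return 0;
}
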
diff --git a/intern/cycles/integrator/tile.h b/intern/cycles/integrator/tile.h
new file mode 100644
index 00000000000..d0824843ddb
--- /dev/null
+++ b/intern/cycles/integrator/tile.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <ostream>
+
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct TileSize {
+ TileSize() = default;
+
+ inline TileSize(int width, int height, int num_samples)
+ : width(width), height(height), num_samples(num_samples)
+ {
+ }
+
+ inline bool operator==(const TileSize &other) const
+ {
+ return width == other.width && height == other.height && num_samples == other.num_samples;
+ }
+ inline bool operator!=(const TileSize &other) const
+ {
+ return !(*this == other);
+ }
+
+ int width = 0, height = 0;
+ int num_samples = 0;
+};
+
+std::ostream &operator<<(std::ostream &os, const TileSize &tile_size);
+
+/* Calculate the tile size which is best suited for rendering an image of the given size with the
+ * given number of active path states.
+ * Attempts to provide a best guess which keeps the path tracing threads of a device as localized
+ * as possible, and has as many threads active for every tile as possible. */
+TileSize tile_calculate_best_size(const int2 &image_size,
+ const int num_samples,
+ const int max_num_path_states);
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/work_balancer.cpp b/intern/cycles/integrator/work_balancer.cpp
new file mode 100644
index 00000000000..9f96fe3632b
--- /dev/null
+++ b/intern/cycles/integrator/work_balancer.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/work_balancer.h"
+
+#include "util/util_math.h"
+
+#include "util/util_logging.h"
+
+CCL_NAMESPACE_BEGIN
+
+void work_balance_do_initial(vector<WorkBalanceInfo> &work_balance_infos)
+{
+ const int num_infos = work_balance_infos.size();
+
+ if (num_infos == 1) {
+ work_balance_infos[0].weight = 1.0;
+ return;
+ }
+
+  /* There are no statistics available yet, so start with an equal distribution. */
+ const double weight = 1.0 / num_infos;
+ for (WorkBalanceInfo &balance_info : work_balance_infos) {
+ balance_info.weight = weight;
+ }
+}
+
+static double calculate_total_time(const vector<WorkBalanceInfo> &work_balance_infos)
+{
+ double total_time = 0;
+ for (const WorkBalanceInfo &info : work_balance_infos) {
+ total_time += info.time_spent;
+ }
+ return total_time;
+}
+
+/* The balance is based on equalizing the time which devices spent performing a task. Assume that
+ * the average of the observed times is usable for estimating whether more or less work is to be
+ * scheduled, and how big a difference in the work scheduling is needed. */
+
+bool work_balance_do_rebalance(vector<WorkBalanceInfo> &work_balance_infos)
+{
+ const int num_infos = work_balance_infos.size();
+
+ const double total_time = calculate_total_time(work_balance_infos);
+ const double time_average = total_time / num_infos;
+
+ double total_weight = 0;
+ vector<double> new_weights;
+ new_weights.reserve(num_infos);
+
+  /* Equalize the overall average time. This means that we don't make every device perform an
+   * amount of work based on the current average, but that after the weight changes the times
+   * will equalize.
+   * Think of it this way: if one of the devices is 10% faster than another, then one device
+   * needs to do 5% less of the current work, and the other needs to do 5% more. */
+ const double lerp_weight = 1.0 / num_infos;
+
+ bool has_big_difference = false;
+
+ for (const WorkBalanceInfo &info : work_balance_infos) {
+ const double time_target = lerp(info.time_spent, time_average, lerp_weight);
+ const double new_weight = info.weight * time_target / info.time_spent;
+ new_weights.push_back(new_weight);
+ total_weight += new_weight;
+
+ if (std::fabs(1.0 - time_target / time_average) > 0.02) {
+ has_big_difference = true;
+ }
+ }
+
+ if (!has_big_difference) {
+ return false;
+ }
+
+ const double total_weight_inv = 1.0 / total_weight;
+ for (int i = 0; i < num_infos; ++i) {
+ WorkBalanceInfo &info = work_balance_infos[i];
+ info.weight = new_weights[i] * total_weight_inv;
+ info.time_spent = 0;
+ }
+
+ return true;
+}
+
+CCL_NAMESPACE_END
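
To see what a single rebalance step does, here is a minimal two-device sketch of the math in work_balance_do_rebalance(); the 1.0s / 2.0s timings and equal starting weights are assumptions for illustration only.

#include <cstdio>

int main()
{
  double weight[2] = {0.5, 0.5};           /* current weights */
  const double time_spent[2] = {1.0, 2.0}; /* the second device took twice as long */

  const double time_average = (time_spent[0] + time_spent[1]) / 2.0; /* 1.5 */
  const double lerp_weight = 1.0 / 2;

  double new_weight[2], total = 0.0;
  for (int i = 0; i < 2; i++) {
    /* Move each device halfway towards the average time, then rescale its weight. */
    const double time_target = time_spent[i] + (time_average - time_spent[i]) * lerp_weight;
    new_weight[i] = weight[i] * time_target / time_spent[i];
    total += new_weight[i];
  }
  for (int i = 0; i < 2; i++) {
    weight[i] = new_weight[i] / total;
  }

  /* Prints roughly 0.588 and 0.412: the faster device gets a bigger share of the big tile. */
  printf("%.3f %.3f\n", weight[0], weight[1]);
  return 0;
}
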
diff --git a/intern/cycles/integrator/work_balancer.h b/intern/cycles/integrator/work_balancer.h
new file mode 100644
index 00000000000..fc5e561845e
--- /dev/null
+++ b/intern/cycles/integrator/work_balancer.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct WorkBalanceInfo {
+ /* Time spent performing corresponding work. */
+ double time_spent = 0;
+
+ /* Average occupancy of the device while performing the work. */
+ float occupancy = 1.0f;
+
+ /* Normalized weight, which is ready to be used for work balancing (like calculating fraction of
+ * the big tile which is to be rendered on the device). */
+ double weight = 1.0;
+};
+
+/* Balance work for an initial render integration, before any statistics are known. */
+void work_balance_do_initial(vector<WorkBalanceInfo> &work_balance_infos);
+
+/* Rebalance work after statistics have been accumulated.
+ * Returns true if the balancing did change. */
+bool work_balance_do_rebalance(vector<WorkBalanceInfo> &work_balance_infos);
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/work_tile_scheduler.cpp b/intern/cycles/integrator/work_tile_scheduler.cpp
new file mode 100644
index 00000000000..e6ada2f46ee
--- /dev/null
+++ b/intern/cycles/integrator/work_tile_scheduler.cpp
@@ -0,0 +1,138 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/work_tile_scheduler.h"
+
+#include "device/device_queue.h"
+#include "integrator/tile.h"
+#include "render/buffers.h"
+#include "util/util_atomic.h"
+#include "util/util_logging.h"
+
+CCL_NAMESPACE_BEGIN
+
+WorkTileScheduler::WorkTileScheduler()
+{
+}
+
+void WorkTileScheduler::set_max_num_path_states(int max_num_path_states)
+{
+ max_num_path_states_ = max_num_path_states;
+}
+
+void WorkTileScheduler::reset(const BufferParams &buffer_params, int sample_start, int samples_num)
+{
+ /* Image buffer parameters. */
+ image_full_offset_px_.x = buffer_params.full_x;
+ image_full_offset_px_.y = buffer_params.full_y;
+
+ image_size_px_ = make_int2(buffer_params.width, buffer_params.height);
+
+ offset_ = buffer_params.offset;
+ stride_ = buffer_params.stride;
+
+ /* Samples parameters. */
+ sample_start_ = sample_start;
+ samples_num_ = samples_num;
+
+ /* Initialize new scheduling. */
+ reset_scheduler_state();
+}
+
+void WorkTileScheduler::reset_scheduler_state()
+{
+ tile_size_ = tile_calculate_best_size(image_size_px_, samples_num_, max_num_path_states_);
+
+ VLOG(3) << "Will schedule tiles of size " << tile_size_;
+
+ if (VLOG_IS_ON(3)) {
+    /* The logging assumes multiple tiles are scheduled, ignoring the overhead of multi-tile
+     * scheduling and purely focusing on the number of used path states. */
+ const int num_path_states_in_tile = tile_size_.width * tile_size_.height *
+ tile_size_.num_samples;
+ const int num_tiles = max_num_path_states_ / num_path_states_in_tile;
+ VLOG(3) << "Number of unused path states: "
+ << max_num_path_states_ - num_tiles * num_path_states_in_tile;
+ }
+
+ num_tiles_x_ = divide_up(image_size_px_.x, tile_size_.width);
+ num_tiles_y_ = divide_up(image_size_px_.y, tile_size_.height);
+
+ total_tiles_num_ = num_tiles_x_ * num_tiles_y_;
+ num_tiles_per_sample_range_ = divide_up(samples_num_, tile_size_.num_samples);
+
+ next_work_index_ = 0;
+ total_work_size_ = total_tiles_num_ * num_tiles_per_sample_range_;
+}
+
+bool WorkTileScheduler::get_work(KernelWorkTile *work_tile_, const int max_work_size)
+{
+ /* Note that the `max_work_size` can be higher than the `max_num_path_states_`: this is because
+ * the path trace work can decide to use smaller tile sizes and greedily schedule multiple tiles,
+ * improving overall device occupancy.
+ * So the `max_num_path_states_` is a "scheduling unit", and the `max_work_size` is a "scheduling
+ * limit". */
+
+ DCHECK_NE(max_num_path_states_, 0);
+
+ const int work_index = atomic_fetch_and_add_int32(&next_work_index_, 1);
+ if (work_index >= total_work_size_) {
+ return false;
+ }
+
+ const int sample_range_index = work_index % num_tiles_per_sample_range_;
+ const int start_sample = sample_range_index * tile_size_.num_samples;
+ const int tile_index = work_index / num_tiles_per_sample_range_;
+ const int tile_y = tile_index / num_tiles_x_;
+ const int tile_x = tile_index - tile_y * num_tiles_x_;
+
+ KernelWorkTile work_tile;
+ work_tile.x = tile_x * tile_size_.width;
+ work_tile.y = tile_y * tile_size_.height;
+ work_tile.w = tile_size_.width;
+ work_tile.h = tile_size_.height;
+ work_tile.start_sample = sample_start_ + start_sample;
+ work_tile.num_samples = min(tile_size_.num_samples, samples_num_ - start_sample);
+ work_tile.offset = offset_;
+ work_tile.stride = stride_;
+
+ work_tile.w = min(work_tile.w, image_size_px_.x - work_tile.x);
+ work_tile.h = min(work_tile.h, image_size_px_.y - work_tile.y);
+
+ work_tile.x += image_full_offset_px_.x;
+ work_tile.y += image_full_offset_px_.y;
+
+ const int tile_work_size = work_tile.w * work_tile.h * work_tile.num_samples;
+
+ DCHECK_GT(tile_work_size, 0);
+
+ if (max_work_size && tile_work_size > max_work_size) {
+    /* The work did not fit into the requested limit of the work size. Unschedule the tile,
+     * allowing others (or ourselves later on) to pick it up.
+     *
+     * TODO: Such a temporary decrement is not ideal, since it might lead to a situation where
+     * another device sees there is nothing to be done, finishes its work, and leaves all the
+     * remaining work to be done by us. */
+ atomic_fetch_and_add_int32(&next_work_index_, -1);
+ return false;
+ }
+
+ *work_tile_ = work_tile;
+
+ return true;
+}
+
+CCL_NAMESPACE_END
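
The index decomposition in get_work() above maps a flat work counter onto (tile, sample range) pairs: consecutive indices first step through the stacked sample ranges of one tile position, then move on to the next tile. A standalone sketch with assumed counts (3x2 tiles, 4 sample ranges per tile) follows; the numbers are made up for illustration.

#include <cstdio>

int main()
{
  const int num_tiles_x = 3, num_tiles_y = 2;
  const int num_tiles_per_sample_range = 4; /* stacked sample ranges per tile position */
  const int total_work = num_tiles_x * num_tiles_y * num_tiles_per_sample_range;

  for (int work_index = 0; work_index < total_work; work_index++) {
    const int sample_range_index = work_index % num_tiles_per_sample_range;
    const int tile_index = work_index / num_tiles_per_sample_range;
    const int tile_y = tile_index / num_tiles_x;
    const int tile_x = tile_index - tile_y * num_tiles_x;
    printf("work %2d -> tile (%d, %d), sample range %d\n",
           work_index, tile_x, tile_y, sample_range_index);
  }
  return 0;
}
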
diff --git a/intern/cycles/integrator/work_tile_scheduler.h b/intern/cycles/integrator/work_tile_scheduler.h
new file mode 100644
index 00000000000..85f11b601c7
--- /dev/null
+++ b/intern/cycles/integrator/work_tile_scheduler.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/tile.h"
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BufferParams;
+
+struct KernelWorkTile;
+
+/* Scheduler of device work tiles.
+ * Takes care of feeding work which needs to be done to multiple devices running in parallel. */
+class WorkTileScheduler {
+ public:
+ WorkTileScheduler();
+
+  /* Maximum number of path states which are allowed to be used by a single scheduled work tile.
+ *
+ * Affects the scheduled work size: the work size will be as big as possible, but will not exceed
+ * this number of states. */
+ void set_max_num_path_states(int max_num_path_states);
+
+  /* Scheduling will happen for pixels within a big tile denoted by the given parameters. */
+ void reset(const BufferParams &buffer_params, int sample_start, int samples_num);
+
+ /* Get work for a device.
+   * Returns true if there is still work to be done and initializes the work tile with all
+ * parameters of this work. If there is nothing remaining to be done, returns false and the
+ * work tile is kept unchanged.
+ *
+ * Optionally pass max_work_size to do nothing if there is no tile small enough. */
+ bool get_work(KernelWorkTile *work_tile, const int max_work_size = 0);
+
+ protected:
+ void reset_scheduler_state();
+
+ /* Maximum allowed path states to be used.
+ *
+ * TODO(sergey): Naming can be improved. The fact that this is a limiting factor based on the
+ * number of path states is kind of a detail. Is there a more generic term from the scheduler
+ * point of view? */
+ int max_num_path_states_ = 0;
+
+ /* Offset in pixels within a global buffer. */
+ int2 image_full_offset_px_ = make_int2(0, 0);
+
+  /* Dimensions of the currently rendered image in pixels. */
+ int2 image_size_px_ = make_int2(0, 0);
+
+ /* Offset and stride of the buffer within which scheduling is happening.
+ * Will be passed over to the KernelWorkTile. */
+ int offset_, stride_;
+
+  /* Start sample index and number of samples which are to be rendered.
+   * The scheduler will cover the sample range of [start, start + num] over the entire image
+   * (splitting it into smaller work tiles). */
+ int sample_start_ = 0;
+ int samples_num_ = 0;
+
+  /* Tile size which will be scheduled for rendering. */
+ TileSize tile_size_;
+
+ /* Number of tiles in X and Y axis of the image. */
+ int num_tiles_x_, num_tiles_y_;
+
+ /* Total number of tiles on the image.
+ * Pre-calculated as `num_tiles_x_ * num_tiles_y_` and re-used in the `get_work()`.
+ *
+   * TODO(sergey): Is this an over-optimization? Maybe the cost of calculating this value in
+   * `get_work()` is unmeasurable? */
+ int total_tiles_num_ = 0;
+
+  /* When the number of samples in the `tile_size_` is lower than `samples_num_`, denotes how
+   * many tiles are to be "stacked" to cover the entire requested range of samples. */
+ int num_tiles_per_sample_range_ = 0;
+
+ int next_work_index_ = 0;
+ int total_work_size_ = 0;
+};
+
+CCL_NAMESPACE_END
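
Putting the WorkTileScheduler interface above together, a device's render loop would roughly look like the following. This is a hedged usage sketch: `buffer_params`, the sample counts and the render_samples() helper are assumptions, not part of this patch.

WorkTileScheduler work_tile_scheduler;
work_tile_scheduler.set_max_num_path_states(1 << 20);
work_tile_scheduler.reset(buffer_params, 0 /* sample_start */, 1024 /* samples_num */);

KernelWorkTile work_tile;
while (work_tile_scheduler.get_work(&work_tile)) {
  /* Each returned tile covers work_tile.w x work_tile.h pixels and work_tile.num_samples
   * samples starting at work_tile.start_sample. */
  render_samples(work_tile); /* hypothetical helper */
}
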
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 0ce33c51778..4196539a9b1 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -22,68 +22,22 @@ set(INC_SYS
)
-set(SRC_CPU_KERNELS
- kernels/cpu/kernel.cpp
- kernels/cpu/kernel_sse2.cpp
- kernels/cpu/kernel_sse3.cpp
- kernels/cpu/kernel_sse41.cpp
- kernels/cpu/kernel_avx.cpp
- kernels/cpu/kernel_avx2.cpp
- kernels/cpu/kernel_split.cpp
- kernels/cpu/kernel_split_sse2.cpp
- kernels/cpu/kernel_split_sse3.cpp
- kernels/cpu/kernel_split_sse41.cpp
- kernels/cpu/kernel_split_avx.cpp
- kernels/cpu/kernel_split_avx2.cpp
- kernels/cpu/filter.cpp
- kernels/cpu/filter_sse2.cpp
- kernels/cpu/filter_sse3.cpp
- kernels/cpu/filter_sse41.cpp
- kernels/cpu/filter_avx.cpp
- kernels/cpu/filter_avx2.cpp
+set(SRC_DEVICE_CPU
+ device/cpu/kernel.cpp
+ device/cpu/kernel_sse2.cpp
+ device/cpu/kernel_sse3.cpp
+ device/cpu/kernel_sse41.cpp
+ device/cpu/kernel_avx.cpp
+ device/cpu/kernel_avx2.cpp
)
-set(SRC_CUDA_KERNELS
- kernels/cuda/kernel.cu
- kernels/cuda/kernel_split.cu
- kernels/cuda/filter.cu
+set(SRC_DEVICE_CUDA
+ device/cuda/kernel.cu
)
-set(SRC_OPENCL_KERNELS
- kernels/opencl/kernel_adaptive_stopping.cl
- kernels/opencl/kernel_adaptive_filter_x.cl
- kernels/opencl/kernel_adaptive_filter_y.cl
- kernels/opencl/kernel_adaptive_adjust_samples.cl
- kernels/opencl/kernel_bake.cl
- kernels/opencl/kernel_base.cl
- kernels/opencl/kernel_displace.cl
- kernels/opencl/kernel_background.cl
- kernels/opencl/kernel_state_buffer_size.cl
- kernels/opencl/kernel_split_bundle.cl
- kernels/opencl/kernel_data_init.cl
- kernels/opencl/kernel_path_init.cl
- kernels/opencl/kernel_queue_enqueue.cl
- kernels/opencl/kernel_scene_intersect.cl
- kernels/opencl/kernel_lamp_emission.cl
- kernels/opencl/kernel_do_volume.cl
- kernels/opencl/kernel_indirect_background.cl
- kernels/opencl/kernel_shader_setup.cl
- kernels/opencl/kernel_shader_sort.cl
- kernels/opencl/kernel_shader_eval.cl
- kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
- kernels/opencl/kernel_subsurface_scatter.cl
- kernels/opencl/kernel_direct_lighting.cl
- kernels/opencl/kernel_shadow_blocked_ao.cl
- kernels/opencl/kernel_shadow_blocked_dl.cl
- kernels/opencl/kernel_enqueue_inactive.cl
- kernels/opencl/kernel_next_iteration_setup.cl
- kernels/opencl/kernel_indirect_subsurface.cl
- kernels/opencl/kernel_buffer_update.cl
- kernels/opencl/filter.cl
-)
-
-set(SRC_OPTIX_KERNELS
- kernels/optix/kernel_optix.cu
+set(SRC_DEVICE_OPTIX
+ device/optix/kernel.cu
+ device/optix/kernel_shader_raytrace.cu
)
set(SRC_BVH_HEADERS
@@ -105,63 +59,56 @@ set(SRC_HEADERS
kernel_bake.h
kernel_camera.h
kernel_color.h
- kernel_compat_cpu.h
- kernel_compat_cuda.h
- kernel_compat_optix.h
- kernel_compat_opencl.h
kernel_differential.h
kernel_emission.h
kernel_film.h
- kernel_globals.h
kernel_id_passes.h
kernel_jitter.h
kernel_light.h
kernel_light_background.h
kernel_light_common.h
+ kernel_lookup_table.h
kernel_math.h
kernel_montecarlo.h
kernel_passes.h
- kernel_path.h
- kernel_path_branched.h
- kernel_path_common.h
kernel_path_state.h
- kernel_path_surface.h
- kernel_path_subsurface.h
- kernel_path_volume.h
kernel_profiling.h
kernel_projection.h
- kernel_queues.h
kernel_random.h
kernel_shader.h
- kernel_shadow.h
- kernel_subsurface.h
+ kernel_shadow_catcher.h
kernel_textures.h
kernel_types.h
- kernel_volume.h
kernel_work_stealing.h
kernel_write_passes.h
)
-set(SRC_KERNELS_CPU_HEADERS
- kernel.h
- kernels/cpu/kernel_cpu.h
- kernels/cpu/kernel_cpu_impl.h
- kernels/cpu/kernel_cpu_image.h
- kernels/cpu/filter_cpu.h
- kernels/cpu/filter_cpu_impl.h
+set(SRC_DEVICE_CPU_HEADERS
+ device/cpu/compat.h
+ device/cpu/image.h
+ device/cpu/globals.h
+ device/cpu/kernel.h
+ device/cpu/kernel_arch.h
+ device/cpu/kernel_arch_impl.h
)
-
-set(SRC_KERNELS_CUDA_HEADERS
- kernels/cuda/kernel_config.h
- kernels/cuda/kernel_cuda_image.h
+set(SRC_DEVICE_GPU_HEADERS
+ device/gpu/image.h
+ device/gpu/kernel.h
+ device/gpu/parallel_active_index.h
+ device/gpu/parallel_prefix_sum.h
+ device/gpu/parallel_reduce.h
+ device/gpu/parallel_sorted_index.h
)
-set(SRC_KERNELS_OPTIX_HEADERS
+set(SRC_DEVICE_CUDA_HEADERS
+ device/cuda/compat.h
+ device/cuda/config.h
+ device/cuda/globals.h
)
-set(SRC_KERNELS_OPENCL_HEADERS
- kernels/opencl/kernel_split_function.h
- kernels/opencl/kernel_opencl_image.h
+set(SRC_DEVICE_OPTIX_HEADERS
+ device/optix/compat.h
+ device/optix/globals.h
)
set(SRC_CLOSURE_HEADERS
@@ -259,25 +206,32 @@ set(SRC_GEOM_HEADERS
geom/geom_object.h
geom/geom_patch.h
geom/geom_primitive.h
+ geom/geom_shader_data.h
geom/geom_subd_triangle.h
geom/geom_triangle.h
geom/geom_triangle_intersect.h
geom/geom_volume.h
)
-set(SRC_FILTER_HEADERS
- filter/filter.h
- filter/filter_defines.h
- filter/filter_features.h
- filter/filter_features_sse.h
- filter/filter_kernel.h
- filter/filter_nlm_cpu.h
- filter/filter_nlm_gpu.h
- filter/filter_prefilter.h
- filter/filter_reconstruction.h
- filter/filter_transform.h
- filter/filter_transform_gpu.h
- filter/filter_transform_sse.h
+set(SRC_INTEGRATOR_HEADERS
+ integrator/integrator_init_from_bake.h
+ integrator/integrator_init_from_camera.h
+ integrator/integrator_intersect_closest.h
+ integrator/integrator_intersect_shadow.h
+ integrator/integrator_intersect_subsurface.h
+ integrator/integrator_intersect_volume_stack.h
+ integrator/integrator_megakernel.h
+ integrator/integrator_shade_background.h
+ integrator/integrator_shade_light.h
+ integrator/integrator_shade_shadow.h
+ integrator/integrator_shade_surface.h
+ integrator/integrator_shade_volume.h
+ integrator/integrator_state.h
+ integrator/integrator_state_flow.h
+ integrator/integrator_state_template.h
+ integrator/integrator_state_util.h
+ integrator/integrator_subsurface.h
+ integrator/integrator_volume_stack.h
)
set(SRC_UTIL_HEADERS
@@ -333,36 +287,6 @@ set(SRC_UTIL_HEADERS
../util/util_types_vector3_impl.h
)
-set(SRC_SPLIT_HEADERS
- split/kernel_adaptive_adjust_samples.h
- split/kernel_adaptive_filter_x.h
- split/kernel_adaptive_filter_y.h
- split/kernel_adaptive_stopping.h
- split/kernel_branched.h
- split/kernel_buffer_update.h
- split/kernel_data_init.h
- split/kernel_direct_lighting.h
- split/kernel_do_volume.h
- split/kernel_enqueue_inactive.h
- split/kernel_holdout_emission_blurring_pathtermination_ao.h
- split/kernel_indirect_background.h
- split/kernel_indirect_subsurface.h
- split/kernel_lamp_emission.h
- split/kernel_next_iteration_setup.h
- split/kernel_path_init.h
- split/kernel_queue_enqueue.h
- split/kernel_scene_intersect.h
- split/kernel_shader_setup.h
- split/kernel_shader_sort.h
- split/kernel_shader_eval.h
- split/kernel_shadow_blocked_ao.h
- split/kernel_shadow_blocked_dl.h
- split/kernel_split_common.h
- split/kernel_split_data.h
- split/kernel_split_data_types.h
- split/kernel_subsurface_scatter.h
-)
-
set(LIB
)
@@ -393,21 +317,17 @@ if(WITH_CYCLES_CUDA_BINARIES)
endif()
# build for each arch
- set(cuda_sources kernels/cuda/kernel.cu kernels/cuda/kernel_split.cu
+ set(cuda_sources device/cuda/kernel.cu
${SRC_HEADERS}
- ${SRC_KERNELS_CUDA_HEADERS}
+ ${SRC_DEVICE_GPU_HEADERS}
+ ${SRC_DEVICE_CUDA_HEADERS}
${SRC_BVH_HEADERS}
${SRC_SVM_HEADERS}
${SRC_GEOM_HEADERS}
+ ${SRC_INTEGRATOR_HEADERS}
${SRC_CLOSURE_HEADERS}
${SRC_UTIL_HEADERS}
)
- set(cuda_filter_sources kernels/cuda/filter.cu
- ${SRC_HEADERS}
- ${SRC_KERNELS_CUDA_HEADERS}
- ${SRC_FILTER_HEADERS}
- ${SRC_UTIL_HEADERS}
- )
set(cuda_cubins)
macro(CYCLES_CUDA_KERNEL_ADD arch prev_arch name flags sources experimental)
@@ -427,7 +347,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
endif()
endif()
- set(cuda_kernel_src "/kernels/cuda/${name}.cu")
+ set(cuda_kernel_src "/device/cuda/${name}.cu")
set(cuda_flags ${flags}
-D CCL_NAMESPACE_BEGIN=
@@ -435,7 +355,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
-D NVCC
-m ${CUDA_BITS}
-I ${CMAKE_CURRENT_SOURCE_DIR}/..
- -I ${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda
+ -I ${CMAKE_CURRENT_SOURCE_DIR}/device/cuda
--use_fast_math
-o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_file})
@@ -523,14 +443,8 @@ if(WITH_CYCLES_CUDA_BINARIES)
endif()
if(DEFINED cuda_nvcc_executable AND DEFINED cuda_toolkit_root_dir)
# Compile regular kernel
- CYCLES_CUDA_KERNEL_ADD(${arch} ${prev_arch} filter "" "${cuda_filter_sources}" FALSE)
CYCLES_CUDA_KERNEL_ADD(${arch} ${prev_arch} kernel "" "${cuda_sources}" FALSE)
- if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES)
- # Compile split kernel
- CYCLES_CUDA_KERNEL_ADD(${arch} ${prev_arch} kernel_split "-D __SPLIT__" "${cuda_sources}" FALSE)
- endif()
-
if(WITH_CYCLES_CUDA_BUILD_SERIAL)
set(prev_arch ${arch})
endif()
@@ -547,15 +461,15 @@ endif()
# OptiX PTX modules
if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
- macro(CYCLES_OPTIX_KERNEL_ADD name flags)
- set(input "kernels/optix/kernel_optix.cu")
+ macro(CYCLES_OPTIX_KERNEL_ADD name input flags)
set(output "${CMAKE_CURRENT_BINARY_DIR}/${name}.ptx")
set(cuda_flags ${flags}
-I "${OPTIX_INCLUDE_DIR}"
-I "${CMAKE_CURRENT_SOURCE_DIR}/.."
- -I "${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda"
+ -I "${CMAKE_CURRENT_SOURCE_DIR}/device/cuda"
--use_fast_math
+ -Wno-deprecated-gpu-targets
-o ${output})
if(WITH_NANOVDB)
@@ -580,11 +494,13 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
DEPENDS
${input}
${SRC_HEADERS}
- ${SRC_KERNELS_CUDA_HEADERS}
- ${SRC_KERNELS_OPTIX_HEADERS}
+ ${SRC_DEVICE_GPU_HEADERS}
+ ${SRC_DEVICE_CUDA_HEADERS}
+ ${SRC_DEVICE_OPTIX_HEADERS}
${SRC_BVH_HEADERS}
${SRC_SVM_HEADERS}
${SRC_GEOM_HEADERS}
+ ${SRC_INTEGRATOR_HEADERS}
${SRC_CLOSURE_HEADERS}
${SRC_UTIL_HEADERS}
COMMAND ${CUBIN_CC_ENV}
@@ -603,11 +519,13 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
DEPENDS
${input}
${SRC_HEADERS}
- ${SRC_KERNELS_CUDA_HEADERS}
- ${SRC_KERNELS_OPTIX_HEADERS}
+ ${SRC_DEVICE_GPU_HEADERS}
+ ${SRC_DEVICE_CUDA_HEADERS}
+ ${SRC_DEVICE_OPTIX_HEADERS}
${SRC_BVH_HEADERS}
${SRC_SVM_HEADERS}
${SRC_GEOM_HEADERS}
+ ${SRC_INTEGRATOR_HEADERS}
${SRC_CLOSURE_HEADERS}
${SRC_UTIL_HEADERS}
COMMAND
@@ -624,8 +542,14 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${output}" ${CYCLES_INSTALL_PATH}/lib)
endmacro()
- CYCLES_OPTIX_KERNEL_ADD(kernel_optix "-D __NO_SHADER_RAYTRACE__")
- CYCLES_OPTIX_KERNEL_ADD(kernel_optix_shader_raytrace "--keep-device-functions")
+ CYCLES_OPTIX_KERNEL_ADD(
+ kernel_optix
+ "device/optix/kernel.cu"
+ "")
+ CYCLES_OPTIX_KERNEL_ADD(
+ kernel_optix_shader_raytrace
+ "device/optix/kernel_shader_raytrace.cu"
+ "--keep-device-functions")
add_custom_target(cycles_kernel_optix ALL DEPENDS ${optix_ptx})
cycles_set_solution_folder(cycles_kernel_optix)
@@ -659,62 +583,47 @@ if(WITH_COMPILER_ASAN)
endif()
endif()
-set_source_files_properties(kernels/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
-set_source_files_properties(kernels/cpu/kernel_split.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
-set_source_files_properties(kernels/cpu/filter.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
+set_source_files_properties(device/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
if(CXX_HAS_SSE)
- set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/kernel_split_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/kernel_split_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/kernel_split_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/filter_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/filter_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/filter_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
+ set_source_files_properties(device/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
+ set_source_files_properties(device/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
+ set_source_files_properties(device/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX)
- set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/kernel_split_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/filter_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
+ set_source_files_properties(device/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX2)
- set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/kernel_split_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/filter_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
+ set_source_files_properties(device/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
endif()
cycles_add_library(cycles_kernel "${LIB}"
- ${SRC_CPU_KERNELS}
- ${SRC_CUDA_KERNELS}
- ${SRC_OPTIX_KERNELS}
- ${SRC_OPENCL_KERNELS}
+ ${SRC_DEVICE_CPU}
+ ${SRC_DEVICE_CUDA}
+ ${SRC_DEVICE_OPTIX}
${SRC_HEADERS}
- ${SRC_KERNELS_CPU_HEADERS}
- ${SRC_KERNELS_CUDA_HEADERS}
- ${SRC_KERNELS_OPTIX_HEADERS}
- ${SRC_KERNELS_OPENCL_HEADERS}
+ ${SRC_DEVICE_CPU_HEADERS}
+ ${SRC_DEVICE_GPU_HEADERS}
+ ${SRC_DEVICE_CUDA_HEADERS}
+ ${SRC_DEVICE_OPTIX_HEADERS}
${SRC_BVH_HEADERS}
${SRC_CLOSURE_HEADERS}
- ${SRC_FILTER_HEADERS}
${SRC_SVM_HEADERS}
${SRC_GEOM_HEADERS}
- ${SRC_SPLIT_HEADERS}
+ ${SRC_INTEGRATOR_HEADERS}
)
source_group("bvh" FILES ${SRC_BVH_HEADERS})
source_group("closure" FILES ${SRC_CLOSURE_HEADERS})
-source_group("filter" FILES ${SRC_FILTER_HEADERS})
source_group("geom" FILES ${SRC_GEOM_HEADERS})
+source_group("integrator" FILES ${SRC_INTEGRATOR_HEADERS})
source_group("kernel" FILES ${SRC_HEADERS})
-source_group("kernel\\split" FILES ${SRC_SPLIT_HEADERS})
-source_group("kernels\\cpu" FILES ${SRC_CPU_KERNELS} ${SRC_KERNELS_CPU_HEADERS})
-source_group("kernels\\cuda" FILES ${SRC_CUDA_KERNELS} ${SRC_KERNELS_CUDA_HEADERS})
-source_group("kernels\\opencl" FILES ${SRC_OPENCL_KERNELS} ${SRC_KERNELS_OPENCL_HEADERS})
-source_group("kernels\\optix" FILES ${SRC_OPTIX_KERNELS} ${SRC_KERNELS_OPTIX_HEADERS})
+source_group("device\\cpu" FILES ${SRC_DEVICE_CPU} ${SRC_DEVICE_CPU_HEADERS})
+source_group("device\\gpu" FILES ${SRC_DEVICE_GPU_HEADERS})
+source_group("device\\cuda" FILES ${SRC_DEVICE_CUDA} ${SRC_DEVICE_CUDA_HEADERS})
+source_group("device\\optix" FILES ${SRC_DEVICE_OPTIX} ${SRC_DEVICE_OPTIX_HEADERS})
source_group("svm" FILES ${SRC_SVM_HEADERS})
if(WITH_CYCLES_CUDA)
@@ -724,31 +633,20 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
add_dependencies(cycles_kernel cycles_kernel_optix)
endif()
-# OpenCL kernel
-
-# set(KERNEL_PREPROCESSED ${CMAKE_CURRENT_BINARY_DIR}/kernel_preprocessed.cl)
-# add_custom_command(
-# OUTPUT ${KERNEL_PREPROCESSED}
-# COMMAND gcc -x c++ -E ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cl -I ${CMAKE_CURRENT_SOURCE_DIR}/../util/ -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -o ${KERNEL_PREPROCESSED}
-# DEPENDS ${SRC_KERNEL} ${SRC_UTIL_HEADERS})
-# add_custom_target(cycles_kernel_preprocess ALL DEPENDS ${KERNEL_PREPROCESSED})
-# delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${KERNEL_PREPROCESSED}" ${CYCLES_INSTALL_PATH}/kernel)
+# Install kernel source for runtime compilation
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_OPENCL_KERNELS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CUDA_KERNELS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_OPTIX_KERNELS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/optix)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_CUDA}" ${CYCLES_INSTALL_PATH}/source/kernel/device/cuda)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_OPTIX}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_OPENCL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_OPTIX_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/optix)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_GPU_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/gpu)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/cuda)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_OPTIX_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_FILTER_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/filter)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/svm)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_INTEGRATOR_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/integrator)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/util)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SPLIT_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/split)
-
if(WITH_NANOVDB)
set(SRC_NANOVDB_HEADERS
diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h
index acf29cf1baf..539e9fd05fb 100644
--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -25,6 +25,8 @@
* the code has been extended and modified to support more primitives and work
* with CPU/CUDA/OpenCL. */
+#pragma once
+
#ifdef __EMBREE__
# include "kernel/bvh/bvh_embree.h"
#endif
@@ -152,13 +154,11 @@ ccl_device_inline bool scene_intersect_valid(const Ray *ray)
return isfinite_safe(ray->P.x) && isfinite_safe(ray->D.x) && len_squared(ray->D) != 0.0f;
}
-ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
+ccl_device_intersect bool scene_intersect(const KernelGlobals *kg,
const Ray *ray,
const uint visibility,
Intersection *isect)
{
- PROFILING_INIT(kg, PROFILING_INTERSECT);
-
#ifdef __KERNEL_OPTIX__
uint p0 = 0;
uint p1 = 0;
@@ -238,15 +238,13 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
}
#ifdef __BVH_LOCAL__
-ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg,
+ccl_device_intersect bool scene_intersect_local(const KernelGlobals *kg,
const Ray *ray,
LocalIntersection *local_isect,
int local_object,
uint *lcg_state,
int max_hits)
{
- PROFILING_INIT(kg, PROFILING_INTERSECT_LOCAL);
-
# ifdef __KERNEL_OPTIX__
uint p0 = ((uint64_t)lcg_state) & 0xFFFFFFFF;
uint p1 = (((uint64_t)lcg_state) >> 32) & 0xFFFFFFFF;
@@ -313,8 +311,8 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg,
float3 dir = ray->D;
float3 idir = ray->D;
Transform ob_itfm;
- rtc_ray.tfar = bvh_instance_motion_push(
- kg, local_object, ray, &P, &dir, &idir, ray->t, &ob_itfm);
+ rtc_ray.tfar = ray->t *
+ bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir, &ob_itfm);
/* bvh_instance_motion_push() returns the inverse transform but
* it's not needed here. */
(void)ob_itfm;
@@ -353,15 +351,13 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg,
#endif
#ifdef __SHADOW_RECORD_ALL__
-ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
+ccl_device_intersect bool scene_intersect_shadow_all(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect,
uint visibility,
uint max_hits,
uint *num_hits)
{
- PROFILING_INIT(kg, PROFILING_INTERSECT_SHADOW_ALL);
-
# ifdef __KERNEL_OPTIX__
uint p0 = ((uint64_t)isect) & 0xFFFFFFFF;
uint p1 = (((uint64_t)isect) >> 32) & 0xFFFFFFFF;
@@ -401,17 +397,13 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_SHADOW_ALL);
ctx.isect_s = isect;
ctx.max_hits = max_hits;
- ctx.num_hits = 0;
IntersectContext rtc_ctx(&ctx);
RTCRay rtc_ray;
kernel_embree_setup_ray(*ray, rtc_ray, visibility);
rtcOccluded1(kernel_data.bvh.scene, &rtc_ctx.context, &rtc_ray);
- if (ctx.num_hits > max_hits) {
- return true;
- }
*num_hits = ctx.num_hits;
- return rtc_ray.tfar == -INFINITY;
+ return ctx.opaque_hit;
}
# endif /* __EMBREE__ */
@@ -439,13 +431,11 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
#endif /* __SHADOW_RECORD_ALL__ */
#ifdef __VOLUME__
-ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg,
+ccl_device_intersect bool scene_intersect_volume(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect,
const uint visibility)
{
- PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME);
-
# ifdef __KERNEL_OPTIX__
uint p0 = 0;
uint p1 = 0;
@@ -498,14 +488,12 @@ ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg,
#endif /* __VOLUME__ */
#ifdef __VOLUME_RECORD_ALL__
-ccl_device_intersect uint scene_intersect_volume_all(KernelGlobals *kg,
+ccl_device_intersect uint scene_intersect_volume_all(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect,
const uint max_hits,
const uint visibility)
{
- PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME_ALL);
-
if (!scene_intersect_valid(ray)) {
return false;
}
diff --git a/intern/cycles/kernel/bvh/bvh_embree.h b/intern/cycles/kernel/bvh/bvh_embree.h
index 4605c3ea51d..092d770dcac 100644
--- a/intern/cycles/kernel/bvh/bvh_embree.h
+++ b/intern/cycles/kernel/bvh/bvh_embree.h
@@ -14,14 +14,13 @@
* limitations under the License.
*/
+#pragma once
+
#include <embree3/rtcore_ray.h>
#include <embree3/rtcore_scene.h>
-// clang-format off
-#include "kernel/kernel_compat_cpu.h"
-#include "kernel/split/kernel_split_data_types.h"
-#include "kernel/kernel_globals.h"
-// clang-format on
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/globals.h"
#include "util/util_vector.h"
@@ -36,25 +35,29 @@ struct CCLIntersectContext {
RAY_VOLUME_ALL = 4,
} RayType;
- KernelGlobals *kg;
+ const KernelGlobals *kg;
RayType type;
/* for shadow rays */
Intersection *isect_s;
int max_hits;
int num_hits;
+ float max_t;
+ bool opaque_hit;
/* for SSS Rays: */
LocalIntersection *local_isect;
int local_object_id;
uint *lcg_state;
- CCLIntersectContext(KernelGlobals *kg_, RayType type_)
+ CCLIntersectContext(const KernelGlobals *kg_, RayType type_)
{
kg = kg_;
type = type_;
max_hits = 1;
num_hits = 0;
+ max_t = FLT_MAX;
+ opaque_hit = false;
isect_s = NULL;
local_isect = NULL;
local_object_id = -1;
@@ -98,7 +101,7 @@ ccl_device_inline void kernel_embree_setup_rayhit(const Ray &ray,
rayhit.hit.primID = RTC_INVALID_GEOMETRY_ID;
}
-ccl_device_inline void kernel_embree_convert_hit(KernelGlobals *kg,
+ccl_device_inline void kernel_embree_convert_hit(const KernelGlobals *kg,
const RTCRay *ray,
const RTCHit *hit,
Intersection *isect)
@@ -123,7 +126,7 @@ ccl_device_inline void kernel_embree_convert_hit(KernelGlobals *kg,
isect->type = kernel_tex_fetch(__prim_type, isect->prim);
}
-ccl_device_inline void kernel_embree_convert_sss_hit(KernelGlobals *kg,
+ccl_device_inline void kernel_embree_convert_sss_hit(const KernelGlobals *kg,
const RTCRay *ray,
const RTCHit *hit,
Intersection *isect,
diff --git a/intern/cycles/kernel/bvh/bvh_local.h b/intern/cycles/kernel/bvh/bvh_local.h
index 4006c9c1632..90b9f410b29 100644
--- a/intern/cycles/kernel/bvh/bvh_local.h
+++ b/intern/cycles/kernel/bvh/bvh_local.h
@@ -36,7 +36,7 @@ ccl_device
#else
ccl_device_inline
#endif
- bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+ bool BVH_FUNCTION_FULL_NAME(BVH)(const KernelGlobals *kg,
const Ray *ray,
LocalIntersection *local_isect,
int local_object,
@@ -74,9 +74,9 @@ ccl_device_inline
if (!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
#if BVH_FEATURE(BVH_MOTION)
Transform ob_itfm;
- isect_t = bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
+ isect_t *= bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir, &ob_itfm);
#else
- isect_t = bvh_instance_push(kg, local_object, ray, &P, &dir, &idir, isect_t);
+ isect_t *= bvh_instance_push(kg, local_object, ray, &P, &dir, &idir);
#endif
object = local_object;
}
@@ -196,7 +196,7 @@ ccl_device_inline
return false;
}
-ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
+ccl_device_inline bool BVH_FUNCTION_NAME(const KernelGlobals *kg,
const Ray *ray,
LocalIntersection *local_isect,
int local_object,
diff --git a/intern/cycles/kernel/bvh/bvh_nodes.h b/intern/cycles/kernel/bvh/bvh_nodes.h
index 5367bdb633c..15cd0f22213 100644
--- a/intern/cycles/kernel/bvh/bvh_nodes.h
+++ b/intern/cycles/kernel/bvh/bvh_nodes.h
@@ -16,7 +16,7 @@
// TODO(sergey): Look into avoid use of full Transform and use 3x3 matrix and
// 3-vector which might be faster.
-ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg,
+ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(const KernelGlobals *kg,
int node_addr,
int child)
{
@@ -28,7 +28,7 @@ ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *k
return space;
}
-ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
+ccl_device_forceinline int bvh_aligned_node_intersect(const KernelGlobals *kg,
const float3 P,
const float3 idir,
const float t,
@@ -76,7 +76,7 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
#endif
}
-ccl_device_forceinline bool bvh_unaligned_node_intersect_child(KernelGlobals *kg,
+ccl_device_forceinline bool bvh_unaligned_node_intersect_child(const KernelGlobals *kg,
const float3 P,
const float3 dir,
const float t,
@@ -102,7 +102,7 @@ ccl_device_forceinline bool bvh_unaligned_node_intersect_child(KernelGlobals *kg
return tnear <= tfar;
}
-ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
+ccl_device_forceinline int bvh_unaligned_node_intersect(const KernelGlobals *kg,
const float3 P,
const float3 dir,
const float3 idir,
@@ -134,7 +134,7 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
return mask;
}
-ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
+ccl_device_forceinline int bvh_node_intersect(const KernelGlobals *kg,
const float3 P,
const float3 dir,
const float3 idir,
diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h
index 2e94b1d7c37..0ae36fccf9b 100644
--- a/intern/cycles/kernel/bvh/bvh_shadow_all.h
+++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h
@@ -36,7 +36,7 @@ ccl_device
#else
ccl_device_inline
#endif
- bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+ bool BVH_FUNCTION_FULL_NAME(BVH)(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect_array,
const uint visibility,
@@ -68,10 +68,10 @@ ccl_device_inline
Transform ob_itfm;
#endif
- int num_hits_in_instance = 0;
+ float t_world_to_instance = 1.0f;
*num_hits = 0;
- isect_array->t = tmax;
+ Intersection *isect = isect_array;
/* traversal loop */
do {
@@ -147,13 +147,14 @@ ccl_device_inline
switch (p_type) {
case PRIMITIVE_TRIANGLE: {
- hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr);
+ hit = triangle_intersect(
+ kg, isect, P, dir, isect_t, visibility, object, prim_addr);
break;
}
#if BVH_FEATURE(BVH_MOTION)
case PRIMITIVE_MOTION_TRIANGLE: {
hit = motion_triangle_intersect(
- kg, isect_array, P, dir, ray->time, visibility, object, prim_addr);
+ kg, isect, P, dir, isect_t, ray->time, visibility, object, prim_addr);
break;
}
#endif
@@ -163,8 +164,16 @@ ccl_device_inline
case PRIMITIVE_CURVE_RIBBON:
case PRIMITIVE_MOTION_CURVE_RIBBON: {
const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
- hit = curve_intersect(
- kg, isect_array, P, dir, visibility, object, prim_addr, ray->time, curve_type);
+ hit = curve_intersect(kg,
+ isect,
+ P,
+ dir,
+ isect_t,
+ visibility,
+ object,
+ prim_addr,
+ ray->time,
+ curve_type);
break;
}
#endif
@@ -176,27 +185,49 @@ ccl_device_inline
/* shadow ray early termination */
if (hit) {
+ /* Convert intersection distance to world space. */
+ isect->t /= t_world_to_instance;
+
/* detect if this surface has a shader with transparent shadows */
/* todo: optimize so primitive visibility flag indicates if
* the primitive has a transparent shadow shader? */
- const int flags = intersection_get_shader_flags(kg, isect_array);
+ const int flags = intersection_get_shader_flags(kg, isect);
- /* if no transparent shadows, all light is blocked */
- if (!(flags & SD_HAS_TRANSPARENT_SHADOW)) {
- return true;
- }
- /* if maximum number of hits reached, block all light */
- else if (*num_hits == max_hits) {
+ if (!(flags & SD_HAS_TRANSPARENT_SHADOW) || max_hits == 0) {
+ /* If no transparent shadows, all light is blocked and we can
+ * stop immediately. */
return true;
}
- /* move on to next entry in intersections array */
- isect_array++;
+        /* Increase the number of hits, possibly beyond max_hits; we will
+ * simply not record those and only keep the max_hits closest. */
(*num_hits)++;
- num_hits_in_instance++;
- isect_array->t = isect_t;
+ if (*num_hits >= max_hits) {
+ /* If the maximum number of hits is reached, find the recorded intersection
+ * with the largest distance, so it can be replaced when a closer hit
+ * is found. */
+ const int num_recorded_hits = min(max_hits, *num_hits);
+ float max_recorded_t = isect_array[0].t;
+ int max_recorded_hit = 0;
+
+ for (int i = 1; i < num_recorded_hits; i++) {
+ if (isect_array[i].t > max_recorded_t) {
+ max_recorded_t = isect_array[i].t;
+ max_recorded_hit = i;
+ }
+ }
+
+ isect = isect_array + max_recorded_hit;
+
+ /* Limit the ray distance and stop counting hits beyond this. */
+ isect_t = max_recorded_t * t_world_to_instance;
+ }
+ else {
+ /* Still have space for intersection, use next hit. */
+ isect = isect + 1;
+ }
}
prim_addr++;
@@ -207,13 +238,14 @@ ccl_device_inline
object = kernel_tex_fetch(__prim_object, -prim_addr - 1);
#if BVH_FEATURE(BVH_MOTION)
- isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
+ t_world_to_instance = bvh_instance_motion_push(
+ kg, object, ray, &P, &dir, &idir, &ob_itfm);
#else
- isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
+ t_world_to_instance = bvh_instance_push(kg, object, ray, &P, &dir, &idir);
#endif
- num_hits_in_instance = 0;
- isect_array->t = isect_t;
+ /* Convert intersection distance to object space. */
+ isect_t *= t_world_to_instance;
++stack_ptr;
kernel_assert(stack_ptr < BVH_STACK_SIZE);
@@ -228,32 +260,19 @@ ccl_device_inline
kernel_assert(object != OBJECT_NONE);
/* Instance pop. */
- if (num_hits_in_instance) {
- float t_fac;
-
#if BVH_FEATURE(BVH_MOTION)
- bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
+ bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
#else
- bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
+ bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
#endif
- /* scale isect->t to adjust for instancing */
- for (int i = 0; i < num_hits_in_instance; i++) {
- (isect_array - i - 1)->t *= t_fac;
- }
- }
- else {
-#if BVH_FEATURE(BVH_MOTION)
- bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
-#else
- bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
-#endif
- }
-
- isect_t = tmax;
- isect_array->t = isect_t;
+ /* Restore world space ray length. If the maximum number of hits was exceeded,
+ * this distance is reduced to record only the closest hits. Otherwise use
+ * the original ray length. */
+ isect_t = (max_hits && *num_hits > max_hits) ? isect->t : tmax;
object = OBJECT_NONE;
+ t_world_to_instance = 1.0f;
node_addr = traversal_stack[stack_ptr];
--stack_ptr;
}
@@ -262,7 +281,7 @@ ccl_device_inline
return false;
}
-ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
+ccl_device_inline bool BVH_FUNCTION_NAME(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect_array,
const uint visibility,
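
The hunks above change the shadow traversal from "stop once max_hits is reached" to "keep only the max_hits closest transparent hits, replacing the farthest recorded one and clamping the ray distance". A standalone sketch of that bookkeeping (plain C++ for illustration only, not Cycles kernel code; the record_hit helper and its names are made up):

#include <algorithm>
#include <cstdio>
#include <vector>

struct Hit {
  float t;
  int prim;
};

static bool closer(const Hit &a, const Hit &b)
{
  return a.t < b.t;
}

static void record_hit(std::vector<Hit> &hits, size_t max_hits, float &t_max, Hit h)
{
  if (hits.size() < max_hits) {
    hits.push_back(h); /* Still room: just append. */
  }
  else {
    /* Full: overwrite the farthest recorded hit (the clamp below guarantees
     * that h.t is closer than it). */
    *std::max_element(hits.begin(), hits.end(), closer) = h;
  }
  if (hits.size() == max_hits) {
    /* Clamp the search distance so traversal can skip anything farther away. */
    t_max = std::max_element(hits.begin(), hits.end(), closer)->t;
  }
}

int main()
{
  std::vector<Hit> hits;
  float t_max = 1e30f;
  for (Hit h : {Hit{5.0f, 0}, Hit{2.0f, 1}, Hit{9.0f, 2}, Hit{1.0f, 3}}) {
    if (h.t < t_max) {
      record_hit(hits, 2, t_max, h);
    }
  }
  std::printf("kept %zu hits, t_max = %f\n", hits.size(), t_max);
  return 0;
}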
diff --git a/intern/cycles/kernel/bvh/bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h
index 89250a8d60a..a26d8c514f3 100644
--- a/intern/cycles/kernel/bvh/bvh_traversal.h
+++ b/intern/cycles/kernel/bvh/bvh_traversal.h
@@ -31,7 +31,7 @@
* BVH_MOTION: motion blur rendering
*/
-ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect,
const uint visibility)
@@ -136,7 +136,8 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
case PRIMITIVE_TRIANGLE: {
for (; prim_addr < prim_addr2; prim_addr++) {
kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
- if (triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr)) {
+ if (triangle_intersect(
+ kg, isect, P, dir, isect->t, visibility, object, prim_addr)) {
/* shadow ray early termination */
if (visibility & PATH_RAY_SHADOW_OPAQUE)
return true;
@@ -149,7 +150,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
for (; prim_addr < prim_addr2; prim_addr++) {
kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
if (motion_triangle_intersect(
- kg, isect, P, dir, ray->time, visibility, object, prim_addr)) {
+ kg, isect, P, dir, isect->t, ray->time, visibility, object, prim_addr)) {
/* shadow ray early termination */
if (visibility & PATH_RAY_SHADOW_OPAQUE)
return true;
@@ -166,8 +167,16 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
for (; prim_addr < prim_addr2; prim_addr++) {
const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL));
- const bool hit = curve_intersect(
- kg, isect, P, dir, visibility, object, prim_addr, ray->time, curve_type);
+ const bool hit = curve_intersect(kg,
+ isect,
+ P,
+ dir,
+ isect->t,
+ visibility,
+ object,
+ prim_addr,
+ ray->time,
+ curve_type);
if (hit) {
/* shadow ray early termination */
if (visibility & PATH_RAY_SHADOW_OPAQUE)
@@ -184,10 +193,9 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
object = kernel_tex_fetch(__prim_object, -prim_addr - 1);
#if BVH_FEATURE(BVH_MOTION)
- isect->t = bvh_instance_motion_push(
- kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
+ isect->t *= bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &ob_itfm);
#else
- isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t);
+ isect->t *= bvh_instance_push(kg, object, ray, &P, &dir, &idir);
#endif
++stack_ptr;
@@ -218,7 +226,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
return (isect->prim != PRIM_NONE);
}
-ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
+ccl_device_inline bool BVH_FUNCTION_NAME(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect,
const uint visibility)
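
Note how bvh_instance_push and bvh_instance_motion_push now return a length scale factor instead of a pre-converted distance, so callers multiply isect->t by it on push and divide by it on pop. Presumably the factor is the length of the unit world-space direction after the inverse object transform; a tiny standalone illustration (plain C++, assumed semantics, not kernel code) of why the multiply/divide pair round-trips the distance:

#include <cstdio>

int main()
{
  /* Assume the instance is scaled up by 2x in world space, so the inverse
   * (world-to-object) transform halves lengths. */
  const float inverse_scale = 0.5f;

  /* Ray directions are unit length in world space; after the inverse transform
   * the direction has this length, which is the assumed factor returned by the
   * reworked bvh_instance_push above. */
  const float t_world_to_instance = 1.0f * inverse_scale;

  float t = 10.0f;          /* current hit distance, world space */
  t *= t_world_to_instance; /* instance push: isect->t *= factor -> 5.0 */
  std::printf("object-space t = %f\n", t);
  t /= t_world_to_instance; /* instance pop: isect->t /= factor -> 10.0 */
  std::printf("world-space t  = %f\n", t);
  return 0;
}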
diff --git a/intern/cycles/kernel/bvh/bvh_types.h b/intern/cycles/kernel/bvh/bvh_types.h
index 98e6ec25d15..6039e707fc3 100644
--- a/intern/cycles/kernel/bvh/bvh_types.h
+++ b/intern/cycles/kernel/bvh/bvh_types.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __BVH_TYPES__
-#define __BVH_TYPES__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -43,5 +42,3 @@ CCL_NAMESPACE_BEGIN
#define BVH_FEATURE(f) (((BVH_FUNCTION_FEATURES) & (f)) != 0)
CCL_NAMESPACE_END
-
-#endif /* __BVH_TYPES__ */
diff --git a/intern/cycles/kernel/bvh/bvh_util.h b/intern/cycles/kernel/bvh/bvh_util.h
index b1faebce957..21384457b16 100644
--- a/intern/cycles/kernel/bvh/bvh_util.h
+++ b/intern/cycles/kernel/bvh/bvh_util.h
@@ -71,86 +71,6 @@ ccl_device_inline float3 ray_offset(float3 P, float3 Ng)
#endif
}
-/* This function should be used to compute a modified ray start position for
- * rays leaving from a surface. The algorithm slightly distorts flat surface
- * of a triangle. Surface is lifted by amount h along normal n in the incident
- * point. */
-
-ccl_device_inline float3 smooth_surface_offset(KernelGlobals *kg, ShaderData *sd, float3 Ng)
-{
- float3 V[3], N[3];
- triangle_vertices_and_normals(kg, sd->prim, V, N);
-
- const float u = sd->u, v = sd->v;
- const float w = 1 - u - v;
- float3 P = V[0] * u + V[1] * v + V[2] * w; /* Local space */
- float3 n = N[0] * u + N[1] * v + N[2] * w; /* We get away without normalization */
-
- object_normal_transform(kg, sd, &n); /* Normal x scale, world space */
-
- /* Parabolic approximation */
- float a = dot(N[2] - N[0], V[0] - V[2]);
- float b = dot(N[2] - N[1], V[1] - V[2]);
- float c = dot(N[1] - N[0], V[1] - V[0]);
- float h = a * u * (u - 1) + (a + b + c) * u * v + b * v * (v - 1);
-
- /* Check flipped normals */
- if (dot(n, Ng) > 0) {
- /* Local linear envelope */
- float h0 = max(max(dot(V[1] - V[0], N[0]), dot(V[2] - V[0], N[0])), 0.0f);
- float h1 = max(max(dot(V[0] - V[1], N[1]), dot(V[2] - V[1], N[1])), 0.0f);
- float h2 = max(max(dot(V[0] - V[2], N[2]), dot(V[1] - V[2], N[2])), 0.0f);
- h0 = max(dot(V[0] - P, N[0]) + h0, 0.0f);
- h1 = max(dot(V[1] - P, N[1]) + h1, 0.0f);
- h2 = max(dot(V[2] - P, N[2]) + h2, 0.0f);
- h = max(min(min(h0, h1), h2), h * 0.5f);
- }
- else {
- float h0 = max(max(dot(V[0] - V[1], N[0]), dot(V[0] - V[2], N[0])), 0.0f);
- float h1 = max(max(dot(V[1] - V[0], N[1]), dot(V[1] - V[2], N[1])), 0.0f);
- float h2 = max(max(dot(V[2] - V[0], N[2]), dot(V[2] - V[1], N[2])), 0.0f);
- h0 = max(dot(P - V[0], N[0]) + h0, 0.0f);
- h1 = max(dot(P - V[1], N[1]) + h1, 0.0f);
- h2 = max(dot(P - V[2], N[2]) + h2, 0.0f);
- h = min(-min(min(h0, h1), h2), h * 0.5f);
- }
-
- return n * h;
-}
-
-/* Ray offset to avoid shadow terminator artifact. */
-
-ccl_device_inline float3 ray_offset_shadow(KernelGlobals *kg, ShaderData *sd, float3 L)
-{
- float NL = dot(sd->N, L);
- bool transmit = (NL < 0.0f);
- float3 Ng = (transmit ? -sd->Ng : sd->Ng);
- float3 P = ray_offset(sd->P, Ng);
-
- if ((sd->type & PRIMITIVE_ALL_TRIANGLE) && (sd->shader & SHADER_SMOOTH_NORMAL)) {
- const float offset_cutoff =
- kernel_tex_fetch(__objects, sd->object).shadow_terminator_geometry_offset;
- /* Do ray offset (heavy stuff) only for close to be terminated triangles:
- * offset_cutoff = 0.1f means that 10-20% of rays will be affected. Also
- * make a smooth transition near the threshold. */
- if (offset_cutoff > 0.0f) {
- float NgL = dot(Ng, L);
- float offset_amount = 0.0f;
- if (NL < offset_cutoff) {
- offset_amount = clamp(2.0f - (NgL + NL) / offset_cutoff, 0.0f, 1.0f);
- }
- else {
- offset_amount = clamp(1.0f - NgL / offset_cutoff, 0.0f, 1.0f);
- }
- if (offset_amount > 0.0f) {
- P += smooth_surface_offset(kg, sd, Ng) * offset_amount;
- }
- }
- }
-
- return P;
-}
-
#if defined(__VOLUME_RECORD_ALL__) || (defined(__SHADOW_RECORD_ALL__) && defined(__KERNEL_CPU__))
/* ToDo: Move to another file? */
ccl_device int intersections_compare(const void *a, const void *b)
@@ -193,10 +113,10 @@ ccl_device_inline void sort_intersections(Intersection *hits, uint num_hits)
}
#endif /* __SHADOW_RECORD_ALL__ | __VOLUME_RECORD_ALL__ */
-/* Utility to quickly get a shader flags from an intersection. */
+/* Utility to quickly get flags from an intersection. */
-ccl_device_forceinline int intersection_get_shader_flags(KernelGlobals *ccl_restrict kg,
- const Intersection *isect)
+ccl_device_forceinline int intersection_get_shader_flags(const KernelGlobals *ccl_restrict kg,
+ const Intersection *ccl_restrict isect)
{
const int prim = kernel_tex_fetch(__prim_index, isect->prim);
int shader = 0;
@@ -217,14 +137,14 @@ ccl_device_forceinline int intersection_get_shader_flags(KernelGlobals *ccl_rest
return kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
}
-ccl_device_forceinline int intersection_get_shader(KernelGlobals *ccl_restrict kg,
- const Intersection *isect)
+ccl_device_forceinline int intersection_get_shader_from_isect_prim(
+ const KernelGlobals *ccl_restrict kg, const int isect_prim)
{
- const int prim = kernel_tex_fetch(__prim_index, isect->prim);
+ const int prim = kernel_tex_fetch(__prim_index, isect_prim);
int shader = 0;
#ifdef __HAIR__
- if (kernel_tex_fetch(__prim_type, isect->prim) & PRIMITIVE_ALL_TRIANGLE)
+ if (kernel_tex_fetch(__prim_type, isect_prim) & PRIMITIVE_ALL_TRIANGLE)
#endif
{
shader = kernel_tex_fetch(__tri_shader, prim);
@@ -239,7 +159,13 @@ ccl_device_forceinline int intersection_get_shader(KernelGlobals *ccl_restrict k
return shader & SHADER_MASK;
}
-ccl_device_forceinline int intersection_get_object(KernelGlobals *ccl_restrict kg,
+ccl_device_forceinline int intersection_get_shader(const KernelGlobals *ccl_restrict kg,
+ const Intersection *ccl_restrict isect)
+{
+ return intersection_get_shader_from_isect_prim(kg, isect->prim);
+}
+
+ccl_device_forceinline int intersection_get_object(const KernelGlobals *ccl_restrict kg,
const Intersection *ccl_restrict isect)
{
if (isect->object != OBJECT_NONE) {
@@ -249,4 +175,12 @@ ccl_device_forceinline int intersection_get_object(KernelGlobals *ccl_restrict k
return kernel_tex_fetch(__prim_object, isect->prim);
}
+ccl_device_forceinline int intersection_get_object_flags(const KernelGlobals *ccl_restrict kg,
+ const Intersection *ccl_restrict isect)
+{
+ const int object = intersection_get_object(kg, isect);
+
+ return kernel_tex_fetch(__object_flag, object);
+}
+
CCL_NAMESPACE_END
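
The intersection accessors above are split so the shader lookup can also be done from a bare prim index, and a new object-flags accessor is added. An illustrative kernel-style fragment (not part of this patch, assumes kernel context with kg and isect) showing how these helpers combine, mirroring the shadow kernel earlier in this diff:

  /* Illustrative fragment only (assumes kernel context: kg, isect). */
  const int shader_flags = intersection_get_shader_flags(kg, isect);
  const bool opaque_shadow = !(shader_flags & SD_HAS_TRANSPARENT_SHADOW);

  const int object_flags = intersection_get_object_flags(kg, isect);
  const bool has_volume = (object_flags & SD_OBJECT_HAS_VOLUME) != 0;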
diff --git a/intern/cycles/kernel/bvh/bvh_volume.h b/intern/cycles/kernel/bvh/bvh_volume.h
index 1f2ea47269b..0411d9c522d 100644
--- a/intern/cycles/kernel/bvh/bvh_volume.h
+++ b/intern/cycles/kernel/bvh/bvh_volume.h
@@ -35,7 +35,7 @@ ccl_device
#else
ccl_device_inline
#endif
- bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+ bool BVH_FUNCTION_FULL_NAME(BVH)(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect,
const uint visibility)
@@ -147,7 +147,7 @@ ccl_device_inline
if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
continue;
}
- triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr);
+ triangle_intersect(kg, isect, P, dir, isect->t, visibility, object, prim_addr);
}
break;
}
@@ -165,7 +165,7 @@ ccl_device_inline
continue;
}
motion_triangle_intersect(
- kg, isect, P, dir, ray->time, visibility, object, prim_addr);
+ kg, isect, P, dir, isect->t, ray->time, visibility, object, prim_addr);
}
break;
}
@@ -181,10 +181,9 @@ ccl_device_inline
int object_flag = kernel_tex_fetch(__object_flag, object);
if (object_flag & SD_OBJECT_HAS_VOLUME) {
#if BVH_FEATURE(BVH_MOTION)
- isect->t = bvh_instance_motion_push(
- kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
+ isect->t *= bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &ob_itfm);
#else
- isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t);
+ isect->t *= bvh_instance_push(kg, object, ray, &P, &dir, &idir);
#endif
++stack_ptr;
@@ -222,7 +221,7 @@ ccl_device_inline
return (isect->prim != PRIM_NONE);
}
-ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
+ccl_device_inline bool BVH_FUNCTION_NAME(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect,
const uint visibility)
diff --git a/intern/cycles/kernel/bvh/bvh_volume_all.h b/intern/cycles/kernel/bvh/bvh_volume_all.h
index a8664cc4331..4874270f15d 100644
--- a/intern/cycles/kernel/bvh/bvh_volume_all.h
+++ b/intern/cycles/kernel/bvh/bvh_volume_all.h
@@ -35,7 +35,7 @@ ccl_device
#else
ccl_device_inline
#endif
- uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+ uint BVH_FUNCTION_FULL_NAME(BVH)(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect_array,
const uint max_hits,
@@ -150,7 +150,8 @@ ccl_device_inline
if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
continue;
}
- hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr);
+ hit = triangle_intersect(
+ kg, isect_array, P, dir, isect_t, visibility, object, prim_addr);
if (hit) {
/* Move on to next entry in intersections array. */
isect_array++;
@@ -190,7 +191,7 @@ ccl_device_inline
continue;
}
hit = motion_triangle_intersect(
- kg, isect_array, P, dir, ray->time, visibility, object, prim_addr);
+ kg, isect_array, P, dir, isect_t, ray->time, visibility, object, prim_addr);
if (hit) {
/* Move on to next entry in intersections array. */
isect_array++;
@@ -228,10 +229,9 @@ ccl_device_inline
int object_flag = kernel_tex_fetch(__object_flag, object);
if (object_flag & SD_OBJECT_HAS_VOLUME) {
#if BVH_FEATURE(BVH_MOTION)
- isect_t = bvh_instance_motion_push(
- kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
+ isect_t *= bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &ob_itfm);
#else
- isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
+ isect_t *= bvh_instance_push(kg, object, ray, &P, &dir, &idir);
#endif
num_hits_in_instance = 0;
@@ -289,7 +289,7 @@ ccl_device_inline
return num_hits;
}
-ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg,
+ccl_device_inline uint BVH_FUNCTION_NAME(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect_array,
const uint max_hits,
diff --git a/intern/cycles/kernel/closure/alloc.h b/intern/cycles/kernel/closure/alloc.h
index 99a5a675976..72a8c2ba090 100644
--- a/intern/cycles/kernel/closure/alloc.h
+++ b/intern/cycles/kernel/closure/alloc.h
@@ -14,6 +14,8 @@
* limitations under the License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
ccl_device ShaderClosure *closure_alloc(ShaderData *sd, int size, ClosureType type, float3 weight)
diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h
index 6f2f2ebb202..4eb8bcae997 100644
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -14,6 +14,8 @@
* limitations under the License.
*/
+#pragma once
+
// clang-format off
#include "kernel/closure/bsdf_ashikhmin_velvet.h"
#include "kernel/closure/bsdf_diffuse.h"
@@ -109,7 +111,7 @@ ccl_device_inline float shift_cos_in(float cos_in, const float frequency_multipl
return val;
}
-ccl_device_inline int bsdf_sample(KernelGlobals *kg,
+ccl_device_inline int bsdf_sample(const KernelGlobals *kg,
ShaderData *sd,
const ShaderClosure *sc,
float randu,
@@ -429,21 +431,6 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
break;
# endif /* __PRINCIPLED__ */
#endif
-#ifdef __VOLUME__
- case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
- label = volume_henyey_greenstein_sample(sc,
- sd->I,
- sd->dI.dx,
- sd->dI.dy,
- randu,
- randv,
- eval,
- omega_in,
- &domega_in->dx,
- &domega_in->dy,
- pdf);
- break;
-#endif
default:
label = LABEL_NONE;
break;
@@ -482,15 +469,16 @@ ccl_device
ccl_device_inline
#endif
float3
- bsdf_eval(KernelGlobals *kg,
+ bsdf_eval(const KernelGlobals *kg,
ShaderData *sd,
const ShaderClosure *sc,
const float3 omega_in,
+ const bool is_transmission,
float *pdf)
{
- float3 eval;
+ float3 eval = zero_float3();
- if (dot(sd->N, omega_in) >= 0.0f) {
+ if (!is_transmission) {
switch (sc->type) {
case CLOSURE_BSDF_DIFFUSE_ID:
case CLOSURE_BSDF_BSSRDF_ID:
@@ -570,13 +558,7 @@ ccl_device_inline
break;
# endif /* __PRINCIPLED__ */
#endif
-#ifdef __VOLUME__
- case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
- eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
- break;
-#endif
default:
- eval = make_float3(0.0f, 0.0f, 0.0f);
break;
}
if (CLOSURE_IS_BSDF_DIFFUSE(sc->type)) {
@@ -663,13 +645,7 @@ ccl_device_inline
break;
# endif /* __PRINCIPLED__ */
#endif
-#ifdef __VOLUME__
- case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
- eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
- break;
-#endif
default:
- eval = make_float3(0.0f, 0.0f, 0.0f);
break;
}
if (CLOSURE_IS_BSDF_DIFFUSE(sc->type)) {
@@ -682,7 +658,7 @@ ccl_device_inline
return eval;
}
-ccl_device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness)
+ccl_device void bsdf_blur(const KernelGlobals *kg, ShaderClosure *sc, float roughness)
{
/* ToDo: do we want to blur volume closures? */
#ifdef __SVM__
@@ -715,55 +691,4 @@ ccl_device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness)
#endif
}
-ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b)
-{
-#ifdef __SVM__
- switch (a->type) {
- case CLOSURE_BSDF_TRANSPARENT_ID:
- return true;
- case CLOSURE_BSDF_DIFFUSE_ID:
- case CLOSURE_BSDF_BSSRDF_ID:
- case CLOSURE_BSDF_TRANSLUCENT_ID:
- return bsdf_diffuse_merge(a, b);
- case CLOSURE_BSDF_OREN_NAYAR_ID:
- return bsdf_oren_nayar_merge(a, b);
- case CLOSURE_BSDF_REFLECTION_ID:
- case CLOSURE_BSDF_REFRACTION_ID:
- case CLOSURE_BSDF_MICROFACET_GGX_ID:
- case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID:
- case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID:
- case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
- case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
- case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
- case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
- case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID:
- case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
- case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
- case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
- return bsdf_microfacet_merge(a, b);
- case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
- return bsdf_ashikhmin_velvet_merge(a, b);
- case CLOSURE_BSDF_DIFFUSE_TOON_ID:
- case CLOSURE_BSDF_GLOSSY_TOON_ID:
- return bsdf_toon_merge(a, b);
- case CLOSURE_BSDF_HAIR_REFLECTION_ID:
- case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
- return bsdf_hair_merge(a, b);
-# ifdef __PRINCIPLED__
- case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID:
- case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID:
- return bsdf_principled_diffuse_merge(a, b);
-# endif
-# ifdef __VOLUME__
- case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
- return volume_henyey_greenstein_merge(a, b);
-# endif
- default:
- return false;
- }
-#else
- return false;
-#endif
-}
-
CCL_NAMESPACE_END
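
bsdf_eval now takes an explicit is_transmission flag instead of testing dot(N, omega_in) internally, and the Henyey-Greenstein volume cases move out of the surface BSDF switch. An illustrative kernel-style fragment (not from this patch) showing one way a caller can derive the flag, reusing the test the old code performed itself:

  /* Illustrative fragment only (assumes kernel context: kg, sd, sc, omega_in). */
  const bool is_transmission = (dot(sd->N, omega_in) < 0.0f);
  float pdf = 0.0f;
  const float3 eval = bsdf_eval(kg, sd, sc, omega_in, is_transmission, &pdf);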
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
index 9814a7cf5c9..be6383e521a 100644
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
@@ -14,20 +14,19 @@
* limitations under the License.
*/
-#ifndef __BSDF_ASHIKHMIN_SHIRLEY_H__
-#define __BSDF_ASHIKHMIN_SHIRLEY_H__
-
/*
-ASHIKHMIN SHIRLEY BSDF
-
-Implementation of
-Michael Ashikhmin and Peter Shirley: "An Anisotropic Phong BRDF Model" (2000)
-
-The Fresnel factor is missing to get a separable bsdf (intensity*color), as is
-the case with all other microfacet-based BSDF implementations in Cycles.
+ * ASHIKHMIN SHIRLEY BSDF
+ *
+ * Implementation of
+ * Michael Ashikhmin and Peter Shirley: "An Anisotropic Phong BRDF Model" (2000)
+ *
+ * The Fresnel factor is missing to get a separable bsdf (intensity*color), as is
+ * the case with all other microfacet-based BSDF implementations in Cycles.
+ *
+ * Other than that, the implementation directly follows the paper.
+ */
-Other than that, the implementation directly follows the paper.
-*/
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -240,5 +239,3 @@ ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_ASHIKHMIN_SHIRLEY_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
index 3d3f20edab3..f51027f5701 100644
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
@@ -30,8 +30,9 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_ASHIKHMIN_VELVET_H__
-#define __BSDF_ASHIKHMIN_VELVET_H__
+#pragma once
+
+#include "kernel/kernel_montecarlo.h"
CCL_NAMESPACE_BEGIN
@@ -54,14 +55,6 @@ ccl_device int bsdf_ashikhmin_velvet_setup(VelvetBsdf *bsdf)
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
-ccl_device bool bsdf_ashikhmin_velvet_merge(const ShaderClosure *a, const ShaderClosure *b)
-{
- const VelvetBsdf *bsdf_a = (const VelvetBsdf *)a;
- const VelvetBsdf *bsdf_b = (const VelvetBsdf *)b;
-
- return (isequal_float3(bsdf_a->N, bsdf_b->N)) && (bsdf_a->sigma == bsdf_b->sigma);
-}
-
ccl_device float3 bsdf_ashikhmin_velvet_eval_reflect(const ShaderClosure *sc,
const float3 I,
const float3 omega_in,
@@ -175,5 +168,3 @@ ccl_device int bsdf_ashikhmin_velvet_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_ASHIKHMIN_VELVET_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse.h b/intern/cycles/kernel/closure/bsdf_diffuse.h
index ea604ed0311..1555aa30304 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_DIFFUSE_H__
-#define __BSDF_DIFFUSE_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -49,14 +48,6 @@ ccl_device int bsdf_diffuse_setup(DiffuseBsdf *bsdf)
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
-ccl_device bool bsdf_diffuse_merge(const ShaderClosure *a, const ShaderClosure *b)
-{
- const DiffuseBsdf *bsdf_a = (const DiffuseBsdf *)a;
- const DiffuseBsdf *bsdf_b = (const DiffuseBsdf *)b;
-
- return (isequal_float3(bsdf_a->N, bsdf_b->N));
-}
-
ccl_device float3 bsdf_diffuse_eval_reflect(const ShaderClosure *sc,
const float3 I,
const float3 omega_in,
@@ -174,5 +165,3 @@ ccl_device int bsdf_translucent_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_DIFFUSE_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
index aa62c1c7ceb..b06dd196b9e 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_DIFFUSE_RAMP_H__
-#define __BSDF_DIFFUSE_RAMP_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -125,5 +124,3 @@ ccl_device int bsdf_diffuse_ramp_sample(const ShaderClosure *sc,
#endif /* __OSL__ */
CCL_NAMESPACE_END
-
-#endif /* __BSDF_DIFFUSE_RAMP_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_hair.h b/intern/cycles/kernel/closure/bsdf_hair.h
index 7ca9424b815..f56f78aa1f0 100644
--- a/intern/cycles/kernel/closure/bsdf_hair.h
+++ b/intern/cycles/kernel/closure/bsdf_hair.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_HAIR_H__
-#define __BSDF_HAIR_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -62,15 +61,6 @@ ccl_device int bsdf_hair_transmission_setup(HairBsdf *bsdf)
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
-ccl_device bool bsdf_hair_merge(const ShaderClosure *a, const ShaderClosure *b)
-{
- const HairBsdf *bsdf_a = (const HairBsdf *)a;
- const HairBsdf *bsdf_b = (const HairBsdf *)b;
-
- return (isequal_float3(bsdf_a->T, bsdf_b->T)) && (bsdf_a->roughness1 == bsdf_b->roughness1) &&
- (bsdf_a->roughness2 == bsdf_b->roughness2) && (bsdf_a->offset == bsdf_b->offset);
-}
-
ccl_device float3 bsdf_hair_reflection_eval_reflect(const ShaderClosure *sc,
const float3 I,
const float3 omega_in,
@@ -309,5 +299,3 @@ ccl_device int bsdf_hair_transmission_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_HAIR_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_hair_principled.h b/intern/cycles/kernel/closure/bsdf_hair_principled.h
index f12661b3095..bfe56e5ab0e 100644
--- a/intern/cycles/kernel/closure/bsdf_hair_principled.h
+++ b/intern/cycles/kernel/closure/bsdf_hair_principled.h
@@ -14,15 +14,14 @@
* limitations under the License.
*/
+#pragma once
+
#ifdef __KERNEL_CPU__
# include <fenv.h>
#endif
#include "kernel/kernel_color.h"
-#ifndef __BSDF_HAIR_PRINCIPLED_H__
-# define __BSDF_HAIR_PRINCIPLED_H__
-
CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct PrincipledHairExtra {
@@ -181,12 +180,12 @@ ccl_device_inline float longitudinal_scattering(
}
/* Combine the three values using their luminances. */
-ccl_device_inline float4 combine_with_energy(KernelGlobals *kg, float3 c)
+ccl_device_inline float4 combine_with_energy(const KernelGlobals *kg, float3 c)
{
return make_float4(c.x, c.y, c.z, linear_rgb_to_gray(kg, c));
}
-# ifdef __HAIR__
+#ifdef __HAIR__
/* Set up the hair closure. */
ccl_device int bsdf_principled_hair_setup(ShaderData *sd, PrincipledHairBSDF *bsdf)
{
@@ -226,10 +225,10 @@ ccl_device int bsdf_principled_hair_setup(ShaderData *sd, PrincipledHairBSDF *bs
return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_NEEDS_LCG;
}
-# endif /* __HAIR__ */
+#endif /* __HAIR__ */
/* Given the Fresnel term and transmittance, generate the attenuation terms for each bounce. */
-ccl_device_inline void hair_attenuation(KernelGlobals *kg, float f, float3 T, float4 *Ap)
+ccl_device_inline void hair_attenuation(const KernelGlobals *kg, float f, float3 T, float4 *Ap)
{
/* Primary specular (R). */
Ap[0] = make_float4(f, f, f, f);
@@ -278,7 +277,7 @@ ccl_device_inline void hair_alpha_angles(float sin_theta_i,
}
/* Evaluation function for our shader. */
-ccl_device float3 bsdf_principled_hair_eval(KernelGlobals *kg,
+ccl_device float3 bsdf_principled_hair_eval(const KernelGlobals *kg,
const ShaderData *sd,
const ShaderClosure *sc,
const float3 omega_in,
@@ -356,7 +355,7 @@ ccl_device float3 bsdf_principled_hair_eval(KernelGlobals *kg,
}
/* Sampling function for the hair shader. */
-ccl_device int bsdf_principled_hair_sample(KernelGlobals *kg,
+ccl_device int bsdf_principled_hair_sample(const KernelGlobals *kg,
const ShaderClosure *sc,
ShaderData *sd,
float randu,
@@ -473,11 +472,11 @@ ccl_device int bsdf_principled_hair_sample(KernelGlobals *kg,
*omega_in = X * sin_theta_i + Y * cos_theta_i * cosf(phi_i) + Z * cos_theta_i * sinf(phi_i);
-# ifdef __RAY_DIFFERENTIALS__
+#ifdef __RAY_DIFFERENTIALS__
float3 N = safe_normalize(sd->I + *omega_in);
*domega_in_dx = (2 * dot(N, sd->dI.dx)) * N - sd->dI.dx;
*domega_in_dy = (2 * dot(N, sd->dI.dy)) * N - sd->dI.dy;
-# endif
+#endif
return LABEL_GLOSSY | ((p == 0) ? LABEL_REFLECT : LABEL_TRANSMIT);
}
@@ -501,7 +500,7 @@ ccl_device_inline float bsdf_principled_hair_albedo_roughness_scale(
return (((((0.245f * x) + 5.574f) * x - 10.73f) * x + 2.532f) * x - 0.215f) * x + 5.969f;
}
-ccl_device float3 bsdf_principled_hair_albedo(ShaderClosure *sc)
+ccl_device float3 bsdf_principled_hair_albedo(const ShaderClosure *sc)
{
PrincipledHairBSDF *bsdf = (PrincipledHairBSDF *)sc;
return exp3(-sqrt(bsdf->sigma) * bsdf_principled_hair_albedo_roughness_scale(bsdf->v));
@@ -523,5 +522,3 @@ ccl_device_inline float3 bsdf_principled_hair_sigma_from_concentration(const flo
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_HAIR_PRINCIPLED_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h
index af03bab39f7..227cb448b47 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
@@ -30,8 +30,10 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_MICROFACET_H__
-#define __BSDF_MICROFACET_H__
+#pragma once
+
+#include "kernel/kernel_lookup_table.h"
+#include "kernel/kernel_random.h"
CCL_NAMESPACE_BEGIN
@@ -53,7 +55,7 @@ static_assert(sizeof(ShaderClosure) >= sizeof(MicrofacetBsdf), "MicrofacetBsdf i
/* Beckmann and GGX microfacet importance sampling. */
-ccl_device_inline void microfacet_beckmann_sample_slopes(KernelGlobals *kg,
+ccl_device_inline void microfacet_beckmann_sample_slopes(const KernelGlobals *kg,
const float cos_theta_i,
const float sin_theta_i,
float randu,
@@ -193,7 +195,7 @@ ccl_device_inline void microfacet_ggx_sample_slopes(const float cos_theta_i,
*slope_y = S * z * safe_sqrtf(1.0f + (*slope_x) * (*slope_x));
}
-ccl_device_forceinline float3 microfacet_sample_stretched(KernelGlobals *kg,
+ccl_device_forceinline float3 microfacet_sample_stretched(const KernelGlobals *kg,
const float3 omega_i,
const float alpha_x,
const float alpha_y,
@@ -352,21 +354,6 @@ ccl_device int bsdf_microfacet_ggx_clearcoat_setup(MicrofacetBsdf *bsdf, const S
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
-ccl_device bool bsdf_microfacet_merge(const ShaderClosure *a, const ShaderClosure *b)
-{
- const MicrofacetBsdf *bsdf_a = (const MicrofacetBsdf *)a;
- const MicrofacetBsdf *bsdf_b = (const MicrofacetBsdf *)b;
-
- return (isequal_float3(bsdf_a->N, bsdf_b->N)) && (bsdf_a->alpha_x == bsdf_b->alpha_x) &&
- (bsdf_a->alpha_y == bsdf_b->alpha_y) && (isequal_float3(bsdf_a->T, bsdf_b->T)) &&
- (bsdf_a->ior == bsdf_b->ior) &&
- ((bsdf_a->extra == NULL && bsdf_b->extra == NULL) ||
- ((bsdf_a->extra && bsdf_b->extra) &&
- (isequal_float3(bsdf_a->extra->color, bsdf_b->extra->color)) &&
- (isequal_float3(bsdf_a->extra->cspec0, bsdf_b->extra->cspec0)) &&
- (bsdf_a->extra->clearcoat == bsdf_b->extra->clearcoat)));
-}
-
ccl_device int bsdf_microfacet_ggx_refraction_setup(MicrofacetBsdf *bsdf)
{
bsdf->extra = NULL;
@@ -558,7 +545,7 @@ ccl_device float3 bsdf_microfacet_ggx_eval_transmit(const ShaderClosure *sc,
return make_float3(out, out, out);
}
-ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg,
+ccl_device int bsdf_microfacet_ggx_sample(const KernelGlobals *kg,
const ShaderClosure *sc,
float3 Ng,
float3 I,
@@ -986,7 +973,7 @@ ccl_device float3 bsdf_microfacet_beckmann_eval_transmit(const ShaderClosure *sc
return make_float3(out, out, out);
}
-ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg,
+ccl_device int bsdf_microfacet_beckmann_sample(const KernelGlobals *kg,
const ShaderClosure *sc,
float3 Ng,
float3 I,
@@ -1175,5 +1162,3 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_MICROFACET_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
index 9795c8da065..68d5071dbce 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
@@ -14,6 +14,8 @@
* limitations under the License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Most of the code is based on the supplemental implementations from
@@ -466,7 +468,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_eval_reflect(const ShaderClosure *sc
bsdf->extra->cspec0);
}
-ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg,
+ccl_device int bsdf_microfacet_multi_ggx_sample(const KernelGlobals *kg,
const ShaderClosure *sc,
float3 Ng,
float3 I,
@@ -628,7 +630,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_reflect(const ShaderClosu
bsdf->extra->cspec0);
}
-ccl_device int bsdf_microfacet_multi_ggx_glass_sample(KernelGlobals *kg,
+ccl_device int bsdf_microfacet_multi_ggx_glass_sample(const KernelGlobals *kg,
const ShaderClosure *sc,
float3 Ng,
float3 I,
diff --git a/intern/cycles/kernel/closure/bsdf_oren_nayar.h b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
index 41e5736bf49..be12d47f0ea 100644
--- a/intern/cycles/kernel/closure/bsdf_oren_nayar.h
+++ b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __BSDF_OREN_NAYAR_H__
-#define __BSDF_OREN_NAYAR_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -61,14 +60,6 @@ ccl_device int bsdf_oren_nayar_setup(OrenNayarBsdf *bsdf)
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
-ccl_device bool bsdf_oren_nayar_merge(const ShaderClosure *a, const ShaderClosure *b)
-{
- const OrenNayarBsdf *bsdf_a = (const OrenNayarBsdf *)a;
- const OrenNayarBsdf *bsdf_b = (const OrenNayarBsdf *)b;
-
- return (isequal_float3(bsdf_a->N, bsdf_b->N)) && (bsdf_a->roughness == bsdf_b->roughness);
-}
-
ccl_device float3 bsdf_oren_nayar_eval_reflect(const ShaderClosure *sc,
const float3 I,
const float3 omega_in,
@@ -127,5 +118,3 @@ ccl_device int bsdf_oren_nayar_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_OREN_NAYAR_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_phong_ramp.h b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
index cf5484383f2..43f8cf71c59 100644
--- a/intern/cycles/kernel/closure/bsdf_phong_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_PHONG_RAMP_H__
-#define __BSDF_PHONG_RAMP_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -153,5 +152,3 @@ ccl_device int bsdf_phong_ramp_sample(const ShaderClosure *sc,
#endif /* __OSL__ */
CCL_NAMESPACE_END
-
-#endif /* __BSDF_PHONG_RAMP_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
index d5d012068ff..a72af519482 100644
--- a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
+++ b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
@@ -14,14 +14,15 @@
* limitations under the License.
*/
-#ifndef __BSDF_PRINCIPLED_DIFFUSE_H__
-#define __BSDF_PRINCIPLED_DIFFUSE_H__
+#pragma once
/* DISNEY PRINCIPLED DIFFUSE BRDF
*
* Shading model by Brent Burley (Disney): "Physically Based Shading at Disney" (2012)
*/
+#include "kernel/closure/bsdf_util.h"
+
CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct PrincipledDiffuseBsdf {
@@ -61,14 +62,6 @@ ccl_device int bsdf_principled_diffuse_setup(PrincipledDiffuseBsdf *bsdf)
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
-ccl_device bool bsdf_principled_diffuse_merge(const ShaderClosure *a, const ShaderClosure *b)
-{
- const PrincipledDiffuseBsdf *bsdf_a = (const PrincipledDiffuseBsdf *)a;
- const PrincipledDiffuseBsdf *bsdf_b = (const PrincipledDiffuseBsdf *)b;
-
- return (isequal_float3(bsdf_a->N, bsdf_b->N) && bsdf_a->roughness == bsdf_b->roughness);
-}
-
ccl_device float3 bsdf_principled_diffuse_eval_reflect(const ShaderClosure *sc,
const float3 I,
const float3 omega_in,
@@ -136,5 +129,3 @@ ccl_device int bsdf_principled_diffuse_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_PRINCIPLED_DIFFUSE_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_principled_sheen.h b/intern/cycles/kernel/closure/bsdf_principled_sheen.h
index 3707de29d73..60ce7e4eb75 100644
--- a/intern/cycles/kernel/closure/bsdf_principled_sheen.h
+++ b/intern/cycles/kernel/closure/bsdf_principled_sheen.h
@@ -14,14 +14,15 @@
* limitations under the License.
*/
-#ifndef __BSDF_PRINCIPLED_SHEEN_H__
-#define __BSDF_PRINCIPLED_SHEEN_H__
+#pragma once
/* DISNEY PRINCIPLED SHEEN BRDF
*
* Shading model by Brent Burley (Disney): "Physically Based Shading at Disney" (2012)
*/
+#include "kernel/closure/bsdf_util.h"
+
CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct PrincipledSheenBsdf {
@@ -137,5 +138,3 @@ ccl_device int bsdf_principled_sheen_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_PRINCIPLED_SHEEN_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_reflection.h b/intern/cycles/kernel/closure/bsdf_reflection.h
index c24ba170915..31283971d5a 100644
--- a/intern/cycles/kernel/closure/bsdf_reflection.h
+++ b/intern/cycles/kernel/closure/bsdf_reflection.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_REFLECTION_H__
-#define __BSDF_REFLECTION_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -93,5 +92,3 @@ ccl_device int bsdf_reflection_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_REFLECTION_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_refraction.h b/intern/cycles/kernel/closure/bsdf_refraction.h
index d4fbe86dac0..cfedb5dfe2c 100644
--- a/intern/cycles/kernel/closure/bsdf_refraction.h
+++ b/intern/cycles/kernel/closure/bsdf_refraction.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_REFRACTION_H__
-#define __BSDF_REFRACTION_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -111,5 +110,3 @@ ccl_device int bsdf_refraction_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_REFRACTION_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_toon.h b/intern/cycles/kernel/closure/bsdf_toon.h
index cc5de21ed0e..acdafe0f735 100644
--- a/intern/cycles/kernel/closure/bsdf_toon.h
+++ b/intern/cycles/kernel/closure/bsdf_toon.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_TOON_H__
-#define __BSDF_TOON_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -55,15 +54,6 @@ ccl_device int bsdf_diffuse_toon_setup(ToonBsdf *bsdf)
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
-ccl_device bool bsdf_toon_merge(const ShaderClosure *a, const ShaderClosure *b)
-{
- const ToonBsdf *bsdf_a = (const ToonBsdf *)a;
- const ToonBsdf *bsdf_b = (const ToonBsdf *)b;
-
- return (isequal_float3(bsdf_a->N, bsdf_b->N)) && (bsdf_a->size == bsdf_b->size) &&
- (bsdf_a->smooth == bsdf_b->smooth);
-}
-
ccl_device float3 bsdf_toon_get_intensity(float max_angle, float smooth, float angle)
{
float is;
@@ -248,5 +238,3 @@ ccl_device int bsdf_glossy_toon_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_TOON_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_transparent.h b/intern/cycles/kernel/closure/bsdf_transparent.h
index 4e5513499e8..f1dc7efb345 100644
--- a/intern/cycles/kernel/closure/bsdf_transparent.h
+++ b/intern/cycles/kernel/closure/bsdf_transparent.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_TRANSPARENT_H__
-#define __BSDF_TRANSPARENT_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -123,5 +122,3 @@ ccl_device int bsdf_transparent_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_TRANSPARENT_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_util.h b/intern/cycles/kernel/closure/bsdf_util.h
index a73dee1b045..beec5f768a1 100644
--- a/intern/cycles/kernel/closure/bsdf_util.h
+++ b/intern/cycles/kernel/closure/bsdf_util.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_UTIL_H__
-#define __BSDF_UTIL_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -150,5 +149,3 @@ interpolate_fresnel_color(float3 L, float3 H, float ior, float F0, float3 cspec0
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_UTIL_H__ */
diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h
index 562daf1286d..e095314678a 100644
--- a/intern/cycles/kernel/closure/bssrdf.h
+++ b/intern/cycles/kernel/closure/bssrdf.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __KERNEL_BSSRDF_H__
-#define __KERNEL_BSSRDF_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -24,310 +23,71 @@ typedef ccl_addr_space struct Bssrdf {
float3 radius;
float3 albedo;
- float sharpness;
- float texture_blur;
float roughness;
- float channels;
+ float anisotropy;
} Bssrdf;
static_assert(sizeof(ShaderClosure) >= sizeof(Bssrdf), "Bssrdf is too large!");
-/* Planar Truncated Gaussian
- *
- * Note how this is different from the typical gaussian, this one integrates
- * to 1 over the plane (where you get an extra 2*pi*x factor). We are lucky
- * that integrating x*exp(-x) gives a nice closed form solution. */
-
-/* paper suggests 1/12.46 which is much too small, suspect it's *12.46 */
-#define GAUSS_TRUNCATE 12.46f
-
-ccl_device float bssrdf_gaussian_eval(const float radius, float r)
-{
- /* integrate (2*pi*r * exp(-r*r/(2*v)))/(2*pi*v)) from 0 to Rm
- * = 1 - exp(-Rm*Rm/(2*v)) */
- const float v = radius * radius * (0.25f * 0.25f);
- const float Rm = sqrtf(v * GAUSS_TRUNCATE);
-
- if (r >= Rm)
- return 0.0f;
-
- return expf(-r * r / (2.0f * v)) / (2.0f * M_PI_F * v);
-}
-
-ccl_device float bssrdf_gaussian_pdf(const float radius, float r)
+ccl_device float bssrdf_dipole_compute_Rd(float alpha_prime, float fourthirdA)
{
- /* 1.0 - expf(-Rm*Rm/(2*v)) simplified */
- const float area_truncated = 1.0f - expf(-0.5f * GAUSS_TRUNCATE);
-
- return bssrdf_gaussian_eval(radius, r) * (1.0f / (area_truncated));
+ float s = sqrtf(3.0f * (1.0f - alpha_prime));
+ return 0.5f * alpha_prime * (1.0f + expf(-fourthirdA * s)) * expf(-s);
}
-ccl_device void bssrdf_gaussian_sample(const float radius, float xi, float *r, float *h)
+ccl_device float bssrdf_dipole_compute_alpha_prime(float rd, float fourthirdA)
{
- /* xi = integrate (2*pi*r * exp(-r*r/(2*v)))/(2*pi*v)) = -exp(-r^2/(2*v))
- * r = sqrt(-2*v*logf(xi)) */
- const float v = radius * radius * (0.25f * 0.25f);
- const float Rm = sqrtf(v * GAUSS_TRUNCATE);
-
- /* 1.0 - expf(-Rm*Rm/(2*v)) simplified */
- const float area_truncated = 1.0f - expf(-0.5f * GAUSS_TRUNCATE);
-
- /* r(xi) */
- const float r_squared = -2.0f * v * logf(1.0f - xi * area_truncated);
- *r = sqrtf(r_squared);
-
- /* h^2 + r^2 = Rm^2 */
- *h = safe_sqrtf(Rm * Rm - r_squared);
-}
-
-/* Planar Cubic BSSRDF falloff
- *
- * This is basically (Rm - x)^3, with some factors to normalize it. For sampling
- * we integrate 2*pi*x * (Rm - x)^3, which gives us a quintic equation that as
- * far as I can tell has no closed form solution. So we get an iterative solution
- * instead with newton-raphson. */
-
-ccl_device float bssrdf_cubic_eval(const float radius, const float sharpness, float r)
-{
- if (sharpness == 0.0f) {
- const float Rm = radius;
-
- if (r >= Rm)
- return 0.0f;
-
- /* integrate (2*pi*r * 10*(R - r)^3)/(pi * R^5) from 0 to R = 1 */
- const float Rm5 = (Rm * Rm) * (Rm * Rm) * Rm;
- const float f = Rm - r;
- const float num = f * f * f;
-
- return (10.0f * num) / (Rm5 * M_PI_F);
+ /* Invert Rd with a simple bisection solver. */
+ if (rd < 1e-4f) {
+ return 0.0f;
+ }
+ if (rd >= 0.995f) {
+ return 0.999999f;
}
- else {
- float Rm = radius * (1.0f + sharpness);
-
- if (r >= Rm)
- return 0.0f;
- /* custom variation with extra sharpness, to match the previous code */
- const float y = 1.0f / (1.0f + sharpness);
- float Rmy, ry, ryinv;
+ float x0 = 0.0f;
+ float x1 = 1.0f;
+ float xmid, fmid;
- if (sharpness == 1.0f) {
- Rmy = sqrtf(Rm);
- ry = sqrtf(r);
- ryinv = (ry > 0.0f) ? 1.0f / ry : 0.0f;
+ constexpr const int max_num_iterations = 12;
+ for (int i = 0; i < max_num_iterations; ++i) {
+ xmid = 0.5f * (x0 + x1);
+ fmid = bssrdf_dipole_compute_Rd(xmid, fourthirdA);
+ if (fmid < rd) {
+ x0 = xmid;
}
else {
- Rmy = powf(Rm, y);
- ry = powf(r, y);
- ryinv = (r > 0.0f) ? powf(r, y - 1.0f) : 0.0f;
+ x1 = xmid;
}
-
- const float Rmy5 = (Rmy * Rmy) * (Rmy * Rmy) * Rmy;
- const float f = Rmy - ry;
- const float num = f * (f * f) * (y * ryinv);
-
- return (10.0f * num) / (Rmy5 * M_PI_F);
- }
-}
-
-ccl_device float bssrdf_cubic_pdf(const float radius, const float sharpness, float r)
-{
- return bssrdf_cubic_eval(radius, sharpness, r);
-}
-
-/* solve 10x^2 - 20x^3 + 15x^4 - 4x^5 - xi == 0 */
-ccl_device_forceinline float bssrdf_cubic_quintic_root_find(float xi)
-{
- /* newton-raphson iteration, usually succeeds in 2-4 iterations, except
- * outside 0.02 ... 0.98 where it can go up to 10, so overall performance
- * should not be too bad */
- const float tolerance = 1e-6f;
- const int max_iteration_count = 10;
- float x = 0.25f;
- int i;
-
- for (i = 0; i < max_iteration_count; i++) {
- float x2 = x * x;
- float x3 = x2 * x;
- float nx = (1.0f - x);
-
- float f = 10.0f * x2 - 20.0f * x3 + 15.0f * x2 * x2 - 4.0f * x2 * x3 - xi;
- float f_ = 20.0f * (x * nx) * (nx * nx);
-
- if (fabsf(f) < tolerance || f_ == 0.0f)
- break;
-
- x = saturate(x - f / f_);
}
- return x;
+ return xmid;
}
-ccl_device void bssrdf_cubic_sample(
- const float radius, const float sharpness, float xi, float *r, float *h)
+ccl_device void bssrdf_setup_radius(Bssrdf *bssrdf, const ClosureType type, const float eta)
{
- float Rm = radius;
- float r_ = bssrdf_cubic_quintic_root_find(xi);
-
- if (sharpness != 0.0f) {
- r_ = powf(r_, 1.0f + sharpness);
- Rm *= (1.0f + sharpness);
- }
-
- r_ *= Rm;
- *r = r_;
-
- /* h^2 + r^2 = Rm^2 */
- *h = safe_sqrtf(Rm * Rm - r_ * r_);
-}
-
-/* Approximate Reflectance Profiles
- * http://graphics.pixar.com/library/ApproxBSSRDF/paper.pdf
- */
-
-/* This is a bit arbitrary, just need big enough radius so it matches
- * the mean free length, but still not too big so sampling is still
- * effective. Might need some further tweaks.
- */
-#define BURLEY_TRUNCATE 16.0f
-#define BURLEY_TRUNCATE_CDF 0.9963790093708328f // cdf(BURLEY_TRUNCATE)
-
-ccl_device_inline float bssrdf_burley_fitting(float A)
-{
- /* Diffuse surface transmission, equation (6). */
- return 1.9f - A + 3.5f * (A - 0.8f) * (A - 0.8f);
-}
-
-/* Scale mean free path length so it gives similar looking result
- * to Cubic and Gaussian models.
- */
-ccl_device_inline float3 bssrdf_burley_compatible_mfp(float3 r)
-{
- return 0.25f * M_1_PI_F * r;
-}
-
-ccl_device void bssrdf_burley_setup(Bssrdf *bssrdf)
-{
- /* Mean free path length. */
- const float3 l = bssrdf_burley_compatible_mfp(bssrdf->radius);
- /* Surface albedo. */
- const float3 A = bssrdf->albedo;
- const float3 s = make_float3(
- bssrdf_burley_fitting(A.x), bssrdf_burley_fitting(A.y), bssrdf_burley_fitting(A.z));
-
- bssrdf->radius = l / s;
-}
-
-ccl_device float bssrdf_burley_eval(const float d, float r)
-{
- const float Rm = BURLEY_TRUNCATE * d;
-
- if (r >= Rm)
- return 0.0f;
-
- /* Burley reflectance profile, equation (3).
- *
- * NOTES:
- * - Surface albedo is already included into sc->weight, no need to
- * multiply by this term here.
- * - This is normalized diffuse model, so the equation is multiplied
- * by 2*pi, which also matches cdf().
- */
- float exp_r_3_d = expf(-r / (3.0f * d));
- float exp_r_d = exp_r_3_d * exp_r_3_d * exp_r_3_d;
- return (exp_r_d + exp_r_3_d) / (4.0f * d);
-}
-
-ccl_device float bssrdf_burley_pdf(const float d, float r)
-{
- return bssrdf_burley_eval(d, r) * (1.0f / BURLEY_TRUNCATE_CDF);
-}
-
-/* Find the radius for desired CDF value.
- * Returns scaled radius, meaning the result is to be scaled up by d.
- * Since there's no closed form solution we do Newton-Raphson method to find it.
- */
-ccl_device_forceinline float bssrdf_burley_root_find(float xi)
-{
- const float tolerance = 1e-6f;
- const int max_iteration_count = 10;
- /* Do initial guess based on manual curve fitting, this allows us to reduce
- * number of iterations to maximum 4 across the [0..1] range. We keep maximum
- * number of iteration higher just to be sure we didn't miss root in some
- * corner case.
- */
- float r;
- if (xi <= 0.9f) {
- r = expf(xi * xi * 2.4f) - 1.0f;
+ if (type == CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID) {
+ /* Scale the mean free path length so it gives a similar-looking result to the
+ * older Cubic, Gaussian and Burley models. */
+ bssrdf->radius *= 0.25f * M_1_PI_F;
}
else {
- /* TODO(sergey): Some nicer curve fit is possible here. */
- r = 15.0f;
- }
- /* Solve against scaled radius. */
- for (int i = 0; i < max_iteration_count; i++) {
- float exp_r_3 = expf(-r / 3.0f);
- float exp_r = exp_r_3 * exp_r_3 * exp_r_3;
- float f = 1.0f - 0.25f * exp_r - 0.75f * exp_r_3 - xi;
- float f_ = 0.25f * exp_r + 0.25f * exp_r_3;
+ /* Adjust radius based on IOR and albedo. */
+ const float inv_eta = 1.0f / eta;
+ const float F_dr = inv_eta * (-1.440f * inv_eta + 0.710f) + 0.668f + 0.0636f * eta;
+ const float fourthirdA = (4.0f / 3.0f) * (1.0f + F_dr) /
+ (1.0f - F_dr); /* From Jensen's `Fdr` ratio formula. */
- if (fabsf(f) < tolerance || f_ == 0.0f) {
- break;
- }
+ const float3 alpha_prime = make_float3(
+ bssrdf_dipole_compute_alpha_prime(bssrdf->albedo.x, fourthirdA),
+ bssrdf_dipole_compute_alpha_prime(bssrdf->albedo.y, fourthirdA),
+ bssrdf_dipole_compute_alpha_prime(bssrdf->albedo.z, fourthirdA));
- r = r - f / f_;
- if (r < 0.0f) {
- r = 0.0f;
- }
+ bssrdf->radius *= sqrt(3.0f * (one_float3() - alpha_prime));
}
- return r;
}
-ccl_device void bssrdf_burley_sample(const float d, float xi, float *r, float *h)
-{
- const float Rm = BURLEY_TRUNCATE * d;
- const float r_ = bssrdf_burley_root_find(xi * BURLEY_TRUNCATE_CDF) * d;
-
- *r = r_;
-
- /* h^2 + r^2 = Rm^2 */
- *h = safe_sqrtf(Rm * Rm - r_ * r_);
-}
-
-/* None BSSRDF falloff
- *
- * Samples distributed over disk with no falloff, for reference. */
-
-ccl_device float bssrdf_none_eval(const float radius, float r)
-{
- const float Rm = radius;
- return (r < Rm) ? 1.0f : 0.0f;
-}
-
-ccl_device float bssrdf_none_pdf(const float radius, float r)
-{
- /* integrate (2*pi*r)/(pi*Rm*Rm) from 0 to Rm = 1 */
- const float Rm = radius;
- const float area = (M_PI_F * Rm * Rm);
-
- return bssrdf_none_eval(radius, r) / area;
-}
-
-ccl_device void bssrdf_none_sample(const float radius, float xi, float *r, float *h)
-{
- /* xi = integrate (2*pi*r)/(pi*Rm*Rm) = r^2/Rm^2
- * r = sqrt(xi)*Rm */
- const float Rm = radius;
- const float r_ = sqrtf(xi) * Rm;
-
- *r = r_;
-
- /* h^2 + r^2 = Rm^2 */
- *h = safe_sqrtf(Rm * Rm - r_ * r_);
-}
-
-/* Generic */
+/* Setup */
ccl_device_inline Bssrdf *bssrdf_alloc(ShaderData *sd, float3 weight)
{
@@ -342,7 +102,7 @@ ccl_device_inline Bssrdf *bssrdf_alloc(ShaderData *sd, float3 weight)
return (sample_weight >= CLOSURE_WEIGHT_CUTOFF) ? bssrdf : NULL;
}
-ccl_device int bssrdf_setup(ShaderData *sd, Bssrdf *bssrdf, ClosureType type)
+ccl_device int bssrdf_setup(ShaderData *sd, Bssrdf *bssrdf, ClosureType type, const float ior)
{
int flag = 0;
int bssrdf_channels = 3;
@@ -371,7 +131,7 @@ ccl_device int bssrdf_setup(ShaderData *sd, Bssrdf *bssrdf, ClosureType type)
if (bssrdf_channels < 3) {
/* Add diffuse BSDF if any radius too small. */
#ifdef __PRINCIPLED__
- if (type == CLOSURE_BSSRDF_PRINCIPLED_ID || type == CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID) {
+ if (bssrdf->roughness != FLT_MAX) {
float roughness = bssrdf->roughness;
float3 N = bssrdf->N;
@@ -401,16 +161,9 @@ ccl_device int bssrdf_setup(ShaderData *sd, Bssrdf *bssrdf, ClosureType type)
/* Setup BSSRDF if radius is large enough. */
if (bssrdf_channels > 0) {
bssrdf->type = type;
- bssrdf->channels = bssrdf_channels;
- bssrdf->sample_weight = fabsf(average(bssrdf->weight)) * bssrdf->channels;
- bssrdf->texture_blur = saturate(bssrdf->texture_blur);
- bssrdf->sharpness = saturate(bssrdf->sharpness);
+ bssrdf->sample_weight = fabsf(average(bssrdf->weight)) * bssrdf_channels;
- if (type == CLOSURE_BSSRDF_BURLEY_ID || type == CLOSURE_BSSRDF_PRINCIPLED_ID ||
- type == CLOSURE_BSSRDF_RANDOM_WALK_ID ||
- type == CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID) {
- bssrdf_burley_setup(bssrdf);
- }
+ bssrdf_setup_radius(bssrdf, type, ior);
flag |= SD_BSSRDF;
}
@@ -422,77 +175,4 @@ ccl_device int bssrdf_setup(ShaderData *sd, Bssrdf *bssrdf, ClosureType type)
return flag;
}
-ccl_device void bssrdf_sample(const ShaderClosure *sc, float xi, float *r, float *h)
-{
- const Bssrdf *bssrdf = (const Bssrdf *)sc;
- float radius;
-
- /* Sample color channel and reuse random number. Only a subset of channels
- * may be used if their radius was too small to handle as BSSRDF. */
- xi *= bssrdf->channels;
-
- if (xi < 1.0f) {
- radius = (bssrdf->radius.x > 0.0f) ? bssrdf->radius.x :
- (bssrdf->radius.y > 0.0f) ? bssrdf->radius.y :
- bssrdf->radius.z;
- }
- else if (xi < 2.0f) {
- xi -= 1.0f;
- radius = (bssrdf->radius.x > 0.0f && bssrdf->radius.y > 0.0f) ? bssrdf->radius.y :
- bssrdf->radius.z;
- }
- else {
- xi -= 2.0f;
- radius = bssrdf->radius.z;
- }
-
- /* Sample BSSRDF. */
- if (bssrdf->type == CLOSURE_BSSRDF_CUBIC_ID) {
- bssrdf_cubic_sample(radius, bssrdf->sharpness, xi, r, h);
- }
- else if (bssrdf->type == CLOSURE_BSSRDF_GAUSSIAN_ID) {
- bssrdf_gaussian_sample(radius, xi, r, h);
- }
- else { /* if (bssrdf->type == CLOSURE_BSSRDF_BURLEY_ID ||
- * bssrdf->type == CLOSURE_BSSRDF_PRINCIPLED_ID) */
- bssrdf_burley_sample(radius, xi, r, h);
- }
-}
-
-ccl_device float bssrdf_channel_pdf(const Bssrdf *bssrdf, float radius, float r)
-{
- if (radius == 0.0f) {
- return 0.0f;
- }
- else if (bssrdf->type == CLOSURE_BSSRDF_CUBIC_ID) {
- return bssrdf_cubic_pdf(radius, bssrdf->sharpness, r);
- }
- else if (bssrdf->type == CLOSURE_BSSRDF_GAUSSIAN_ID) {
- return bssrdf_gaussian_pdf(radius, r);
- }
- else { /* if (bssrdf->type == CLOSURE_BSSRDF_BURLEY_ID ||
- * bssrdf->type == CLOSURE_BSSRDF_PRINCIPLED_ID)*/
- return bssrdf_burley_pdf(radius, r);
- }
-}
-
-ccl_device_forceinline float3 bssrdf_eval(const ShaderClosure *sc, float r)
-{
- const Bssrdf *bssrdf = (const Bssrdf *)sc;
-
- return make_float3(bssrdf_channel_pdf(bssrdf, bssrdf->radius.x, r),
- bssrdf_channel_pdf(bssrdf, bssrdf->radius.y, r),
- bssrdf_channel_pdf(bssrdf, bssrdf->radius.z, r));
-}
-
-ccl_device_forceinline float bssrdf_pdf(const ShaderClosure *sc, float r)
-{
- const Bssrdf *bssrdf = (const Bssrdf *)sc;
- float3 pdf = bssrdf_eval(sc, r);
-
- return (pdf.x + pdf.y + pdf.z) / bssrdf->channels;
-}
-
CCL_NAMESPACE_END
-
-#endif /* __KERNEL_BSSRDF_H__ */
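
The rewritten bssrdf.h drops the Cubic/Gaussian/Burley profile code in favor of a dipole-based radius fit: Rd(alpha') = 0.5 * alpha' * (1 + exp(-(4/3) * A * s)) * exp(-s) with s = sqrt(3 * (1 - alpha')), inverted by bisection per color channel and turned into the radius scale sqrt(3 * (1 - alpha')). A standalone sketch reproducing the same fit (plain C++ for illustration only; the eta and albedo values are arbitrary):

#include <cmath>
#include <cstdio>

/* Rd(alpha') from the dipole approximation, as in the patch above. */
static float dipole_Rd(float alpha_prime, float fourthirdA)
{
  const float s = std::sqrt(3.0f * (1.0f - alpha_prime));
  return 0.5f * alpha_prime * (1.0f + std::exp(-fourthirdA * s)) * std::exp(-s);
}

/* Invert Rd by bisection (the kernel additionally clamps extreme albedos). */
static float dipole_alpha_prime(float rd, float fourthirdA)
{
  float x0 = 0.0f, x1 = 1.0f, xmid = 0.5f;
  for (int i = 0; i < 16; i++) {
    xmid = 0.5f * (x0 + x1);
    if (dipole_Rd(xmid, fourthirdA) < rd) {
      x0 = xmid;
    }
    else {
      x1 = xmid;
    }
  }
  return xmid;
}

int main()
{
  const float eta = 1.4f; /* illustration value, not taken from the patch */
  const float inv_eta = 1.0f / eta;
  const float F_dr = inv_eta * (-1.440f * inv_eta + 0.710f) + 0.668f + 0.0636f * eta;
  const float fourthirdA = (4.0f / 3.0f) * (1.0f + F_dr) / (1.0f - F_dr);

  const float albedo = 0.8f; /* one color channel for brevity */
  const float alpha_prime = dipole_alpha_prime(albedo, fourthirdA);
  std::printf("alpha' = %f, radius scale = %f\n",
              alpha_prime,
              std::sqrt(3.0f * (1.0f - alpha_prime)));
  return 0;
}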
diff --git a/intern/cycles/kernel/closure/emissive.h b/intern/cycles/kernel/closure/emissive.h
index 911382e6865..a2519d97618 100644
--- a/intern/cycles/kernel/closure/emissive.h
+++ b/intern/cycles/kernel/closure/emissive.h
@@ -30,6 +30,8 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* BACKGROUND CLOSURE */
diff --git a/intern/cycles/kernel/closure/volume.h b/intern/cycles/kernel/closure/volume.h
index 1430f712701..69959a3f21b 100644
--- a/intern/cycles/kernel/closure/volume.h
+++ b/intern/cycles/kernel/closure/volume.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __VOLUME_H__
-#define __VOLUME_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -62,21 +61,12 @@ ccl_device int volume_henyey_greenstein_setup(HenyeyGreensteinVolume *volume)
return SD_SCATTER;
}
-ccl_device bool volume_henyey_greenstein_merge(const ShaderClosure *a, const ShaderClosure *b)
-{
- const HenyeyGreensteinVolume *volume_a = (const HenyeyGreensteinVolume *)a;
- const HenyeyGreensteinVolume *volume_b = (const HenyeyGreensteinVolume *)b;
-
- return (volume_a->g == volume_b->g);
-}
-
-ccl_device float3 volume_henyey_greenstein_eval_phase(const ShaderClosure *sc,
+ccl_device float3 volume_henyey_greenstein_eval_phase(const ShaderVolumeClosure *svc,
const float3 I,
float3 omega_in,
float *pdf)
{
- const HenyeyGreensteinVolume *volume = (const HenyeyGreensteinVolume *)sc;
- float g = volume->g;
+ float g = svc->g;
/* note that I points towards the viewer */
if (fabsf(g) < 1e-3f) {
@@ -122,7 +112,7 @@ henyey_greenstrein_sample(float3 D, float g, float randu, float randv, float *pd
return dir;
}
-ccl_device int volume_henyey_greenstein_sample(const ShaderClosure *sc,
+ccl_device int volume_henyey_greenstein_sample(const ShaderVolumeClosure *svc,
float3 I,
float3 dIdx,
float3 dIdy,
@@ -134,8 +124,7 @@ ccl_device int volume_henyey_greenstein_sample(const ShaderClosure *sc,
float3 *domega_in_dy,
float *pdf)
{
- const HenyeyGreensteinVolume *volume = (const HenyeyGreensteinVolume *)sc;
- float g = volume->g;
+ float g = svc->g;
/* note that I points towards the viewer and so is used negated */
*omega_in = henyey_greenstrein_sample(-I, g, randu, randv, pdf);
@@ -153,17 +142,15 @@ ccl_device int volume_henyey_greenstein_sample(const ShaderClosure *sc,
/* VOLUME CLOSURE */
ccl_device float3 volume_phase_eval(const ShaderData *sd,
- const ShaderClosure *sc,
+ const ShaderVolumeClosure *svc,
float3 omega_in,
float *pdf)
{
- kernel_assert(sc->type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID);
-
- return volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
+ return volume_henyey_greenstein_eval_phase(svc, sd->I, omega_in, pdf);
}
ccl_device int volume_phase_sample(const ShaderData *sd,
- const ShaderClosure *sc,
+ const ShaderVolumeClosure *svc,
float randu,
float randv,
float3 *eval,
@@ -171,31 +158,65 @@ ccl_device int volume_phase_sample(const ShaderData *sd,
differential3 *domega_in,
float *pdf)
{
- int label;
-
- switch (sc->type) {
- case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
- label = volume_henyey_greenstein_sample(sc,
- sd->I,
- sd->dI.dx,
- sd->dI.dy,
- randu,
- randv,
- eval,
- omega_in,
- &domega_in->dx,
- &domega_in->dy,
- pdf);
- break;
- default:
- *eval = make_float3(0.0f, 0.0f, 0.0f);
- label = LABEL_NONE;
- break;
+ return volume_henyey_greenstein_sample(svc,
+ sd->I,
+ sd->dI.dx,
+ sd->dI.dy,
+ randu,
+ randv,
+ eval,
+ omega_in,
+ &domega_in->dx,
+ &domega_in->dy,
+ pdf);
+}
+
+/* Volume sampling utilities. */
+
+/* todo: this value could be tweaked or turned into a probability to avoid
+ * unnecessary work in volumes and subsurface scattering. */
+#define VOLUME_THROUGHPUT_EPSILON 1e-6f
+
+ccl_device float3 volume_color_transmittance(float3 sigma, float t)
+{
+ return exp3(-sigma * t);
+}
+
+ccl_device float volume_channel_get(float3 value, int channel)
+{
+ return (channel == 0) ? value.x : ((channel == 1) ? value.y : value.z);
+}
+
+ccl_device int volume_sample_channel(float3 albedo, float3 throughput, float rand, float3 *pdf)
+{
+ /* Sample color channel proportional to throughput and single scattering
+   * albedo, to significantly reduce noise with many bounces, following:
+ *
+ * "Practical and Controllable Subsurface Scattering for Production Path
+ * Tracing". Matt Jen-Yuan Chiang, Peter Kutz, Brent Burley. SIGGRAPH 2016. */
+ float3 weights = fabs(throughput * albedo);
+ float sum_weights = weights.x + weights.y + weights.z;
+ float3 weights_pdf;
+
+ if (sum_weights > 0.0f) {
+ weights_pdf = weights / sum_weights;
}
+ else {
+ weights_pdf = make_float3(1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f);
+ }
+
+ *pdf = weights_pdf;
- return label;
+ /* OpenCL does not support -> on float3, so don't use pdf->x. */
+ if (rand < weights_pdf.x) {
+ return 0;
+ }
+ else if (rand < weights_pdf.x + weights_pdf.y) {
+ return 1;
+ }
+ else {
+ return 2;
+ }
}
CCL_NAMESPACE_END
-
-#endif
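Editorial aside (not part of the patch): the channel selection added in volume_sample_channel() above is easiest to check with concrete numbers. The standalone sketch below uses made-up throughput and albedo values purely for illustration and mirrors the same weighting and normalization.

#include <cmath>
#include <cstdio>

int main()
{
  /* Hypothetical inputs, chosen only to make the arithmetic visible. */
  const float throughput[3] = {0.8f, 0.5f, 0.1f};
  const float albedo[3] = {0.9f, 0.9f, 0.2f};

  /* Same weighting as volume_sample_channel(): |throughput * albedo|. */
  float weights[3], sum = 0.0f;
  for (int i = 0; i < 3; i++) {
    weights[i] = std::fabs(throughput[i] * albedo[i]);
    sum += weights[i];
  }

  /* The normalized pdf is roughly (0.605, 0.378, 0.017), so a random number
   * below 0.605 selects channel 0, below 0.983 channel 1, otherwise channel 2:
   * channels carrying more energy are sampled more often. */
  for (int i = 0; i < 3; i++) {
    std::printf("pdf[%d] = %.3f\n", i, weights[i] / sum);
  }
  return 0;
}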
diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/device/cpu/compat.h
index 88f6a264a5a..bfd936c7bbd 100644
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/device/cpu/compat.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __KERNEL_COMPAT_CPU_H__
-#define __KERNEL_COMPAT_CPU_H__
+#pragma once
#define __KERNEL_CPU__
@@ -27,14 +26,6 @@
# pragma GCC diagnostic ignored "-Wuninitialized"
#endif
-/* Selective nodes compilation. */
-#ifndef __NODES_MAX_GROUP__
-# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
-#endif
-#ifndef __NODES_FEATURES__
-# define __NODES_FEATURES__ NODE_FEATURE_ALL
-#endif
-
#include "util/util_half.h"
#include "util/util_math.h"
#include "util/util_simd.h"
@@ -43,15 +34,6 @@
#define ccl_addr_space
-#define ccl_local_id(d) 0
-#define ccl_global_id(d) (kg->global_id[d])
-
-#define ccl_local_size(d) 1
-#define ccl_global_size(d) (kg->global_size[d])
-
-#define ccl_group_id(d) ccl_global_id(d)
-#define ccl_num_groups(d) ccl_global_size(d)
-
/* On x86_64, versions of glibc < 2.16 have an issue where expf is
* much slower than the double version. This was fixed in glibc 2.16.
*/
@@ -72,37 +54,11 @@ CCL_NAMESPACE_BEGIN
* simple arrays and after inlining fetch hopefully revert to being a simple
* pointer lookup. */
template<typename T> struct texture {
- ccl_always_inline const T &fetch(int index)
+ ccl_always_inline const T &fetch(int index) const
{
kernel_assert(index >= 0 && index < width);
return data[index];
}
-#if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
- /* Reads 256 bytes but indexes in blocks of 128 bytes to maintain
- * compatibility with existing indices and data structures.
- */
- ccl_always_inline avxf fetch_avxf(const int index)
- {
- kernel_assert(index >= 0 && (index + 1) < width);
- ssef *ssef_data = (ssef *)data;
- ssef *ssef_node_data = &ssef_data[index];
- return _mm256_loadu_ps((float *)ssef_node_data);
- }
-#endif
-
-#ifdef __KERNEL_SSE2__
- ccl_always_inline ssef fetch_ssef(int index)
- {
- kernel_assert(index >= 0 && index < width);
- return ((ssef *)data)[index];
- }
-
- ccl_always_inline ssei fetch_ssei(int index)
- {
- kernel_assert(index >= 0 && index < width);
- return ((ssei *)data)[index];
- }
-#endif
T *data;
int width;
@@ -110,15 +66,6 @@ template<typename T> struct texture {
/* Macros to handle different memory storage on different devices */
-#define kernel_tex_fetch(tex, index) (kg->tex.fetch(index))
-#define kernel_tex_fetch_avxf(tex, index) (kg->tex.fetch_avxf(index))
-#define kernel_tex_fetch_ssef(tex, index) (kg->tex.fetch_ssef(index))
-#define kernel_tex_fetch_ssei(tex, index) (kg->tex.fetch_ssei(index))
-#define kernel_tex_lookup(tex, t, offset, size) (kg->tex.lookup(t, offset, size))
-#define kernel_tex_array(tex) (kg->tex.data)
-
-#define kernel_data (kg->__data)
-
#ifdef __KERNEL_SSE2__
typedef vector3<sseb> sse3b;
typedef vector3<ssef> sse3f;
@@ -152,5 +99,3 @@ typedef vector3<avxf> avx3f;
#endif
CCL_NAMESPACE_END
-
-#endif /* __KERNEL_COMPAT_CPU_H__ */
diff --git a/intern/cycles/kernel/device/cpu/globals.h b/intern/cycles/kernel/device/cpu/globals.h
new file mode 100644
index 00000000000..98b036e269d
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/globals.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Constant Globals */
+
+#pragma once
+
+#include "kernel/kernel_profiling.h"
+#include "kernel/kernel_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* On the CPU, we pass the struct KernelGlobals along to nearly everywhere in
+ * the kernel to access constant data. These are all stored as "textures", but
+ * they are really just standard arrays. We can't actually use globals because
+ * multiple renders may be running inside the same process. */
+
+#ifdef __OSL__
+struct OSLGlobals;
+struct OSLThreadData;
+struct OSLShadingSystem;
+#endif
+
+typedef struct KernelGlobals {
+#define KERNEL_TEX(type, name) texture<type> name;
+#include "kernel/kernel_textures.h"
+
+ KernelData __data;
+
+#ifdef __OSL__
+ /* On the CPU, we also have the OSL globals here. Most data structures are shared
+ * with SVM; the difference is in the shaders and object/mesh attributes. */
+ OSLGlobals *osl;
+ OSLShadingSystem *osl_ss;
+ OSLThreadData *osl_tdata;
+#endif
+
+ /* **** Run-time data **** */
+
+ ProfilingState profiler;
+} KernelGlobals;
+
+/* Abstraction macros */
+#define kernel_tex_fetch(tex, index) (kg->tex.fetch(index))
+#define kernel_tex_array(tex) (kg->tex.data)
+#define kernel_data (kg->__data)
+
+CCL_NAMESPACE_END
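Editorial aside (not part of the patch): the abstraction macros at the end of this new header are what kernel code actually goes through. The sketch below shows the access pattern only; "__example_table" is a made-up name standing in for one of the KERNEL_TEX entries generated from kernel/kernel_textures.h, so the snippet would only compile if such an entry existed.

/* Illustrative only: sum the first n entries of a hypothetical texture. */
ccl_device float sum_example_table(const KernelGlobals *kg, int n)
{
  float sum = 0.0f;
  for (int i = 0; i < n; i++) {
    /* Expands to kg->__example_table.fetch(i), a bounds-checked array read. */
    sum += kernel_tex_fetch(__example_table, i);
  }
  return sum;
}

For scalar constants the same pattern applies through kernel_data, which expands to kg->__data.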
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h b/intern/cycles/kernel/device/cpu/image.h
index 59b96c86c50..57e81ab186d 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h
+++ b/intern/cycles/kernel/device/cpu/image.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __KERNEL_CPU_IMAGE_H__
-#define __KERNEL_CPU_IMAGE_H__
+#pragma once
#ifdef WITH_NANOVDB
# define NANOVDB_USE_INTRINSICS
@@ -584,7 +583,7 @@ template<typename T> struct NanoVDBInterpolator {
#undef SET_CUBIC_SPLINE_WEIGHTS
-ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y)
+ccl_device float4 kernel_tex_image_interp(const KernelGlobals *kg, int id, float x, float y)
{
const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
@@ -612,7 +611,7 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl
}
}
-ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg,
+ccl_device float4 kernel_tex_image_interp_3d(const KernelGlobals *kg,
int id,
float3 P,
InterpolationType interp)
@@ -656,5 +655,3 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg,
} /* Namespace. */
CCL_NAMESPACE_END
-
-#endif // __KERNEL_CPU_IMAGE_H__
diff --git a/intern/cycles/kernel/kernels/cpu/kernel.cpp b/intern/cycles/kernel/device/cpu/kernel.cpp
index 8040bfb7b33..ac1cdf5fffe 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel.cpp
@@ -56,9 +56,9 @@
/* do nothing */
#endif
-#include "kernel/kernel.h"
+#include "kernel/device/cpu/kernel.h"
#define KERNEL_ARCH cpu
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#include "kernel/device/cpu/kernel_arch_impl.h"
CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/device/cpu/kernel.h
index b907c6a2bac..ae2a841835a 100644
--- a/intern/cycles/kernel/kernel.h
+++ b/intern/cycles/kernel/device/cpu/kernel.h
@@ -14,50 +14,49 @@
* limitations under the License.
*/
-#ifndef __KERNEL_H__
-#define __KERNEL_H__
+#pragma once
/* CPU Kernel Interface */
-#include "kernel/kernel_types.h"
#include "util/util_types.h"
+#include "kernel/kernel_types.h"
+
CCL_NAMESPACE_BEGIN
#define KERNEL_NAME_JOIN(x, y, z) x##_##y##_##z
#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name)
#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)
+struct IntegratorStateCPU;
struct KernelGlobals;
struct KernelData;
KernelGlobals *kernel_globals_create();
void kernel_globals_free(KernelGlobals *kg);
-void *kernel_osl_memory(KernelGlobals *kg);
-bool kernel_osl_use(KernelGlobals *kg);
+void *kernel_osl_memory(const KernelGlobals *kg);
+bool kernel_osl_use(const KernelGlobals *kg);
void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t size);
void kernel_global_memory_copy(KernelGlobals *kg, const char *name, void *mem, size_t size);
#define KERNEL_ARCH cpu
-#include "kernel/kernels/cpu/kernel_cpu.h"
+#include "kernel/device/cpu/kernel_arch.h"
#define KERNEL_ARCH cpu_sse2
-#include "kernel/kernels/cpu/kernel_cpu.h"
+#include "kernel/device/cpu/kernel_arch.h"
#define KERNEL_ARCH cpu_sse3
-#include "kernel/kernels/cpu/kernel_cpu.h"
+#include "kernel/device/cpu/kernel_arch.h"
#define KERNEL_ARCH cpu_sse41
-#include "kernel/kernels/cpu/kernel_cpu.h"
+#include "kernel/device/cpu/kernel_arch.h"
#define KERNEL_ARCH cpu_avx
-#include "kernel/kernels/cpu/kernel_cpu.h"
+#include "kernel/device/cpu/kernel_arch.h"
#define KERNEL_ARCH cpu_avx2
-#include "kernel/kernels/cpu/kernel_cpu.h"
+#include "kernel/device/cpu/kernel_arch.h"
CCL_NAMESPACE_END
-
-#endif /* __KERNEL_H__ */
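Editorial aside (not part of the patch): the per-architecture include pattern above is easier to read with one expansion written out. With KERNEL_ARCH defined to cpu_avx2, the name-building macros resolve as follows.

/* KERNEL_FUNCTION_FULL_NAME(integrator_shade_surface)
 *   -> KERNEL_NAME_EVAL(cpu_avx2, integrator_shade_surface)
 *   -> KERNEL_NAME_JOIN(kernel, cpu_avx2, integrator_shade_surface)
 *   -> kernel_cpu_avx2_integrator_shade_surface
 * Each "#define KERNEL_ARCH ..." / "#include .../kernel_arch.h" pair therefore
 * declares a complete set of kernel entry points with its own prefix. */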
diff --git a/intern/cycles/kernel/device/cpu/kernel_arch.h b/intern/cycles/kernel/device/cpu/kernel_arch.h
new file mode 100644
index 00000000000..81f328c710b
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel_arch.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Templated common declaration part of all CPU kernels. */
+
+/* --------------------------------------------------------------------
+ * Integrator.
+ */
+
+#define KERNEL_INTEGRATOR_FUNCTION(name) \
+ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \
+ IntegratorStateCPU *state)
+
+#define KERNEL_INTEGRATOR_SHADE_FUNCTION(name) \
+ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \
+ IntegratorStateCPU *state, \
+ ccl_global float *render_buffer)
+
+#define KERNEL_INTEGRATOR_INIT_FUNCTION(name) \
+ bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \
+ IntegratorStateCPU *state, \
+ KernelWorkTile *tile, \
+ ccl_global float *render_buffer)
+
+KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_camera);
+KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_bake);
+KERNEL_INTEGRATOR_FUNCTION(intersect_closest);
+KERNEL_INTEGRATOR_FUNCTION(intersect_shadow);
+KERNEL_INTEGRATOR_FUNCTION(intersect_subsurface);
+KERNEL_INTEGRATOR_FUNCTION(intersect_volume_stack);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_background);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_light);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_shadow);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_surface);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_volume);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(megakernel);
+
+#undef KERNEL_INTEGRATOR_FUNCTION
+#undef KERNEL_INTEGRATOR_INIT_FUNCTION
+#undef KERNEL_INTEGRATOR_SHADE_FUNCTION
+
+/* --------------------------------------------------------------------
+ * Shader evaluation.
+ */
+
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobals *kg,
+ const KernelShaderEvalInput *input,
+ float4 *output,
+ const int offset);
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg,
+ const KernelShaderEvalInput *input,
+ float4 *output,
+ const int offset);
+
+/* --------------------------------------------------------------------
+ * Adaptive sampling.
+ */
+
+bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)(
+ const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int y,
+ float threshold,
+ bool reset,
+ int offset,
+ int stride);
+
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int y,
+ int start_x,
+ int width,
+ int offset,
+ int stride);
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int start_y,
+ int height,
+ int offset,
+ int stride);
+
+/* --------------------------------------------------------------------
+ * Cryptomatte.
+ */
+
+void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int pixel_index);
+
+/* --------------------------------------------------------------------
+ * Bake.
+ */
+/* TODO(sergey): Needs to be re-implemented. Or not? Brecht did it already :) */
+
+void KERNEL_FUNCTION_FULL_NAME(bake)(
+ const KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride);
+
+#undef KERNEL_ARCH
diff --git a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
new file mode 100644
index 00000000000..1432abfd330
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
@@ -0,0 +1,235 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Templated common implementation part of all CPU kernels.
+ *
+ * The idea is that each architecture-specific .cpp file sets the needed optimization flags and
+ * simply includes this file, without having to copy the actual implementation over.
+ */
+
+#pragma once
+
+// clang-format off
+#include "kernel/device/cpu/compat.h"
+
+#ifndef KERNEL_STUB
+# include "kernel/device/cpu/globals.h"
+# include "kernel/device/cpu/image.h"
+
+# include "kernel/integrator/integrator_state.h"
+# include "kernel/integrator/integrator_state_flow.h"
+# include "kernel/integrator/integrator_state_util.h"
+
+# include "kernel/integrator/integrator_init_from_camera.h"
+# include "kernel/integrator/integrator_init_from_bake.h"
+# include "kernel/integrator/integrator_intersect_closest.h"
+# include "kernel/integrator/integrator_intersect_shadow.h"
+# include "kernel/integrator/integrator_intersect_subsurface.h"
+# include "kernel/integrator/integrator_intersect_volume_stack.h"
+# include "kernel/integrator/integrator_shade_background.h"
+# include "kernel/integrator/integrator_shade_light.h"
+# include "kernel/integrator/integrator_shade_shadow.h"
+# include "kernel/integrator/integrator_shade_surface.h"
+# include "kernel/integrator/integrator_shade_volume.h"
+# include "kernel/integrator/integrator_megakernel.h"
+
+# include "kernel/kernel_film.h"
+# include "kernel/kernel_adaptive_sampling.h"
+# include "kernel/kernel_bake.h"
+# include "kernel/kernel_id_passes.h"
+
+#else
+# define STUB_ASSERT(arch, name) \
+ assert(!(#name " kernel stub for architecture " #arch " was called!"))
+#endif /* KERNEL_STUB */
+// clang-format on
+
+CCL_NAMESPACE_BEGIN
+
+/* --------------------------------------------------------------------
+ * Integrator.
+ */
+
+#ifdef KERNEL_STUB
+# define KERNEL_INVOKE(name, ...) (STUB_ASSERT(KERNEL_ARCH, name), 0)
+#else
+# define KERNEL_INVOKE(name, ...) integrator_##name(__VA_ARGS__)
+#endif
+
+#define DEFINE_INTEGRATOR_KERNEL(name) \
+ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *kg, \
+ IntegratorStateCPU *state) \
+ { \
+ KERNEL_INVOKE(name, kg, state); \
+ }
+
+#define DEFINE_INTEGRATOR_SHADE_KERNEL(name) \
+ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)( \
+ const KernelGlobals *kg, IntegratorStateCPU *state, ccl_global float *render_buffer) \
+ { \
+ KERNEL_INVOKE(name, kg, state, render_buffer); \
+ }
+
+/* TODO: Either use something like get_work_pixel(), or simplify the tile that is passed here, so
+ * that it does not contain unused fields. */
+#define DEFINE_INTEGRATOR_INIT_KERNEL(name) \
+ bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *kg, \
+ IntegratorStateCPU *state, \
+ KernelWorkTile *tile, \
+ ccl_global float *render_buffer) \
+ { \
+ return KERNEL_INVOKE( \
+ name, kg, state, tile, render_buffer, tile->x, tile->y, tile->start_sample); \
+ }
+
+DEFINE_INTEGRATOR_INIT_KERNEL(init_from_camera)
+DEFINE_INTEGRATOR_INIT_KERNEL(init_from_bake)
+DEFINE_INTEGRATOR_KERNEL(intersect_closest)
+DEFINE_INTEGRATOR_KERNEL(intersect_shadow)
+DEFINE_INTEGRATOR_KERNEL(intersect_subsurface)
+DEFINE_INTEGRATOR_KERNEL(intersect_volume_stack)
+DEFINE_INTEGRATOR_SHADE_KERNEL(shade_background)
+DEFINE_INTEGRATOR_SHADE_KERNEL(shade_light)
+DEFINE_INTEGRATOR_SHADE_KERNEL(shade_shadow)
+DEFINE_INTEGRATOR_SHADE_KERNEL(shade_surface)
+DEFINE_INTEGRATOR_SHADE_KERNEL(shade_volume)
+DEFINE_INTEGRATOR_SHADE_KERNEL(megakernel)
+
+/* --------------------------------------------------------------------
+ * Shader evaluation.
+ */
+
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg,
+ const KernelShaderEvalInput *input,
+ float4 *output,
+ const int offset)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, shader_eval_displace);
+#else
+ kernel_displace_evaluate(kg, input, output, offset);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobals *kg,
+ const KernelShaderEvalInput *input,
+ float4 *output,
+ const int offset)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, shader_eval_background);
+#else
+ kernel_background_evaluate(kg, input, output, offset);
+#endif
+}
+
+/* --------------------------------------------------------------------
+ * Adaptive sampling.
+ */
+
+bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)(
+ const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int y,
+ float threshold,
+ bool reset,
+ int offset,
+ int stride)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_convergence_check);
+ return false;
+#else
+ return kernel_adaptive_sampling_convergence_check(
+ kg, render_buffer, x, y, threshold, reset, offset, stride);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int y,
+ int start_x,
+ int width,
+ int offset,
+ int stride)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_filter_x);
+#else
+ kernel_adaptive_sampling_filter_x(kg, render_buffer, y, start_x, width, offset, stride);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int start_y,
+ int height,
+ int offset,
+ int stride)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_filter_y);
+#else
+ kernel_adaptive_sampling_filter_y(kg, render_buffer, x, start_y, height, offset, stride);
+#endif
+}
+
+/* --------------------------------------------------------------------
+ * Cryptomatte.
+ */
+
+void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int pixel_index)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, cryptomatte_postprocess);
+#else
+ kernel_cryptomatte_post(kg, render_buffer, pixel_index);
+#endif
+}
+
+/* --------------------------------------------------------------------
+ * Bake.
+ */
+/* TODO(sergey): Needs to be re-implemented. Or not? Brecht did it already :) */
+
+void KERNEL_FUNCTION_FULL_NAME(bake)(
+ const KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride)
+{
+#if 0
+# ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, bake);
+# else
+# ifdef __BAKING__
+ kernel_bake_evaluate(kg, buffer, sample, x, y, offset, stride);
+# endif
+# endif /* KERNEL_STUB */
+#endif
+}
+
+#undef KERNEL_INVOKE
+#undef DEFINE_INTEGRATOR_KERNEL
+#undef DEFINE_INTEGRATOR_SHADE_KERNEL
+#undef DEFINE_INTEGRATOR_INIT_KERNEL
+
+#undef KERNEL_STUB
+#undef STUB_ASSERT
+#undef KERNEL_ARCH
+
+CCL_NAMESPACE_END
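Editorial aside (not part of the patch): a minimal sketch of how the stub path above is meant to be used, assuming the build defines KERNEL_STUB when the compiler cannot target a given instruction set (the flag handling here is illustrative, not the exact build logic).

/* Hypothetical translation unit for an architecture the compiler cannot build. */
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
#  define KERNEL_STUB
#endif

#include "kernel/device/cpu/kernel.h"
#define KERNEL_ARCH cpu_avx2
#include "kernel/device/cpu/kernel_arch_impl.h"

/* With KERNEL_STUB defined, kernel_cpu_avx2_integrator_shade_surface() and the
 * other entry points are still emitted, but each simply fires STUB_ASSERT()
 * instead of running kernel code. */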
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp b/intern/cycles/kernel/device/cpu/kernel_avx.cpp
index 5f6b6800363..220768036ab 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel_avx.cpp
@@ -34,6 +34,6 @@
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
-#include "kernel/kernel.h"
+#include "kernel/device/cpu/kernel.h"
#define KERNEL_ARCH cpu_avx
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp b/intern/cycles/kernel/device/cpu/kernel_avx2.cpp
index 97e8fc25140..90c05113cbe 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel_avx2.cpp
@@ -35,6 +35,6 @@
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
-#include "kernel/kernel.h"
+#include "kernel/device/cpu/kernel.h"
#define KERNEL_ARCH cpu_avx2
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp b/intern/cycles/kernel/device/cpu/kernel_sse2.cpp
index 26d7fd4de48..fb85ef5b0d0 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel_sse2.cpp
@@ -29,6 +29,6 @@
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
-#include "kernel/kernel.h"
+#include "kernel/device/cpu/kernel.h"
#define KERNEL_ARCH cpu_sse2
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp b/intern/cycles/kernel/device/cpu/kernel_sse3.cpp
index 3f259aa4480..87baf04258a 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel_sse3.cpp
@@ -31,6 +31,6 @@
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
-#include "kernel/kernel.h"
+#include "kernel/device/cpu/kernel.h"
#define KERNEL_ARCH cpu_sse3
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp b/intern/cycles/kernel/device/cpu/kernel_sse41.cpp
index 68bae8c07c6..bb421d58815 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel_sse41.cpp
@@ -32,6 +32,6 @@
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
-#include "kernel/kernel.h"
+#include "kernel/device/cpu/kernel.h"
#define KERNEL_ARCH cpu_sse41
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/device/cuda/compat.h
index ea3b78b7cef..3c85a8e7bd2 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/device/cuda/compat.h
@@ -14,20 +14,15 @@
* limitations under the License.
*/
-#ifndef __KERNEL_COMPAT_CUDA_H__
-#define __KERNEL_COMPAT_CUDA_H__
+#pragma once
#define __KERNEL_GPU__
#define __KERNEL_CUDA__
#define CCL_NAMESPACE_BEGIN
#define CCL_NAMESPACE_END
-/* Selective nodes compilation. */
-#ifndef __NODES_MAX_GROUP__
-# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
-#endif
-#ifndef __NODES_FEATURES__
-# define __NODES_FEATURES__ NODE_FEATURE_ALL
+#ifndef ATTR_FALLTHROUGH
+# define ATTR_FALLTHROUGH
#endif
/* Manual definitions so we can compile without CUDA toolkit. */
@@ -38,8 +33,6 @@ typedef unsigned long long uint64_t;
#else
# include <stdint.h>
#endif
-typedef unsigned short half;
-typedef unsigned long long CUtexObject;
#ifdef CYCLES_CUBIN_CC
# define FLT_MIN 1.175494350822287507969e-38f
@@ -47,14 +40,7 @@ typedef unsigned long long CUtexObject;
# define FLT_EPSILON 1.192092896e-07F
#endif
-__device__ half __float2half(const float f)
-{
- half val;
- asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
- return val;
-}
-
-/* Qualifier wrappers for different names on different devices */
+/* Qualifiers */
#define ccl_device __device__ __inline__
#if __CUDA_ARCH__ < 500
@@ -68,104 +54,61 @@ __device__ half __float2half(const float f)
#define ccl_device_noinline_cpu ccl_device
#define ccl_global
#define ccl_static_constant __constant__
+#define ccl_device_constant __constant__ __device__
#define ccl_constant const
-#define ccl_local __shared__
-#define ccl_local_param
+#define ccl_gpu_shared __shared__
#define ccl_private
#define ccl_may_alias
#define ccl_addr_space
#define ccl_restrict __restrict__
#define ccl_loop_no_unroll
-/* TODO(sergey): In theory we might use references with CUDA, however
- * performance impact yet to be investigated.
- */
-#define ccl_ref
#define ccl_align(n) __align__(n)
#define ccl_optional_struct_init
-#define ATTR_FALLTHROUGH
-
-#define CCL_MAX_LOCAL_SIZE (CUDA_THREADS_BLOCK_WIDTH * CUDA_THREADS_BLOCK_WIDTH)
-
/* No assert supported for CUDA */
#define kernel_assert(cond)
-/* Types */
+/* GPU thread, block, grid size and index */
-#include "util/util_half.h"
-#include "util/util_types.h"
+#define ccl_gpu_thread_idx_x (threadIdx.x)
+#define ccl_gpu_block_dim_x (blockDim.x)
+#define ccl_gpu_block_idx_x (blockIdx.x)
+#define ccl_gpu_grid_dim_x (gridDim.x)
+#define ccl_gpu_warp_size (warpSize)
-/* Work item functions */
+#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
+#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
-ccl_device_inline uint ccl_local_id(uint d)
-{
- switch (d) {
- case 0:
- return threadIdx.x;
- case 1:
- return threadIdx.y;
- case 2:
- return threadIdx.z;
- default:
- return 0;
- }
-}
+/* GPU warp synchronization. */
-#define ccl_global_id(d) (ccl_group_id(d) * ccl_local_size(d) + ccl_local_id(d))
+#define ccl_gpu_syncthreads() __syncthreads()
+#define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
+#define ccl_gpu_shfl_down_sync(mask, var, delta) __shfl_down_sync(mask, var, delta)
+#define ccl_gpu_popc(x) __popc(x)
-ccl_device_inline uint ccl_local_size(uint d)
-{
- switch (d) {
- case 0:
- return blockDim.x;
- case 1:
- return blockDim.y;
- case 2:
- return blockDim.z;
- default:
- return 0;
- }
-}
+/* GPU texture objects */
-#define ccl_global_size(d) (ccl_num_groups(d) * ccl_local_size(d))
+typedef unsigned long long CUtexObject;
+typedef CUtexObject ccl_gpu_tex_object;
-ccl_device_inline uint ccl_group_id(uint d)
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_2D(const ccl_gpu_tex_object texobj,
+ const float x,
+ const float y)
{
- switch (d) {
- case 0:
- return blockIdx.x;
- case 1:
- return blockIdx.y;
- case 2:
- return blockIdx.z;
- default:
- return 0;
- }
+ return tex2D<T>(texobj, x, y);
}
-ccl_device_inline uint ccl_num_groups(uint d)
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object texobj,
+ const float x,
+ const float y,
+ const float z)
{
- switch (d) {
- case 0:
- return gridDim.x;
- case 1:
- return gridDim.y;
- case 2:
- return gridDim.z;
- default:
- return 0;
- }
+ return tex3D<T>(texobj, x, y, z);
}
-/* Textures */
-
-/* Use arrays for regular data. */
-#define kernel_tex_fetch(t, index) t[(index)]
-#define kernel_tex_array(t) (t)
-
-#define kernel_data __data
-
/* Use fast math functions */
#define cosf(x) __cosf(((float)(x)))
@@ -175,4 +118,18 @@ ccl_device_inline uint ccl_num_groups(uint d)
#define logf(x) __logf(((float)(x)))
#define expf(x) __expf(((float)(x)))
-#endif /* __KERNEL_COMPAT_CUDA_H__ */
+/* Half */
+
+typedef unsigned short half;
+
+__device__ half __float2half(const float f)
+{
+ half val;
+ asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
+ return val;
+}
+
+/* Types */
+
+#include "util/util_half.h"
+#include "util/util_types.h"
diff --git a/intern/cycles/kernel/device/cuda/config.h b/intern/cycles/kernel/device/cuda/config.h
new file mode 100644
index 00000000000..46196dcdb51
--- /dev/null
+++ b/intern/cycles/kernel/device/cuda/config.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Device data taken from CUDA occupancy calculator.
+ *
+ * Terminology
+ * - CUDA GPUs have multiple streaming multiprocessors
+ * - Each multiprocessor executes multiple thread blocks
+ * - Each thread block contains a number of threads, also known as the block size
+ * - Multiprocessors have a fixed number of registers, and the number of registers
+ *   used by each thread limits the number of threads per block.
+ */
+
+/* 3.0 and 3.5 */
+#if __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
+# define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
+# define GPU_MULTIPROCESSOR_MAX_BLOCKS 16
+# define GPU_BLOCK_MAX_THREADS 1024
+# define GPU_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+# define GPU_KERNEL_BLOCK_NUM_THREADS 256
+# define GPU_KERNEL_MAX_REGISTERS 63
+
+/* 3.2 */
+#elif __CUDA_ARCH__ == 320
+# define GPU_MULTIPRESSOR_MAX_REGISTERS 32768
+# define GPU_MULTIPROCESSOR_MAX_BLOCKS 16
+# define GPU_BLOCK_MAX_THREADS 1024
+# define GPU_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+# define GPU_KERNEL_BLOCK_NUM_THREADS 256
+# define GPU_KERNEL_MAX_REGISTERS 63
+
+/* 3.7 */
+#elif __CUDA_ARCH__ == 370
+# define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
+# define GPU_MULTIPROCESSOR_MAX_BLOCKS 16
+# define GPU_BLOCK_MAX_THREADS 1024
+# define GPU_THREAD_MAX_REGISTERS 255
+
+/* tunable parameters */
+# define GPU_KERNEL_BLOCK_NUM_THREADS 256
+# define GPU_KERNEL_MAX_REGISTERS 63
+
+/* 5.x, 6.x */
+#elif __CUDA_ARCH__ <= 699
+# define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
+# define GPU_MULTIPROCESSOR_MAX_BLOCKS 32
+# define GPU_BLOCK_MAX_THREADS 1024
+# define GPU_THREAD_MAX_REGISTERS 255
+
+/* tunable parameters */
+# define GPU_KERNEL_BLOCK_NUM_THREADS 256
+/* CUDA 9.0 seems to cause slowdowns on high-end Pascal cards unless we increase the number of
+ * registers */
+# if __CUDACC_VER_MAJOR__ >= 9 && __CUDA_ARCH__ >= 600
+# define GPU_KERNEL_MAX_REGISTERS 64
+# else
+# define GPU_KERNEL_MAX_REGISTERS 48
+# endif
+
+/* 7.x, 8.x */
+#elif __CUDA_ARCH__ <= 899
+# define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
+# define GPU_MULTIPROCESSOR_MAX_BLOCKS 32
+# define GPU_BLOCK_MAX_THREADS 1024
+# define GPU_THREAD_MAX_REGISTERS 255
+
+/* tunable parameters */
+# define GPU_KERNEL_BLOCK_NUM_THREADS 512
+# define GPU_KERNEL_MAX_REGISTERS 96
+
+/* unknown architecture */
+#else
+# error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
+#endif
+
+/* Compute number of threads per block and minimum blocks per multiprocessor
+ * given the maximum number of registers per thread. */
+
+#define ccl_gpu_kernel(block_num_threads, thread_num_registers) \
+ extern "C" __global__ void __launch_bounds__(block_num_threads, \
+ GPU_MULTIPRESSOR_MAX_REGISTERS / \
+ (block_num_threads * thread_num_registers))
+
+/* sanity checks */
+
+#if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS
+# error "Maximum number of threads per block exceeded"
+#endif
+
+#if GPU_MULTIPRESSOR_MAX_REGISTERS / (GPU_KERNEL_BLOCK_NUM_THREADS * GPU_KERNEL_MAX_REGISTERS) > \
+ GPU_MULTIPROCESSOR_MAX_BLOCKS
+# error "Maximum number of blocks per multiprocessor exceeded"
+#endif
+
+#if GPU_KERNEL_MAX_REGISTERS > GPU_THREAD_MAX_REGISTERS
+# error "Maximum number of registers per thread exceeded"
+#endif
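Editorial aside (not part of the patch): a worked example of the launch-bounds arithmetic above. For a 7.x/8.x device the header selects 512 threads per block and a 96-register budget per thread, while a 6.x device built with CUDA 9 or newer gets 256 threads and 64 registers.

/* ccl_gpu_kernel(512, 96) expands to
 *   extern "C" __global__ void __launch_bounds__(512, 65536 / (512 * 96))
 * and 65536 / (512 * 96) == 65536 / 49152 == 1 in integer arithmetic, i.e. the
 * compiler must keep at least one block resident per multiprocessor at that
 * register budget. ccl_gpu_kernel(256, 64) on a 6.x device gives
 * 65536 / 16384 == 4 resident blocks. */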
diff --git a/intern/cycles/kernel/device/cuda/globals.h b/intern/cycles/kernel/device/cuda/globals.h
new file mode 100644
index 00000000000..169047175f5
--- /dev/null
+++ b/intern/cycles/kernel/device/cuda/globals.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Constant Globals */
+
+#pragma once
+
+#include "kernel/kernel_profiling.h"
+#include "kernel/kernel_types.h"
+
+#include "kernel/integrator/integrator_state.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Not actually used, just a NULL pointer that gets passed everywhere, which we
+ * hope gets optimized out by the compiler. */
+struct KernelGlobals {
+ int unused[1];
+};
+
+/* Global scene data and textures */
+__constant__ KernelData __data;
+#define KERNEL_TEX(type, name) const __constant__ __device__ type *name;
+#include "kernel/kernel_textures.h"
+
+/* Integrator state */
+__constant__ IntegratorStateGPU __integrator_state;
+
+/* Abstraction macros */
+#define kernel_data __data
+#define kernel_tex_fetch(t, index) t[(index)]
+#define kernel_tex_array(t) (t)
+#define kernel_integrator_state __integrator_state
+
+CCL_NAMESPACE_END
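Editorial aside (not part of the patch): compared with the CPU globals header earlier in this patch, the same kernel source compiles to very different accesses here. Using the same made-up "__example_table" placeholder as before:

/* On the CPU:
 *   kernel_tex_fetch(__example_table, i)  ->  kg->__example_table.fetch(i)
 * On CUDA, with the macros above:
 *   kernel_tex_fetch(__example_table, i)  ->  __example_table[i]
 * an indexed load through a __constant__ device pointer, while kernel_data
 * reads fields of the __constant__ __data struct; the KernelGlobals pointer is
 * never dereferenced, which is why it can stay an unused placeholder. */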
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl b/intern/cycles/kernel/device/cuda/kernel.cu
index 84938b889e5..e26fe243642 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl
+++ b/intern/cycles/kernel/device/cuda/kernel.cu
@@ -1,5 +1,5 @@
/*
- * Copyright 2011-2017 Blender Foundation
+ * Copyright 2011-2013 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,11 +14,15 @@
* limitations under the License.
*/
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_indirect_subsurface.h"
+/* CUDA kernel entry points */
-#define KERNEL_NAME indirect_subsurface
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
+#ifdef __CUDA_ARCH__
+# include "kernel/device/cuda/compat.h"
+# include "kernel/device/cuda/config.h"
+# include "kernel/device/cuda/globals.h"
+
+# include "kernel/device/gpu/image.h"
+# include "kernel/device/gpu/kernel.h"
+
+#endif
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h b/intern/cycles/kernel/device/gpu/image.h
index 132653fa7ca..b015c78a8f5 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h
+++ b/intern/cycles/kernel/device/gpu/image.h
@@ -14,6 +14,10 @@
* limitations under the License.
*/
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
#ifdef WITH_NANOVDB
# define NDEBUG /* Disable "assert" in device code */
# define NANOVDB_USE_INTRINSICS
@@ -61,9 +65,9 @@ ccl_device float cubic_h1(float a)
/* Fast bicubic texture lookup using 4 bilinear lookups, adapted from CUDA samples. */
template<typename T>
-ccl_device T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, float y)
+ccl_device_noinline T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, float y)
{
- CUtexObject tex = (CUtexObject)info.data;
+ ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
x = (x * info.width) - 0.5f;
y = (y * info.height) - 0.5f;
@@ -81,15 +85,18 @@ ccl_device T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, f
float y0 = (py + cubic_h0(fy) + 0.5f) / info.height;
float y1 = (py + cubic_h1(fy) + 0.5f) / info.height;
- return cubic_g0(fy) * (g0x * tex2D<T>(tex, x0, y0) + g1x * tex2D<T>(tex, x1, y0)) +
- cubic_g1(fy) * (g0x * tex2D<T>(tex, x0, y1) + g1x * tex2D<T>(tex, x1, y1));
+ return cubic_g0(fy) * (g0x * ccl_gpu_tex_object_read_2D<T>(tex, x0, y0) +
+ g1x * ccl_gpu_tex_object_read_2D<T>(tex, x1, y0)) +
+ cubic_g1(fy) * (g0x * ccl_gpu_tex_object_read_2D<T>(tex, x0, y1) +
+ g1x * ccl_gpu_tex_object_read_2D<T>(tex, x1, y1));
}
/* Fast tricubic texture lookup using 8 trilinear lookups. */
template<typename T>
-ccl_device T kernel_tex_image_interp_tricubic(const TextureInfo &info, float x, float y, float z)
+ccl_device_noinline T
+kernel_tex_image_interp_tricubic(const TextureInfo &info, float x, float y, float z)
{
- CUtexObject tex = (CUtexObject)info.data;
+ ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
x = (x * info.width) - 0.5f;
y = (y * info.height) - 0.5f;
@@ -117,10 +124,14 @@ ccl_device T kernel_tex_image_interp_tricubic(const TextureInfo &info, float x,
float z0 = (pz + cubic_h0(fz) + 0.5f) / info.depth;
float z1 = (pz + cubic_h1(fz) + 0.5f) / info.depth;
- return g0z * (g0y * (g0x * tex3D<T>(tex, x0, y0, z0) + g1x * tex3D<T>(tex, x1, y0, z0)) +
- g1y * (g0x * tex3D<T>(tex, x0, y1, z0) + g1x * tex3D<T>(tex, x1, y1, z0))) +
- g1z * (g0y * (g0x * tex3D<T>(tex, x0, y0, z1) + g1x * tex3D<T>(tex, x1, y0, z1)) +
- g1y * (g0x * tex3D<T>(tex, x0, y1, z1) + g1x * tex3D<T>(tex, x1, y1, z1)));
+ return g0z * (g0y * (g0x * ccl_gpu_tex_object_read_3D<T>(tex, x0, y0, z0) +
+ g1x * ccl_gpu_tex_object_read_3D<T>(tex, x1, y0, z0)) +
+ g1y * (g0x * ccl_gpu_tex_object_read_3D<T>(tex, x0, y1, z0) +
+ g1x * ccl_gpu_tex_object_read_3D<T>(tex, x1, y1, z0))) +
+ g1z * (g0y * (g0x * ccl_gpu_tex_object_read_3D<T>(tex, x0, y0, z1) +
+ g1x * ccl_gpu_tex_object_read_3D<T>(tex, x1, y0, z1)) +
+ g1y * (g0x * ccl_gpu_tex_object_read_3D<T>(tex, x0, y1, z1) +
+ g1x * ccl_gpu_tex_object_read_3D<T>(tex, x1, y1, z1)));
}
#ifdef WITH_NANOVDB
@@ -157,7 +168,7 @@ ccl_device T kernel_tex_image_interp_tricubic_nanovdb(S &s, float x, float y, fl
}
template<typename T>
-ccl_device_inline T kernel_tex_image_interp_nanovdb(
+ccl_device_noinline T kernel_tex_image_interp_nanovdb(
const TextureInfo &info, float x, float y, float z, uint interpolation)
{
using namespace nanovdb;
@@ -178,7 +189,7 @@ ccl_device_inline T kernel_tex_image_interp_nanovdb(
}
#endif
-ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y)
+ccl_device float4 kernel_tex_image_interp(const KernelGlobals *kg, int id, float x, float y)
{
const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
@@ -190,8 +201,8 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl
return kernel_tex_image_interp_bicubic<float4>(info, x, y);
}
else {
- CUtexObject tex = (CUtexObject)info.data;
- return tex2D<float4>(tex, x, y);
+ ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
+ return ccl_gpu_tex_object_read_2D<float4>(tex, x, y);
}
}
/* float, byte and half */
@@ -202,15 +213,15 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl
f = kernel_tex_image_interp_bicubic<float>(info, x, y);
}
else {
- CUtexObject tex = (CUtexObject)info.data;
- f = tex2D<float>(tex, x, y);
+ ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
+ f = ccl_gpu_tex_object_read_2D<float>(tex, x, y);
}
return make_float4(f, f, f, 1.0f);
}
}
-ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg,
+ccl_device float4 kernel_tex_image_interp_3d(const KernelGlobals *kg,
int id,
float3 P,
InterpolationType interp)
@@ -245,8 +256,8 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg,
return kernel_tex_image_interp_tricubic<float4>(info, x, y, z);
}
else {
- CUtexObject tex = (CUtexObject)info.data;
- return tex3D<float4>(tex, x, y, z);
+ ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
+ return ccl_gpu_tex_object_read_3D<float4>(tex, x, y, z);
}
}
else {
@@ -256,10 +267,12 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg,
f = kernel_tex_image_interp_tricubic<float>(info, x, y, z);
}
else {
- CUtexObject tex = (CUtexObject)info.data;
- f = tex3D<float>(tex, x, y, z);
+ ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
+ f = ccl_gpu_tex_object_read_3D<float>(tex, x, y, z);
}
return make_float4(f, f, f, 1.0f);
}
}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
new file mode 100644
index 00000000000..7b79c0aedfa
--- /dev/null
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -0,0 +1,843 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Common GPU kernels. */
+
+#include "kernel/device/gpu/parallel_active_index.h"
+#include "kernel/device/gpu/parallel_prefix_sum.h"
+#include "kernel/device/gpu/parallel_sorted_index.h"
+
+#include "kernel/integrator/integrator_state.h"
+#include "kernel/integrator/integrator_state_flow.h"
+#include "kernel/integrator/integrator_state_util.h"
+
+#include "kernel/integrator/integrator_init_from_bake.h"
+#include "kernel/integrator/integrator_init_from_camera.h"
+#include "kernel/integrator/integrator_intersect_closest.h"
+#include "kernel/integrator/integrator_intersect_shadow.h"
+#include "kernel/integrator/integrator_intersect_subsurface.h"
+#include "kernel/integrator/integrator_intersect_volume_stack.h"
+#include "kernel/integrator/integrator_shade_background.h"
+#include "kernel/integrator/integrator_shade_light.h"
+#include "kernel/integrator/integrator_shade_shadow.h"
+#include "kernel/integrator/integrator_shade_surface.h"
+#include "kernel/integrator/integrator_shade_volume.h"
+
+#include "kernel/kernel_adaptive_sampling.h"
+#include "kernel/kernel_bake.h"
+#include "kernel/kernel_film.h"
+#include "kernel/kernel_work_stealing.h"
+
+/* --------------------------------------------------------------------
+ * Integrator.
+ */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_reset(int num_states)
+{
+ const int state = ccl_gpu_global_id_x();
+
+ if (state < num_states) {
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0;
+ INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0;
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_init_from_camera(KernelWorkTile *tiles,
+ const int num_tiles,
+ float *render_buffer,
+ const int max_tile_work_size)
+{
+ const int work_index = ccl_gpu_global_id_x();
+
+ if (work_index >= max_tile_work_size * num_tiles) {
+ return;
+ }
+
+ const int tile_index = work_index / max_tile_work_size;
+ const int tile_work_index = work_index - tile_index * max_tile_work_size;
+
+ const KernelWorkTile *tile = &tiles[tile_index];
+
+ if (tile_work_index >= tile->work_size) {
+ return;
+ }
+
+ const int state = tile->path_index_offset + tile_work_index;
+
+ uint x, y, sample;
+ get_work_pixel(tile, tile_work_index, &x, &y, &sample);
+
+ integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample);
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_init_from_bake(KernelWorkTile *tiles,
+ const int num_tiles,
+ float *render_buffer,
+ const int max_tile_work_size)
+{
+ const int work_index = ccl_gpu_global_id_x();
+
+ if (work_index >= max_tile_work_size * num_tiles) {
+ return;
+ }
+
+ const int tile_index = work_index / max_tile_work_size;
+ const int tile_work_index = work_index - tile_index * max_tile_work_size;
+
+ const KernelWorkTile *tile = &tiles[tile_index];
+
+ if (tile_work_index >= tile->work_size) {
+ return;
+ }
+
+ const int state = tile->path_index_offset + tile_work_index;
+
+ uint x, y, sample;
+ get_work_pixel(tile, tile_work_index, &x, &y, &sample);
+
+ integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample);
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_intersect_closest(const int *path_index_array, const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_intersect_closest(NULL, state);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_intersect_shadow(const int *path_index_array, const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_intersect_shadow(NULL, state);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_intersect_subsurface(const int *path_index_array, const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_intersect_subsurface(NULL, state);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_intersect_volume_stack(const int *path_index_array, const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_intersect_volume_stack(NULL, state);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shade_background(const int *path_index_array,
+ float *render_buffer,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_shade_background(NULL, state, render_buffer);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shade_light(const int *path_index_array,
+ float *render_buffer,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_shade_light(NULL, state, render_buffer);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shade_shadow(const int *path_index_array,
+ float *render_buffer,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_shade_shadow(NULL, state, render_buffer);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shade_surface(const int *path_index_array,
+ float *render_buffer,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_shade_surface(NULL, state, render_buffer);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shade_surface_raytrace(const int *path_index_array,
+ float *render_buffer,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_shade_surface_raytrace(NULL, state, render_buffer);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shade_volume(const int *path_index_array,
+ float *render_buffer,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_shade_volume(NULL, state, render_buffer);
+ }
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_queued_paths_array(int num_states,
+ int *indices,
+ int *num_indices,
+ int kernel)
+{
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices, num_indices, [kernel](const int state) {
+ return (INTEGRATOR_STATE(path, queued_kernel) == kernel);
+ });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_queued_shadow_paths_array(int num_states,
+ int *indices,
+ int *num_indices,
+ int kernel)
+{
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices, num_indices, [kernel](const int state) {
+ return (INTEGRATOR_STATE(shadow_path, queued_kernel) == kernel);
+ });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_active_paths_array(int num_states, int *indices, int *num_indices)
+{
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices, num_indices, [](const int state) {
+ return (INTEGRATOR_STATE(path, queued_kernel) != 0) ||
+ (INTEGRATOR_STATE(shadow_path, queued_kernel) != 0);
+ });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_terminated_paths_array(int num_states,
+ int *indices,
+ int *num_indices,
+ int indices_offset)
+{
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices + indices_offset, num_indices, [](const int state) {
+ return (INTEGRATOR_STATE(path, queued_kernel) == 0) &&
+ (INTEGRATOR_STATE(shadow_path, queued_kernel) == 0);
+ });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_sorted_paths_array(
+ int num_states, int *indices, int *num_indices, int *key_prefix_sum, int kernel)
+{
+ gpu_parallel_sorted_index_array<GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices, num_indices, key_prefix_sum, [kernel](const int state) {
+ return (INTEGRATOR_STATE(path, queued_kernel) == kernel) ?
+ INTEGRATOR_STATE(path, shader_sort_key) :
+ GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;
+ });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_compact_paths_array(int num_states,
+ int *indices,
+ int *num_indices,
+ int num_active_paths)
+{
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices, num_indices, [num_active_paths](const int state) {
+ return (state >= num_active_paths) &&
+ ((INTEGRATOR_STATE(path, queued_kernel) != 0) ||
+ (INTEGRATOR_STATE(shadow_path, queued_kernel) != 0));
+ });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_compact_states(const int *active_terminated_states,
+ const int active_states_offset,
+ const int terminated_states_offset,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int from_state = active_terminated_states[active_states_offset + global_index];
+ const int to_state = active_terminated_states[terminated_states_offset + global_index];
+
+ integrator_state_move(to_state, from_state);
+ }
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_prefix_sum(int *values, int num_values)
+{
+ gpu_parallel_prefix_sum<GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE>(values, num_values);
+}
+
+/* --------------------------------------------------------------------
+ * Adaptive sampling.
+ */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_adaptive_sampling_convergence_check(float *render_buffer,
+ int sx,
+ int sy,
+ int sw,
+ int sh,
+ float threshold,
+ bool reset,
+ int offset,
+ int stride,
+ uint *num_active_pixels)
+{
+ const int work_index = ccl_gpu_global_id_x();
+ const int y = work_index / sw;
+ const int x = work_index - y * sw;
+
+ bool converged = true;
+
+ if (x < sw && y < sh) {
+ converged = kernel_adaptive_sampling_convergence_check(
+ nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride);
+ }
+
+ /* NOTE: All threads specified in the mask must execute the intrinsic. */
+ const uint num_active_pixels_mask = ccl_gpu_ballot(!converged);
+ const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
+ if (lane_id == 0) {
+ atomic_fetch_and_add_uint32(num_active_pixels, __popc(num_active_pixels_mask));
+ }
+}
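Editorial aside (not part of the patch): the convergence check above reduces its result per warp rather than per thread. A step-by-step reading of the reduction:

/* mask  = ccl_gpu_ballot(!converged);  -> one bit set per lane whose pixel is
 *                                          still active (not yet converged)
 * count = __popc(mask);                -> number of active lanes in this warp
 * Only lane 0 adds `count` to num_active_pixels, so one atomic is issued per
 * warp of 32 threads instead of one per pixel. */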
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_adaptive_sampling_filter_x(
+ float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride)
+{
+ const int y = ccl_gpu_global_id_x();
+
+ if (y < sh) {
+ kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_adaptive_sampling_filter_y(
+ float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride)
+{
+ const int x = ccl_gpu_global_id_x();
+
+ if (x < sw) {
+ kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride);
+ }
+}
+
+/* --------------------------------------------------------------------
+ * Cryptomatte.
+ */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_cryptomatte_postprocess(float *render_buffer, int num_pixels)
+{
+ const int pixel_index = ccl_gpu_global_id_x();
+
+ if (pixel_index < num_pixels) {
+ kernel_cryptomatte_post(nullptr, render_buffer, pixel_index);
+ }
+}
+
+/* --------------------------------------------------------------------
+ * Film.
+ */
+
+/* Common implementation for float destination. */
+template<typename Processor>
+ccl_device_inline void kernel_gpu_film_convert_common(const KernelFilmConvert *kfilm_convert,
+ float *pixels,
+ float *render_buffer,
+ int num_pixels,
+ int width,
+ int offset,
+ int stride,
+ int dst_offset,
+ int dst_stride,
+ const Processor &processor)
+{
+ const int render_pixel_index = ccl_gpu_global_id_x();
+ if (render_pixel_index >= num_pixels) {
+ return;
+ }
+
+ const uint64_t render_buffer_offset = (uint64_t)render_pixel_index * kfilm_convert->pass_stride;
+ ccl_global const float *buffer = render_buffer + render_buffer_offset;
+ ccl_global float *pixel = pixels +
+ (render_pixel_index + dst_offset) * kfilm_convert->pixel_stride;
+
+ processor(kfilm_convert, buffer, pixel);
+}
+
+/* Common implementation for half4 destination and 4-channel input pass. */
+template<typename Processor>
+ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgba(
+ const KernelFilmConvert *kfilm_convert,
+ uchar4 *rgba,
+ float *render_buffer,
+ int num_pixels,
+ int width,
+ int offset,
+ int stride,
+ int rgba_offset,
+ int rgba_stride,
+ const Processor &processor)
+{
+ const int render_pixel_index = ccl_gpu_global_id_x();
+ if (render_pixel_index >= num_pixels) {
+ return;
+ }
+
+ const uint64_t render_buffer_offset = (uint64_t)render_pixel_index * kfilm_convert->pass_stride;
+ ccl_global const float *buffer = render_buffer + render_buffer_offset;
+
+ float pixel[4];
+ processor(kfilm_convert, buffer, pixel);
+
+ film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel);
+
+ const int x = render_pixel_index % width;
+ const int y = render_pixel_index / width;
+
+ ccl_global half4 *out = ((ccl_global half4 *)rgba) + rgba_offset + y * rgba_stride + x;
+ float4_store_half((ccl_global half *)out, make_float4(pixel[0], pixel[1], pixel[2], pixel[3]));
+}
+
+/* Common implementation for half4 destination and 3-channel input pass. */
+template<typename Processor>
+ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgb(
+ const KernelFilmConvert *kfilm_convert,
+ uchar4 *rgba,
+ float *render_buffer,
+ int num_pixels,
+ int width,
+ int offset,
+ int stride,
+ int rgba_offset,
+ int rgba_stride,
+ const Processor &processor)
+{
+ kernel_gpu_film_convert_half_rgba_common_rgba(
+ kfilm_convert,
+ rgba,
+ render_buffer,
+ num_pixels,
+ width,
+ offset,
+ stride,
+ rgba_offset,
+ rgba_stride,
+ [&processor](const KernelFilmConvert *kfilm_convert,
+ ccl_global const float *buffer,
+ float *pixel_rgba) {
+ processor(kfilm_convert, buffer, pixel_rgba);
+ pixel_rgba[3] = 1.0f;
+ });
+}
+
+/* Common implementation for half4 destination and single channel input pass. */
+template<typename Processor>
+ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_value(
+ const KernelFilmConvert *kfilm_convert,
+ uchar4 *rgba,
+ float *render_buffer,
+ int num_pixels,
+ int width,
+ int offset,
+ int stride,
+ int rgba_offset,
+ int rgba_stride,
+ const Processor &processor)
+{
+ kernel_gpu_film_convert_half_rgba_common_rgba(
+ kfilm_convert,
+ rgba,
+ render_buffer,
+ num_pixels,
+ width,
+ offset,
+ stride,
+ rgba_offset,
+ rgba_stride,
+ [&processor](const KernelFilmConvert *kfilm_convert,
+ ccl_global const float *buffer,
+ float *pixel_rgba) {
+ float value;
+ processor(kfilm_convert, buffer, &value);
+
+ pixel_rgba[0] = value;
+ pixel_rgba[1] = value;
+ pixel_rgba[2] = value;
+ pixel_rgba[3] = 1.0f;
+ });
+}
+
+#define KERNEL_FILM_CONVERT_PROC(name) \
+ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) name
+
+#define KERNEL_FILM_CONVERT_DEFINE(variant, channels) \
+ KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant) \
+ (const KernelFilmConvert kfilm_convert, \
+ float *pixels, \
+ float *render_buffer, \
+ int num_pixels, \
+ int width, \
+ int offset, \
+ int stride, \
+ int rgba_offset, \
+ int rgba_stride) \
+ { \
+ kernel_gpu_film_convert_common(&kfilm_convert, \
+ pixels, \
+ render_buffer, \
+ num_pixels, \
+ width, \
+ offset, \
+ stride, \
+ rgba_offset, \
+ rgba_stride, \
+ film_get_pass_pixel_##variant); \
+ } \
+ KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant##_half_rgba) \
+ (const KernelFilmConvert kfilm_convert, \
+ uchar4 *rgba, \
+ float *render_buffer, \
+ int num_pixels, \
+ int width, \
+ int offset, \
+ int stride, \
+ int rgba_offset, \
+ int rgba_stride) \
+ { \
+ kernel_gpu_film_convert_half_rgba_common_##channels(&kfilm_convert, \
+ rgba, \
+ render_buffer, \
+ num_pixels, \
+ width, \
+ offset, \
+ stride, \
+ rgba_offset, \
+ rgba_stride, \
+ film_get_pass_pixel_##variant); \
+ }
+
+KERNEL_FILM_CONVERT_DEFINE(depth, value)
+KERNEL_FILM_CONVERT_DEFINE(mist, value)
+KERNEL_FILM_CONVERT_DEFINE(sample_count, value)
+KERNEL_FILM_CONVERT_DEFINE(float, value)
+
+KERNEL_FILM_CONVERT_DEFINE(light_path, rgb)
+KERNEL_FILM_CONVERT_DEFINE(float3, rgb)
+
+KERNEL_FILM_CONVERT_DEFINE(motion, rgba)
+KERNEL_FILM_CONVERT_DEFINE(cryptomatte, rgba)
+KERNEL_FILM_CONVERT_DEFINE(shadow_catcher, rgba)
+KERNEL_FILM_CONVERT_DEFINE(shadow_catcher_matte_with_shadow, rgba)
+KERNEL_FILM_CONVERT_DEFINE(combined, rgba)
+KERNEL_FILM_CONVERT_DEFINE(float4, rgba)
+
+#undef KERNEL_FILM_CONVERT_DEFINE
+#undef KERNEL_FILM_CONVERT_HALF_RGBA_DEFINE
+#undef KERNEL_FILM_CONVERT_PROC
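
Each KERNEL_FILM_CONVERT_DEFINE above stamps out two kernels per pass type: one writing float pixels through kernel_gpu_film_convert_common and one writing half4 pixels through the matching kernel_gpu_film_convert_half_rgba_common_* helper. A small host-side sketch of the index math those helpers share; the struct and function names here are illustrative only.

#include <cstdint>

/* For render pixel i, the source pass starts at i * pass_stride in the render
 * buffer; the half4 destination lives at rgba_offset + y * rgba_stride + x. */
struct FilmConvertIndex {
  uint64_t buffer_offset;
  int dst_index;
};

static FilmConvertIndex film_convert_index(
    int render_pixel_index, int width, int pass_stride, int rgba_offset, int rgba_stride)
{
  const int x = render_pixel_index % width;
  const int y = render_pixel_index / width;
  return {(uint64_t)render_pixel_index * (uint64_t)pass_stride,
          rgba_offset + y * rgba_stride + x};
}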
+
+/* --------------------------------------------------------------------
+ * Shader evaluation.
+ */
+
+/* Displacement */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_shader_eval_displace(KernelShaderEvalInput *input,
+ float4 *output,
+ const int offset,
+ const int work_size)
+{
+ int i = ccl_gpu_global_id_x();
+ if (i < work_size) {
+ kernel_displace_evaluate(NULL, input, output, offset + i);
+ }
+}
+
+/* Background Shader Evaluation */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_shader_eval_background(KernelShaderEvalInput *input,
+ float4 *output,
+ const int offset,
+ const int work_size)
+{
+ int i = ccl_gpu_global_id_x();
+ if (i < work_size) {
+ kernel_background_evaluate(NULL, input, output, offset + i);
+ }
+}
+
+/* --------------------------------------------------------------------
+ * Denoising.
+ */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_filter_color_preprocess(float *render_buffer,
+ int full_x,
+ int full_y,
+ int width,
+ int height,
+ int offset,
+ int stride,
+ int pass_stride,
+ int pass_denoised)
+{
+ const int work_index = ccl_gpu_global_id_x();
+ const int y = work_index / width;
+ const int x = work_index - y * width;
+
+ if (x >= width || y >= height) {
+ return;
+ }
+
+ const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;
+ float *buffer = render_buffer + render_pixel_index * pass_stride;
+
+ float *color_out = buffer + pass_denoised;
+ color_out[0] = clamp(color_out[0], 0.0f, 10000.0f);
+ color_out[1] = clamp(color_out[1], 0.0f, 10000.0f);
+ color_out[2] = clamp(color_out[2], 0.0f, 10000.0f);
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_filter_guiding_preprocess(float *guiding_buffer,
+ int guiding_pass_stride,
+ int guiding_pass_albedo,
+ int guiding_pass_normal,
+ const float *render_buffer,
+ int render_offset,
+ int render_stride,
+ int render_pass_stride,
+ int render_pass_sample_count,
+ int render_pass_denoising_albedo,
+ int render_pass_denoising_normal,
+ int full_x,
+ int full_y,
+ int width,
+ int height,
+ int num_samples)
+{
+ const int work_index = ccl_gpu_global_id_x();
+ const int y = work_index / width;
+ const int x = work_index - y * width;
+
+ if (x >= width || y >= height) {
+ return;
+ }
+
+ const uint64_t guiding_pixel_index = x + y * width;
+ float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
+
+ const uint64_t render_pixel_index = render_offset + (x + full_x) + (y + full_y) * render_stride;
+ const float *buffer = render_buffer + render_pixel_index * render_pass_stride;
+
+ float pixel_scale;
+ if (render_pass_sample_count == PASS_UNUSED) {
+ pixel_scale = 1.0f / num_samples;
+ }
+ else {
+ pixel_scale = 1.0f / __float_as_uint(buffer[render_pass_sample_count]);
+ }
+
+ /* Albedo pass. */
+ if (guiding_pass_albedo != PASS_UNUSED) {
+ kernel_assert(render_pass_denoising_albedo != PASS_UNUSED);
+
+ const float *albedo_in = buffer + render_pass_denoising_albedo;
+ float *albedo_out = guiding_pixel + guiding_pass_albedo;
+
+ albedo_out[0] = albedo_in[0] * pixel_scale;
+ albedo_out[1] = albedo_in[1] * pixel_scale;
+ albedo_out[2] = albedo_in[2] * pixel_scale;
+ }
+
+ /* Normal pass. */
+ if (guiding_pass_normal != PASS_UNUSED) {
+ kernel_assert(render_pass_denoising_normal != PASS_UNUSED);
+
+ const float *normal_in = buffer + render_pass_denoising_normal;
+ float *normal_out = guiding_pixel + guiding_pass_normal;
+
+ normal_out[0] = normal_in[0] * pixel_scale;
+ normal_out[1] = normal_in[1] * pixel_scale;
+ normal_out[2] = normal_in[2] * pixel_scale;
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_filter_guiding_set_fake_albedo(float *guiding_buffer,
+ int guiding_pass_stride,
+ int guiding_pass_albedo,
+ int width,
+ int height)
+{
+ kernel_assert(guiding_pass_albedo != PASS_UNUSED);
+
+ const int work_index = ccl_gpu_global_id_x();
+ const int y = work_index / width;
+ const int x = work_index - y * width;
+
+ if (x >= width || y >= height) {
+ return;
+ }
+
+ const uint64_t guiding_pixel_index = x + y * width;
+ float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
+
+ float *albedo_out = guiding_pixel + guiding_pass_albedo;
+
+ albedo_out[0] = 0.5f;
+ albedo_out[1] = 0.5f;
+ albedo_out[2] = 0.5f;
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_filter_color_postprocess(float *render_buffer,
+ int full_x,
+ int full_y,
+ int width,
+ int height,
+ int offset,
+ int stride,
+ int pass_stride,
+ int num_samples,
+ int pass_noisy,
+ int pass_denoised,
+ int pass_sample_count,
+ int num_components,
+ bool use_compositing)
+{
+ const int work_index = ccl_gpu_global_id_x();
+ const int y = work_index / width;
+ const int x = work_index - y * width;
+
+ if (x >= width || y >= height) {
+ return;
+ }
+
+ const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;
+ float *buffer = render_buffer + render_pixel_index * pass_stride;
+
+ float pixel_scale;
+ if (pass_sample_count == PASS_UNUSED) {
+ pixel_scale = num_samples;
+ }
+ else {
+ pixel_scale = __float_as_uint(buffer[pass_sample_count]);
+ }
+
+ float *denoised_pixel = buffer + pass_denoised;
+
+ denoised_pixel[0] *= pixel_scale;
+ denoised_pixel[1] *= pixel_scale;
+ denoised_pixel[2] *= pixel_scale;
+
+ if (num_components == 3) {
+ /* Pass without alpha channel. */
+ }
+ else if (!use_compositing) {
+ /* Currently compositing passes are either 3-component (derived by dividing light passes)
+ * or do not have transparency (shadow catcher). Implicitly rely on this, as it
+ * simplifies the code and avoids extra memory allocation. */
+ const float *noisy_pixel = buffer + pass_noisy;
+ denoised_pixel[3] = noisy_pixel[3];
+ }
+ else {
+ /* Assign zero, which is the default alpha value for 3-component passes and an opaque
+ * pixel for 4-component passes. */
+
+ denoised_pixel[3] = 0;
+ }
+}
+
+/* --------------------------------------------------------------------
+ * Shadow catcher.
+ */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shadow_catcher_count_possible_splits(int num_states,
+ uint *num_possible_splits)
+{
+ const int state = ccl_gpu_global_id_x();
+
+ bool can_split = false;
+
+ if (state < num_states) {
+ can_split = kernel_shadow_catcher_path_can_split(nullptr, state);
+ }
+
+ /* NOTE: All threads specified in the mask must execute the intrinsic. */
+ const uint can_split_mask = ccl_gpu_ballot(can_split);
+ const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
+ if (lane_id == 0) {
+ atomic_fetch_and_add_uint32(num_possible_splits, __popc(can_split_mask));
+ }
+}
diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h
new file mode 100644
index 00000000000..85500bf4d07
--- /dev/null
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Given an array of states, build an array of indices for which the states
+ * are active.
+ *
+ * Shared memory requirement is sizeof(int) * (number_of_warps + 1) */
+
+#include "util/util_atomic.h"
+
+#define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
+
+template<uint blocksize, typename IsActiveOp>
+__device__ void gpu_parallel_active_index_array(const uint num_states,
+ int *indices,
+ int *num_indices,
+ IsActiveOp is_active_op)
+{
+ extern ccl_gpu_shared int warp_offset[];
+
+ const uint thread_index = ccl_gpu_thread_idx_x;
+ const uint thread_warp = thread_index % ccl_gpu_warp_size;
+
+ const uint warp_index = thread_index / ccl_gpu_warp_size;
+ const uint num_warps = blocksize / ccl_gpu_warp_size;
+
+ /* Test if state corresponding to this thread is active. */
+ const uint state_index = ccl_gpu_block_idx_x * blocksize + thread_index;
+ const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
+
+ /* For each thread within a warp compute how many other active states precede it. */
+ const uint thread_mask = 0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp);
+ const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) & thread_mask);
+
+ /* Last thread in warp stores number of active states for each warp. */
+ if (thread_warp == ccl_gpu_warp_size - 1) {
+ warp_offset[warp_index] = thread_offset + is_active;
+ }
+
+ ccl_gpu_syncthreads();
+
+ /* Last thread in block converts per-warp sizes to offsets, increments global size of
+ * index array and gets offset to write to. */
+ if (thread_index == blocksize - 1) {
+ /* TODO: parallelize this. */
+ int offset = 0;
+ for (int i = 0; i < num_warps; i++) {
+ int num_active = warp_offset[i];
+ warp_offset[i] = offset;
+ offset += num_active;
+ }
+
+ const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
+ warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
+ }
+
+ ccl_gpu_syncthreads();
+
+ /* Write to index array. */
+ if (is_active) {
+ const uint block_offset = warp_offset[num_warps];
+ indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
+ }
+}
+
+CCL_NAMESPACE_END
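
For reference, a serial host-side sketch of what gpu_parallel_active_index_array produces: the number of active states and their indices (the GPU version writes them in block/warp order, so the ordering itself carries no meaning). Illustrative only, not part of the sources.

#include <functional>
#include <vector>

static void active_index_array_reference(int num_states,
                                         std::vector<int> &indices,
                                         int *num_indices,
                                         const std::function<bool(int)> &is_active_op)
{
  indices.clear();
  for (int state = 0; state < num_states; state++) {
    if (is_active_op(state)) {
      indices.push_back(state); /* one entry per active state */
    }
  }
  *num_indices = (int)indices.size();
}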
diff --git a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
new file mode 100644
index 00000000000..f609520b8b4
--- /dev/null
+++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Parallel prefix sum.
+ *
+ * TODO: actually make this work in parallel.
+ *
+ * This is used for an array the size of the number of shaders in the scene
+ * which is not usually huge, so might not be a significant bottleneck. */
+
+#include "util/util_atomic.h"
+
+#define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512
+
+template<uint blocksize> __device__ void gpu_parallel_prefix_sum(int *values, const int num_values)
+{
+ if (!(ccl_gpu_block_idx_x == 0 && ccl_gpu_thread_idx_x == 0)) {
+ return;
+ }
+
+ int offset = 0;
+ for (int i = 0; i < num_values; i++) {
+ const int new_offset = offset + values[i];
+ values[i] = offset;
+ offset = new_offset;
+ }
+}
+
+CCL_NAMESPACE_END
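
The kernel above is an in-place exclusive prefix sum: each entry is replaced by the sum of all entries before it, so e.g. {3, 0, 2, 5} becomes {0, 3, 3, 5}. A host-side equivalent for reference (illustrative only):

#include <cassert>
#include <vector>

static void prefix_sum_reference(std::vector<int> &values)
{
  int offset = 0;
  for (int &value : values) {
    const int count = value;
    value = offset; /* sum of all preceding entries */
    offset += count;
  }
}

int main()
{
  std::vector<int> values = {3, 0, 2, 5};
  prefix_sum_reference(values);
  assert((values == std::vector<int>{0, 3, 3, 5}));
  return 0;
}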
diff --git a/intern/cycles/kernel/device/gpu/parallel_reduce.h b/intern/cycles/kernel/device/gpu/parallel_reduce.h
new file mode 100644
index 00000000000..65b1990dbb8
--- /dev/null
+++ b/intern/cycles/kernel/device/gpu/parallel_reduce.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Parallel sum of array input_data with size n into output_sum.
+ *
+ * Adapted from "Optimizing Parallel Reduction in GPU", Mark Harris.
+ *
+ * This version adds multiple elements per thread sequentially. This reduces
+ * the overall cost of the algorithm while keeping the work complexity O(n) and
+ * the step complexity O(log n). (Brent's Theorem optimization) */
+
+#define GPU_PARALLEL_SUM_DEFAULT_BLOCK_SIZE 512
+
+template<uint blocksize, typename InputT, typename OutputT, typename ConvertOp>
+__device__ void gpu_parallel_sum(
+ const InputT *input_data, const uint n, OutputT *output_sum, OutputT zero, ConvertOp convert)
+{
+ extern ccl_gpu_shared OutputT shared_data[];
+
+ const uint tid = ccl_gpu_thread_idx_x;
+ const uint gridsize = blocksize * ccl_gpu_grid_dim_x;
+
+ OutputT sum = zero;
+ for (uint i = ccl_gpu_block_idx_x * blocksize + tid; i < n; i += gridsize) {
+ sum += convert(input_data[i]);
+ }
+ shared_data[tid] = sum;
+
+ ccl_gpu_syncthreads();
+
+ if (blocksize >= 512 && tid < 256) {
+ shared_data[tid] = sum = sum + shared_data[tid + 256];
+ }
+
+ ccl_gpu_syncthreads();
+
+ if (blocksize >= 256 && tid < 128) {
+ shared_data[tid] = sum = sum + shared_data[tid + 128];
+ }
+
+ ccl_gpu_syncthreads();
+
+ if (blocksize >= 128 && tid < 64) {
+ shared_data[tid] = sum = sum + shared_data[tid + 64];
+ }
+
+ ccl_gpu_syncthreads();
+
+ if (blocksize >= 64 && tid < 32) {
+ shared_data[tid] = sum = sum + shared_data[tid + 32];
+ }
+
+ ccl_gpu_syncthreads();
+
+ if (tid < 32) {
+ for (int offset = ccl_gpu_warp_size / 2; offset > 0; offset /= 2) {
+ sum += ccl_gpu_shfl_down_sync(0xFFFFFFFF, sum, offset);
+ }
+ }
+
+ if (tid == 0) {
+ output_sum[ccl_gpu_block_idx_x] = sum;
+ }
+}
+
+CCL_NAMESPACE_END
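
gpu_parallel_sum leaves one partial sum per block in output_sum: each thread first accumulates a grid-strided subset of the input (the Brent's theorem part), the block then reduces through shared memory and warp shuffles, and the caller still has to sum the per-block partials. A serial host-side reference for the value being computed, using the same convert functor shape; illustrative only.

#include <cstddef>

template<typename InputT, typename OutputT, typename ConvertOp>
static OutputT parallel_sum_reference(const InputT *input_data,
                                      size_t n,
                                      OutputT zero,
                                      ConvertOp convert)
{
  OutputT sum = zero;
  for (size_t i = 0; i < n; i++) {
    sum += convert(input_data[i]); /* matches the per-thread grid-stride loop */
  }
  return sum;
}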
diff --git a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
new file mode 100644
index 00000000000..99b35468517
--- /dev/null
+++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Given an array of states, build an array of indices for which the states
+ * are active and sorted by a given key. The prefix sum of the number of active
+ * states per key must have already been computed.
+ *
+ * TODO: there may be ways to optimize this to avoid this many atomic ops? */
+
+#include "util/util_atomic.h"
+
+#define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512
+#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)
+
+template<uint blocksize, typename GetKeyOp>
+__device__ void gpu_parallel_sorted_index_array(const uint num_states,
+ int *indices,
+ int *num_indices,
+ int *key_prefix_sum,
+ GetKeyOp get_key_op)
+{
+ const uint state_index = ccl_gpu_block_idx_x * blocksize + ccl_gpu_thread_idx_x;
+ const int key = (state_index < num_states) ? get_key_op(state_index) :
+ GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;
+
+ if (key != GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY) {
+ const uint index = atomic_fetch_and_add_uint32(&key_prefix_sum[key], 1);
+ indices[index] = state_index;
+ }
+}
+
+CCL_NAMESPACE_END
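
gpu_parallel_sorted_index_array expects key_prefix_sum to already hold, for every key, the starting offset of that key's bucket (an exclusive prefix sum over per-key state counts, e.g. from the prefix sum kernel above). Each active state then atomically claims the next free slot in its bucket, so the resulting index array is grouped by key. A serial host-side sketch of that bucketing; illustrative only.

#include <functional>
#include <vector>

static void sorted_index_array_reference(int num_states,
                                         std::vector<int> &indices,
                                         std::vector<int> &key_prefix_sum,
                                         int inactive_key,
                                         const std::function<int(int)> &get_key_op)
{
  for (int state = 0; state < num_states; state++) {
    const int key = get_key_op(state);
    if (key != inactive_key) {
      /* key_prefix_sum[key] is the next free slot in this key's bucket. */
      indices[key_prefix_sum[key]++] = state;
    }
  }
}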
diff --git a/intern/cycles/kernel/kernel_compat_optix.h b/intern/cycles/kernel/device/optix/compat.h
index 064c99ca100..fb9e094b535 100644
--- a/intern/cycles/kernel/kernel_compat_optix.h
+++ b/intern/cycles/kernel/device/optix/compat.h
@@ -15,14 +15,13 @@
* limitations under the License.
*/
-#ifndef __KERNEL_COMPAT_OPTIX_H__
-#define __KERNEL_COMPAT_OPTIX_H__
+#pragma once
#define OPTIX_DONT_INCLUDE_CUDA
#include <optix.h>
#define __KERNEL_GPU__
-#define __KERNEL_CUDA__ // OptiX kernels are implicitly CUDA kernels too
+#define __KERNEL_CUDA__ /* OptiX kernels are implicitly CUDA kernels too */
#define __KERNEL_OPTIX__
#define CCL_NAMESPACE_BEGIN
#define CCL_NAMESPACE_END
@@ -31,14 +30,14 @@
# define ATTR_FALLTHROUGH
#endif
+/* Manual definitions so we can compile without CUDA toolkit. */
+
#ifdef __CUDACC_RTC__
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
#else
# include <stdint.h>
#endif
-typedef unsigned short half;
-typedef unsigned long long CUtexObject;
#ifdef CYCLES_CUBIN_CC
# define FLT_MIN 1.175494350822287507969e-38f
@@ -46,21 +45,6 @@ typedef unsigned long long CUtexObject;
# define FLT_EPSILON 1.192092896e-07F
#endif
-__device__ half __float2half(const float f)
-{
- half val;
- asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
- return val;
-}
-
-/* Selective nodes compilation. */
-#ifndef __NODES_MAX_GROUP__
-# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
-#endif
-#ifndef __NODES_FEATURES__
-# define __NODES_FEATURES__ NODE_FEATURE_ALL
-#endif
-
#define ccl_device \
__device__ __forceinline__ // Function calls are bad for OptiX performance, so inline everything
#define ccl_device_inline ccl_device
@@ -69,29 +53,75 @@ __device__ half __float2half(const float f)
#define ccl_device_noinline_cpu ccl_device
#define ccl_global
#define ccl_static_constant __constant__
+#define ccl_device_constant __constant__ __device__
#define ccl_constant const
-#define ccl_local
-#define ccl_local_param
+#define ccl_gpu_shared __shared__
#define ccl_private
#define ccl_may_alias
#define ccl_addr_space
-#define ccl_loop_no_unroll
#define ccl_restrict __restrict__
-#define ccl_ref
+#define ccl_loop_no_unroll
#define ccl_align(n) __align__(n)
-// Zero initialize structs to help the compiler figure out scoping
+/* Zero initialize structs to help the compiler figure out scoping */
#define ccl_optional_struct_init = {}
-#define kernel_data __params.data // See kernel_globals.h
-#define kernel_tex_array(t) __params.t
-#define kernel_tex_fetch(t, index) __params.t[(index)]
+/* No assert supported for CUDA */
#define kernel_assert(cond)
+/* GPU thread, block, grid size and index */
+
+#define ccl_gpu_thread_idx_x (threadIdx.x)
+#define ccl_gpu_block_dim_x (blockDim.x)
+#define ccl_gpu_block_idx_x (blockIdx.x)
+#define ccl_gpu_grid_dim_x (gridDim.x)
+#define ccl_gpu_warp_size (warpSize)
+
+#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
+#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
+
+/* GPU warp synchronization. */
+
+#define ccl_gpu_syncthreads() __syncthreads()
+#define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
+#define ccl_gpu_shfl_down_sync(mask, var, delta) __shfl_down_sync(mask, var, delta)
+#define ccl_gpu_popc(x) __popc(x)
+
+/* GPU texture objects */
+
+typedef unsigned long long CUtexObject;
+typedef CUtexObject ccl_gpu_tex_object;
+
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_2D(const ccl_gpu_tex_object texobj,
+ const float x,
+ const float y)
+{
+ return tex2D<T>(texobj, x, y);
+}
+
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object texobj,
+ const float x,
+ const float y,
+ const float z)
+{
+ return tex3D<T>(texobj, x, y, z);
+}
+
+/* Half */
+
+typedef unsigned short half;
+
+__device__ half __float2half(const float f)
+{
+ half val;
+ asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
+ return val;
+}
+
/* Types */
#include "util/util_half.h"
#include "util/util_types.h"
-
-#endif /* __KERNEL_COMPAT_OPTIX_H__ */
diff --git a/intern/cycles/kernel/device/optix/globals.h b/intern/cycles/kernel/device/optix/globals.h
new file mode 100644
index 00000000000..7d898ed5d91
--- /dev/null
+++ b/intern/cycles/kernel/device/optix/globals.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Constant Globals */
+
+#pragma once
+
+#include "kernel/kernel_profiling.h"
+#include "kernel/kernel_types.h"
+
+#include "kernel/integrator/integrator_state.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Not actually used, just a NULL pointer that gets passed everywhere, which we
+ * hope gets optimized out by the compiler. */
+struct KernelGlobals {
+ int unused[1];
+};
+
+/* Launch parameters */
+struct KernelParamsOptiX {
+ /* Kernel arguments */
+ const int *path_index_array;
+ float *render_buffer;
+
+ /* Global scene data and textures */
+ KernelData data;
+#define KERNEL_TEX(type, name) const type *name;
+#include "kernel/kernel_textures.h"
+
+ /* Integrator state */
+ IntegratorStateGPU __integrator_state;
+};
+
+#ifdef __NVCC__
+extern "C" static __constant__ KernelParamsOptiX __params;
+#endif
+
+/* Abstraction macros */
+#define kernel_data __params.data
+#define kernel_tex_array(t) __params.t
+#define kernel_tex_fetch(t, index) __params.t[(index)]
+#define kernel_integrator_state __params.__integrator_state
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/optix/kernel_optix.cu b/intern/cycles/kernel/device/optix/kernel.cu
index 7f609eab474..c1e36febfc0 100644
--- a/intern/cycles/kernel/kernels/optix/kernel_optix.cu
+++ b/intern/cycles/kernel/device/optix/kernel.cu
@@ -16,14 +16,20 @@
*/
// clang-format off
-#include "kernel/kernel_compat_optix.h"
-#include "util/util_atomic.h"
-#include "kernel/kernel_types.h"
-#include "kernel/kernel_globals.h"
-#include "../cuda/kernel_cuda_image.h" // Texture lookup uses normal CUDA intrinsics
-
-#include "kernel/kernel_path.h"
-#include "kernel/kernel_bake.h"
+#include "kernel/device/optix/compat.h"
+#include "kernel/device/optix/globals.h"
+
+#include "kernel/device/gpu/image.h" // Texture lookup uses normal CUDA intrinsics
+
+#include "kernel/integrator/integrator_state.h"
+#include "kernel/integrator/integrator_state_flow.h"
+#include "kernel/integrator/integrator_state_util.h"
+
+#include "kernel/integrator/integrator_intersect_closest.h"
+#include "kernel/integrator/integrator_intersect_shadow.h"
+#include "kernel/integrator/integrator_intersect_subsurface.h"
+#include "kernel/integrator/integrator_intersect_volume_stack.h"
+
// clang-format on
template<typename T> ccl_device_forceinline T *get_payload_ptr_0()
@@ -53,52 +59,36 @@ template<bool always = false> ccl_device_forceinline uint get_object_id()
return OBJECT_NONE;
}
-extern "C" __global__ void __raygen__kernel_optix_path_trace()
+extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_closest()
{
- KernelGlobals kg; // Allocate stack storage for common data
-
- const uint3 launch_index = optixGetLaunchIndex();
- // Keep threads for same pixel together to improve occupancy of warps
- uint pixel_offset = launch_index.x / __params.tile.num_samples;
- uint sample_offset = launch_index.x % __params.tile.num_samples;
-
- kernel_path_trace(&kg,
- __params.tile.buffer,
- __params.tile.start_sample + sample_offset,
- __params.tile.x + pixel_offset,
- __params.tile.y + launch_index.y,
- __params.tile.offset,
- __params.tile.stride);
+ const int global_index = optixGetLaunchIndex().x;
+ const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
+ global_index;
+ integrator_intersect_closest(nullptr, path_index);
}
-#ifdef __BAKING__
-extern "C" __global__ void __raygen__kernel_optix_bake()
+extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_shadow()
{
- KernelGlobals kg;
- const ShaderParams &p = __params.shader;
- kernel_bake_evaluate(&kg,
- p.input,
- p.output,
- (ShaderEvalType)p.type,
- p.filter,
- p.sx + optixGetLaunchIndex().x,
- p.offset,
- p.sample);
+ const int global_index = optixGetLaunchIndex().x;
+ const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
+ global_index;
+ integrator_intersect_shadow(nullptr, path_index);
}
-#endif
-extern "C" __global__ void __raygen__kernel_optix_displace()
+extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_subsurface()
{
- KernelGlobals kg;
- const ShaderParams &p = __params.shader;
- kernel_displace_evaluate(&kg, p.input, p.output, p.sx + optixGetLaunchIndex().x);
+ const int global_index = optixGetLaunchIndex().x;
+ const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
+ global_index;
+ integrator_intersect_subsurface(nullptr, path_index);
}
-extern "C" __global__ void __raygen__kernel_optix_background()
+extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_volume_stack()
{
- KernelGlobals kg;
- const ShaderParams &p = __params.shader;
- kernel_background_evaluate(&kg, p.input, p.output, p.sx + optixGetLaunchIndex().x);
+ const int global_index = optixGetLaunchIndex().x;
+ const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
+ global_index;
+ integrator_intersect_volume_stack(nullptr, path_index);
}
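
All four ray-generation programs above use the same indirection: when path_index_array is set, the launch index selects an entry from a compacted list of path state indices (built by the active/sorted index kernels earlier in this change); otherwise the launch index is the path state index itself. A one-line sketch of that lookup, with an illustrative name:

/* Illustrative only: map a launch index to a path state index. */
static inline int resolve_path_index(const int *path_index_array, int global_index)
{
  return path_index_array ? path_index_array[global_index] : global_index;
}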
extern "C" __global__ void __miss__kernel_optix_miss()
@@ -179,54 +169,91 @@ extern "C" __global__ void __anyhit__kernel_optix_local_hit()
extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
{
#ifdef __SHADOW_RECORD_ALL__
+ bool ignore_intersection = false;
+
const uint prim = optixGetPrimitiveIndex();
# ifdef __VISIBILITY_FLAG__
const uint visibility = optixGetPayload_4();
if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0) {
- return optixIgnoreIntersection();
+ ignore_intersection = true;
}
# endif
- // Offset into array with num_hits
- Intersection *const isect = get_payload_ptr_0<Intersection>() + optixGetPayload_2();
- isect->t = optixGetRayTmax();
- isect->prim = prim;
- isect->object = get_object_id();
- isect->type = kernel_tex_fetch(__prim_type, prim);
-
+ float u = 0.0f, v = 0.0f;
if (optixIsTriangleHit()) {
const float2 barycentrics = optixGetTriangleBarycentrics();
- isect->u = 1.0f - barycentrics.y - barycentrics.x;
- isect->v = barycentrics.x;
+ u = 1.0f - barycentrics.y - barycentrics.x;
+ v = barycentrics.x;
}
# ifdef __HAIR__
else {
- const float u = __uint_as_float(optixGetAttribute_0());
- isect->u = u;
- isect->v = __uint_as_float(optixGetAttribute_1());
+ u = __uint_as_float(optixGetAttribute_0());
+ v = __uint_as_float(optixGetAttribute_1());
// Filter out curve endcaps
if (u == 0.0f || u == 1.0f) {
- return optixIgnoreIntersection();
+ ignore_intersection = true;
}
}
# endif
+ int num_hits = optixGetPayload_2();
+ int record_index = num_hits;
+ const int max_hits = optixGetPayload_3();
+
+ if (!ignore_intersection) {
+ optixSetPayload_2(num_hits + 1);
+ }
+
+ Intersection *const isect_array = get_payload_ptr_0<Intersection>();
+
# ifdef __TRANSPARENT_SHADOWS__
- // Detect if this surface has a shader with transparent shadows
- if (!shader_transparent_shadow(NULL, isect) || optixGetPayload_2() >= optixGetPayload_3()) {
+ if (num_hits >= max_hits) {
+ /* If maximum number of hits reached, find a hit to replace. */
+ const int num_recorded_hits = min(max_hits, num_hits);
+ float max_recorded_t = isect_array[0].t;
+ int max_recorded_hit = 0;
+
+ for (int i = 1; i < num_recorded_hits; i++) {
+ if (isect_array[i].t > max_recorded_t) {
+ max_recorded_t = isect_array[i].t;
+ max_recorded_hit = i;
+ }
+ }
+
+ if (optixGetRayTmax() >= max_recorded_t) {
+ /* Accept the hit, so that OptiX won't consider any more hits beyond the distance of
+ * the current hit. */
+ return;
+ }
+
+ record_index = max_recorded_hit;
+ }
# endif
- // This is an opaque hit or the hit limit has been reached, abort traversal
- optixSetPayload_5(true);
- return optixTerminateRay();
+
+ if (!ignore_intersection) {
+ Intersection *const isect = isect_array + record_index;
+ isect->u = u;
+ isect->v = v;
+ isect->t = optixGetRayTmax();
+ isect->prim = prim;
+ isect->object = get_object_id();
+ isect->type = kernel_tex_fetch(__prim_type, prim);
+
+# ifdef __TRANSPARENT_SHADOWS__
+ // Detect if this surface has a shader with transparent shadows
+ if (!shader_transparent_shadow(NULL, isect) || max_hits == 0) {
+# endif
+ // If no transparent shadows, all light is blocked and we can stop immediately
+ optixSetPayload_5(true);
+ return optixTerminateRay();
# ifdef __TRANSPARENT_SHADOWS__
+ }
+# endif
}
- optixSetPayload_2(optixGetPayload_2() + 1); // num_hits++
-
// Continue tracing
optixIgnoreIntersection();
-# endif
#endif
}
@@ -300,7 +327,7 @@ ccl_device_inline void optix_intersection_curve(const uint prim, const uint type
if (isect.t != FLT_MAX)
isect.t *= len;
- if (curve_intersect(NULL, &isect, P, dir, visibility, object, prim, time, type)) {
+ if (curve_intersect(NULL, &isect, P, dir, isect.t, visibility, object, prim, time, type)) {
optixReportIntersection(isect.t / len,
type & PRIMITIVE_ALL,
__float_as_int(isect.u), // Attribute_0
@@ -317,11 +344,4 @@ extern "C" __global__ void __intersection__curve_ribbon()
optix_intersection_curve(prim, type);
}
}
-
-extern "C" __global__ void __intersection__curve_all()
-{
- const uint prim = optixGetPrimitiveIndex();
- const uint type = kernel_tex_fetch(__prim_type, prim);
- optix_intersection_curve(prim, type);
-}
#endif
diff --git a/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu b/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu
new file mode 100644
index 00000000000..bf787e29eaa
--- /dev/null
+++ b/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2021, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Copy of the regular kernels with an additional shader ray-tracing kernel that takes
+ * much longer to compile. This is only loaded when needed by the scene. */
+
+#include "kernel/device/optix/kernel.cu"
+#include "kernel/integrator/integrator_shade_surface.h"
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_shade_surface_raytrace()
+{
+ const int global_index = optixGetLaunchIndex().x;
+ const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
+ global_index;
+ integrator_shade_surface_raytrace(nullptr, path_index, __params.render_buffer);
+}
diff --git a/intern/cycles/kernel/filter/filter.h b/intern/cycles/kernel/filter/filter.h
deleted file mode 100644
index b067e53a8bf..00000000000
--- a/intern/cycles/kernel/filter/filter.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __FILTER_H__
-#define __FILTER_H__
-
-/* CPU Filter Kernel Interface */
-
-#include "util/util_types.h"
-
-#include "kernel/filter/filter_defines.h"
-
-CCL_NAMESPACE_BEGIN
-
-#define KERNEL_NAME_JOIN(x, y, z) x##_##y##_##z
-#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name)
-#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)
-
-#define KERNEL_ARCH cpu
-#include "kernel/kernels/cpu/filter_cpu.h"
-
-#define KERNEL_ARCH cpu_sse2
-#include "kernel/kernels/cpu/filter_cpu.h"
-
-#define KERNEL_ARCH cpu_sse3
-#include "kernel/kernels/cpu/filter_cpu.h"
-
-#define KERNEL_ARCH cpu_sse41
-#include "kernel/kernels/cpu/filter_cpu.h"
-
-#define KERNEL_ARCH cpu_avx
-#include "kernel/kernels/cpu/filter_cpu.h"
-
-#define KERNEL_ARCH cpu_avx2
-#include "kernel/kernels/cpu/filter_cpu.h"
-
-CCL_NAMESPACE_END
-
-#endif /* __FILTER_H__ */
diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h
deleted file mode 100644
index 1c0ac5e2cb7..00000000000
--- a/intern/cycles/kernel/filter/filter_defines.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __FILTER_DEFINES_H__
-#define __FILTER_DEFINES_H__
-
-#define DENOISE_FEATURES 11
-#define TRANSFORM_SIZE (DENOISE_FEATURES * DENOISE_FEATURES)
-#define XTWX_SIZE (((DENOISE_FEATURES + 1) * (DENOISE_FEATURES + 2)) / 2)
-#define XTWY_SIZE (DENOISE_FEATURES + 1)
-
-#define DENOISE_MAX_FRAMES 16
-
-typedef struct TileInfo {
- int offsets[9];
- int strides[9];
- int x[4];
- int y[4];
- int from_render;
- int frames[DENOISE_MAX_FRAMES];
- int num_frames;
- /* TODO(lukas): CUDA doesn't have uint64_t... */
-#ifdef __KERNEL_OPENCL__
- ccl_global float *buffers[9];
-#else
- long long int buffers[9];
-#endif
-} TileInfo;
-
-#ifdef __KERNEL_OPENCL__
-# define CCL_FILTER_TILE_INFO \
- ccl_global TileInfo *tile_info, ccl_global float *tile_buffer_1, \
- ccl_global float *tile_buffer_2, ccl_global float *tile_buffer_3, \
- ccl_global float *tile_buffer_4, ccl_global float *tile_buffer_5, \
- ccl_global float *tile_buffer_6, ccl_global float *tile_buffer_7, \
- ccl_global float *tile_buffer_8, ccl_global float *tile_buffer_9
-# define CCL_FILTER_TILE_INFO_ARG \
- tile_info, tile_buffer_1, tile_buffer_2, tile_buffer_3, tile_buffer_4, tile_buffer_5, \
- tile_buffer_6, tile_buffer_7, tile_buffer_8, tile_buffer_9
-# define ccl_get_tile_buffer(id) \
- (id == 0 ? tile_buffer_1 : \
- id == 1 ? tile_buffer_2 : \
- id == 2 ? tile_buffer_3 : \
- id == 3 ? tile_buffer_4 : \
- id == 4 ? tile_buffer_5 : \
- id == 5 ? tile_buffer_6 : \
- id == 6 ? tile_buffer_7 : \
- id == 7 ? tile_buffer_8 : \
- tile_buffer_9)
-#else
-# ifdef __KERNEL_CUDA__
-# define CCL_FILTER_TILE_INFO ccl_global TileInfo *tile_info
-# else
-# define CCL_FILTER_TILE_INFO TileInfo *tile_info
-# endif
-# define ccl_get_tile_buffer(id) (tile_info->buffers[id])
-#endif
-
-#endif /* __FILTER_DEFINES_H__*/
diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h
deleted file mode 100644
index 8a2af957146..00000000000
--- a/intern/cycles/kernel/filter/filter_features.h
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#define ccl_get_feature(buffer, pass) (buffer)[(pass)*pass_stride]
-
-/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).
- * pixel_buffer always
- * points to the current pixel in the first pass. Repeat the loop for every secondary frame if
- * there are any. */
-#define FOR_PIXEL_WINDOW \
- for (int frame = 0; frame < tile_info->num_frames; frame++) { \
- pixel.z = tile_info->frames[frame]; \
- pixel_buffer = buffer + (low.y - rect.y) * buffer_w + (low.x - rect.x) + \
- frame * frame_stride; \
- for (pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
- for (pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) {
-
-#define END_FOR_PIXEL_WINDOW \
- } \
- pixel_buffer += buffer_w - (high.x - low.x); \
- } \
- }
-
-ccl_device_inline void filter_get_features(int3 pixel,
- const ccl_global float *ccl_restrict buffer,
- float *features,
- bool use_time,
- const float *ccl_restrict mean,
- int pass_stride)
-{
- features[0] = pixel.x;
- features[1] = pixel.y;
- features[2] = fabsf(ccl_get_feature(buffer, 0));
- features[3] = ccl_get_feature(buffer, 1);
- features[4] = ccl_get_feature(buffer, 2);
- features[5] = ccl_get_feature(buffer, 3);
- features[6] = ccl_get_feature(buffer, 4);
- features[7] = ccl_get_feature(buffer, 5);
- features[8] = ccl_get_feature(buffer, 6);
- features[9] = ccl_get_feature(buffer, 7);
- if (use_time) {
- features[10] = pixel.z;
- }
- if (mean) {
- for (int i = 0; i < (use_time ? 11 : 10); i++) {
- features[i] -= mean[i];
- }
- }
-}
-
-ccl_device_inline void filter_get_feature_scales(int3 pixel,
- const ccl_global float *ccl_restrict buffer,
- float *scales,
- bool use_time,
- const float *ccl_restrict mean,
- int pass_stride)
-{
- scales[0] = fabsf(pixel.x - mean[0]);
- scales[1] = fabsf(pixel.y - mean[1]);
- scales[2] = fabsf(fabsf(ccl_get_feature(buffer, 0)) - mean[2]);
- scales[3] = len_squared(make_float3(ccl_get_feature(buffer, 1) - mean[3],
- ccl_get_feature(buffer, 2) - mean[4],
- ccl_get_feature(buffer, 3) - mean[5]));
- scales[4] = fabsf(ccl_get_feature(buffer, 4) - mean[6]);
- scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7],
- ccl_get_feature(buffer, 6) - mean[8],
- ccl_get_feature(buffer, 7) - mean[9]));
- if (use_time) {
- scales[6] = fabsf(pixel.z - mean[10]);
- }
-}
-
-ccl_device_inline void filter_calculate_scale(float *scale, bool use_time)
-{
- scale[0] = 1.0f / max(scale[0], 0.01f);
- scale[1] = 1.0f / max(scale[1], 0.01f);
- scale[2] = 1.0f / max(scale[2], 0.01f);
- if (use_time) {
- scale[10] = 1.0f / max(scale[6], 0.01f);
- }
- scale[6] = 1.0f / max(scale[4], 0.01f);
- scale[7] = scale[8] = scale[9] = 1.0f / max(sqrtf(scale[5]), 0.01f);
- scale[3] = scale[4] = scale[5] = 1.0f / max(sqrtf(scale[3]), 0.01f);
-}
-
-ccl_device_inline float3 filter_get_color(const ccl_global float *ccl_restrict buffer,
- int pass_stride)
-{
- return make_float3(
- ccl_get_feature(buffer, 8), ccl_get_feature(buffer, 9), ccl_get_feature(buffer, 10));
-}
-
-ccl_device_inline void design_row_add(float *design_row,
- int rank,
- const ccl_global float *ccl_restrict transform,
- int stride,
- int row,
- float feature,
- int transform_row_stride)
-{
- for (int i = 0; i < rank; i++) {
- design_row[1 + i] += transform[(row * transform_row_stride + i) * stride] * feature;
- }
-}
-
-/* Fill the design row. */
-ccl_device_inline void filter_get_design_row_transform(
- int3 p_pixel,
- const ccl_global float *ccl_restrict p_buffer,
- int3 q_pixel,
- const ccl_global float *ccl_restrict q_buffer,
- int pass_stride,
- int rank,
- float *design_row,
- const ccl_global float *ccl_restrict transform,
- int stride,
- bool use_time)
-{
- int num_features = use_time ? 11 : 10;
-
- design_row[0] = 1.0f;
- math_vector_zero(design_row + 1, rank);
-
-#define DESIGN_ROW_ADD(I, F) \
- design_row_add(design_row, rank, transform, stride, I, F, num_features);
- DESIGN_ROW_ADD(0, q_pixel.x - p_pixel.x);
- DESIGN_ROW_ADD(1, q_pixel.y - p_pixel.y);
- DESIGN_ROW_ADD(2, fabsf(ccl_get_feature(q_buffer, 0)) - fabsf(ccl_get_feature(p_buffer, 0)));
- DESIGN_ROW_ADD(3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1));
- DESIGN_ROW_ADD(4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2));
- DESIGN_ROW_ADD(5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3));
- DESIGN_ROW_ADD(6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4));
- DESIGN_ROW_ADD(7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5));
- DESIGN_ROW_ADD(8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6));
- DESIGN_ROW_ADD(9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7));
- if (use_time) {
- DESIGN_ROW_ADD(10, q_pixel.z - p_pixel.z)
- }
-#undef DESIGN_ROW_ADD
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h
deleted file mode 100644
index 59d4ace2bef..00000000000
--- a/intern/cycles/kernel/filter/filter_features_sse.h
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#define ccl_get_feature_sse(pass) load_float4(buffer + (pass)*pass_stride)
-
-/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time.
- * pixel_buffer always points to the first of the 4 current pixel in the first pass.
- * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set
- * for all pixels within the window. Repeat the loop for every secondary frame if there are any. */
-#define FOR_PIXEL_WINDOW_SSE \
- for (int frame = 0; frame < tile_info->num_frames; frame++) { \
- pixel.z = tile_info->frames[frame]; \
- pixel_buffer = buffer + (low.y - rect.y) * buffer_w + (low.x - rect.x) + \
- frame * frame_stride; \
- float4 t4 = make_float4(pixel.z); \
- for (pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
- float4 y4 = make_float4(pixel.y); \
- for (pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \
- float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \
- int4 active_pixels = x4 < make_float4(high.x);
-
-#define END_FOR_PIXEL_WINDOW_SSE \
- } \
- pixel_buffer += buffer_w - (high.x - low.x); \
- } \
- }
-
-ccl_device_inline void filter_get_features_sse(float4 x,
- float4 y,
- float4 t,
- int4 active_pixels,
- const float *ccl_restrict buffer,
- float4 *features,
- bool use_time,
- const float4 *ccl_restrict mean,
- int pass_stride)
-{
- int num_features = use_time ? 11 : 10;
-
- features[0] = x;
- features[1] = y;
- features[2] = fabs(ccl_get_feature_sse(0));
- features[3] = ccl_get_feature_sse(1);
- features[4] = ccl_get_feature_sse(2);
- features[5] = ccl_get_feature_sse(3);
- features[6] = ccl_get_feature_sse(4);
- features[7] = ccl_get_feature_sse(5);
- features[8] = ccl_get_feature_sse(6);
- features[9] = ccl_get_feature_sse(7);
- if (use_time) {
- features[10] = t;
- }
-
- if (mean) {
- for (int i = 0; i < num_features; i++) {
- features[i] = features[i] - mean[i];
- }
- }
- for (int i = 0; i < num_features; i++) {
- features[i] = mask(active_pixels, features[i]);
- }
-}
-
-ccl_device_inline void filter_get_feature_scales_sse(float4 x,
- float4 y,
- float4 t,
- int4 active_pixels,
- const float *ccl_restrict buffer,
- float4 *scales,
- bool use_time,
- const float4 *ccl_restrict mean,
- int pass_stride)
-{
- scales[0] = fabs(x - mean[0]);
- scales[1] = fabs(y - mean[1]);
- scales[2] = fabs(fabs(ccl_get_feature_sse(0)) - mean[2]);
- scales[3] = sqr(ccl_get_feature_sse(1) - mean[3]) + sqr(ccl_get_feature_sse(2) - mean[4]) +
- sqr(ccl_get_feature_sse(3) - mean[5]);
- scales[4] = fabs(ccl_get_feature_sse(4) - mean[6]);
- scales[5] = sqr(ccl_get_feature_sse(5) - mean[7]) + sqr(ccl_get_feature_sse(6) - mean[8]) +
- sqr(ccl_get_feature_sse(7) - mean[9]);
- if (use_time) {
- scales[6] = fabs(t - mean[10]);
- }
-
- for (int i = 0; i < (use_time ? 7 : 6); i++)
- scales[i] = mask(active_pixels, scales[i]);
-}
-
-ccl_device_inline void filter_calculate_scale_sse(float4 *scale, bool use_time)
-{
- scale[0] = rcp(max(reduce_max(scale[0]), make_float4(0.01f)));
- scale[1] = rcp(max(reduce_max(scale[1]), make_float4(0.01f)));
- scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f)));
- if (use_time) {
- scale[10] = rcp(max(reduce_max(scale[6]), make_float4(0.01f)));
- }
- scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f)));
- scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f)));
- scale[3] = scale[4] = scale[5] = rcp(max(reduce_max(sqrt(scale[3])), make_float4(0.01f)));
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_kernel.h b/intern/cycles/kernel/filter/filter_kernel.h
deleted file mode 100644
index 2ef03dc0a02..00000000000
--- a/intern/cycles/kernel/filter/filter_kernel.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "util/util_color.h"
-#include "util/util_math.h"
-#include "util/util_math_fast.h"
-#include "util/util_texture.h"
-
-#include "util/util_atomic.h"
-#include "util/util_math_matrix.h"
-
-#include "kernel/filter/filter_defines.h"
-
-#include "kernel/filter/filter_features.h"
-#ifdef __KERNEL_SSE3__
-# include "kernel/filter/filter_features_sse.h"
-#endif
-
-#include "kernel/filter/filter_prefilter.h"
-
-#ifdef __KERNEL_GPU__
-# include "kernel/filter/filter_transform_gpu.h"
-#else
-# ifdef __KERNEL_SSE3__
-# include "kernel/filter/filter_transform_sse.h"
-# else
-# include "kernel/filter/filter_transform.h"
-# endif
-#endif
-
-#include "kernel/filter/filter_reconstruction.h"
-
-#ifdef __KERNEL_CPU__
-# include "kernel/filter/filter_nlm_cpu.h"
-#else
-# include "kernel/filter/filter_nlm_gpu.h"
-#endif
diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h
deleted file mode 100644
index 24200c29203..00000000000
--- a/intern/cycles/kernel/filter/filter_nlm_cpu.h
+++ /dev/null
@@ -1,254 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#define load4_a(buf, ofs) (*((float4 *)((buf) + (ofs))))
-#define load4_u(buf, ofs) load_float4((buf) + (ofs))
-
-ccl_device_inline void kernel_filter_nlm_calc_difference(int dx,
- int dy,
- const float *ccl_restrict weight_image,
- const float *ccl_restrict variance_image,
- const float *ccl_restrict scale_image,
- float *difference_image,
- int4 rect,
- int stride,
- int channel_offset,
- int frame_offset,
- float a,
- float k_2)
-{
- /* Strides need to be aligned to 16 bytes. */
- kernel_assert((stride % 4) == 0 && (channel_offset % 4) == 0);
-
- int aligned_lowx = rect.x & (~3);
- const int numChannels = (channel_offset > 0) ? 3 : 1;
- const float4 channel_fac = make_float4(1.0f / numChannels);
-
- for (int y = rect.y; y < rect.w; y++) {
- int idx_p = y * stride + aligned_lowx;
- int idx_q = (y + dy) * stride + aligned_lowx + dx + frame_offset;
- for (int x = aligned_lowx; x < rect.z; x += 4, idx_p += 4, idx_q += 4) {
- float4 diff = make_float4(0.0f);
- float4 scale_fac;
- if (scale_image) {
- scale_fac = clamp(load4_a(scale_image, idx_p) / load4_u(scale_image, idx_q),
- make_float4(0.25f),
- make_float4(4.0f));
- }
- else {
- scale_fac = make_float4(1.0f);
- }
- for (int c = 0, chan_ofs = 0; c < numChannels; c++, chan_ofs += channel_offset) {
- /* idx_p is guaranteed to be aligned, but idx_q isn't. */
- float4 color_p = load4_a(weight_image, idx_p + chan_ofs);
- float4 color_q = scale_fac * load4_u(weight_image, idx_q + chan_ofs);
- float4 cdiff = color_p - color_q;
- float4 var_p = load4_a(variance_image, idx_p + chan_ofs);
- float4 var_q = sqr(scale_fac) * load4_u(variance_image, idx_q + chan_ofs);
- diff += (cdiff * cdiff - a * (var_p + min(var_p, var_q))) /
- (make_float4(1e-8f) + k_2 * (var_p + var_q));
- }
- load4_a(difference_image, idx_p) = diff * channel_fac;
- }
- }
-}
-
-ccl_device_inline void kernel_filter_nlm_blur(
- const float *ccl_restrict difference_image, float *out_image, int4 rect, int stride, int f)
-{
- int aligned_lowx = round_down(rect.x, 4);
- for (int y = rect.y; y < rect.w; y++) {
- const int low = max(rect.y, y - f);
- const int high = min(rect.w, y + f + 1);
- for (int x = aligned_lowx; x < rect.z; x += 4) {
- load4_a(out_image, y * stride + x) = make_float4(0.0f);
- }
- for (int y1 = low; y1 < high; y1++) {
- for (int x = aligned_lowx; x < rect.z; x += 4) {
- load4_a(out_image, y * stride + x) += load4_a(difference_image, y1 * stride + x);
- }
- }
- float fac = 1.0f / (high - low);
- for (int x = aligned_lowx; x < rect.z; x += 4) {
- load4_a(out_image, y * stride + x) *= fac;
- }
- }
-}
-
-ccl_device_inline void nlm_blur_horizontal(
- const float *ccl_restrict difference_image, float *out_image, int4 rect, int stride, int f)
-{
- int aligned_lowx = round_down(rect.x, 4);
- for (int y = rect.y; y < rect.w; y++) {
- for (int x = aligned_lowx; x < rect.z; x += 4) {
- load4_a(out_image, y * stride + x) = make_float4(0.0f);
- }
- }
-
- for (int dx = -f; dx <= f; dx++) {
- aligned_lowx = round_down(rect.x - min(0, dx), 4);
- int highx = rect.z - max(0, dx);
- int4 lowx4 = make_int4(rect.x - min(0, dx));
- int4 highx4 = make_int4(rect.z - max(0, dx));
- for (int y = rect.y; y < rect.w; y++) {
- for (int x = aligned_lowx; x < highx; x += 4) {
- int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3);
- int4 active = (x4 >= lowx4) & (x4 < highx4);
-
- float4 diff = load4_u(difference_image, y * stride + x + dx);
- load4_a(out_image, y * stride + x) += mask(active, diff);
- }
- }
- }
-
- aligned_lowx = round_down(rect.x, 4);
- for (int y = rect.y; y < rect.w; y++) {
- for (int x = aligned_lowx; x < rect.z; x += 4) {
- float4 x4 = make_float4(x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f);
- float4 low = max(make_float4(rect.x), x4 - make_float4(f));
- float4 high = min(make_float4(rect.z), x4 + make_float4(f + 1));
- load4_a(out_image, y * stride + x) *= rcp(high - low);
- }
- }
-}
-
-ccl_device_inline void kernel_filter_nlm_calc_weight(
- const float *ccl_restrict difference_image, float *out_image, int4 rect, int stride, int f)
-{
- nlm_blur_horizontal(difference_image, out_image, rect, stride, f);
-
- int aligned_lowx = round_down(rect.x, 4);
- for (int y = rect.y; y < rect.w; y++) {
- for (int x = aligned_lowx; x < rect.z; x += 4) {
- load4_a(out_image, y * stride + x) = fast_expf4(
- -max(load4_a(out_image, y * stride + x), make_float4(0.0f)));
- }
- }
-}
-
-ccl_device_inline void kernel_filter_nlm_update_output(int dx,
- int dy,
- const float *ccl_restrict difference_image,
- const float *ccl_restrict image,
- float *temp_image,
- float *out_image,
- float *accum_image,
- int4 rect,
- int channel_offset,
- int stride,
- int f)
-{
- nlm_blur_horizontal(difference_image, temp_image, rect, stride, f);
-
- int aligned_lowx = round_down(rect.x, 4);
- for (int y = rect.y; y < rect.w; y++) {
- for (int x = aligned_lowx; x < rect.z; x += 4) {
- int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3);
- int4 active = (x4 >= make_int4(rect.x)) & (x4 < make_int4(rect.z));
-
- int idx_p = y * stride + x, idx_q = (y + dy) * stride + (x + dx);
-
- float4 weight = load4_a(temp_image, idx_p);
- load4_a(accum_image, idx_p) += mask(active, weight);
-
- float4 val = load4_u(image, idx_q);
- if (channel_offset) {
- val += load4_u(image, idx_q + channel_offset);
- val += load4_u(image, idx_q + 2 * channel_offset);
- val *= 1.0f / 3.0f;
- }
-
- load4_a(out_image, idx_p) += mask(active, weight * val);
- }
- }
-}
-
-ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx,
- int dy,
- int t,
- const float *ccl_restrict
- difference_image,
- const float *ccl_restrict buffer,
- float *transform,
- int *rank,
- float *XtWX,
- float3 *XtWY,
- int4 rect,
- int4 filter_window,
- int stride,
- int f,
- int pass_stride,
- int frame_offset,
- bool use_time)
-{
- int4 clip_area = rect_clip(rect, filter_window);
- /* fx and fy are in filter-window-relative coordinates,
- * while x and y are in feature-window-relative coordinates. */
- for (int y = clip_area.y; y < clip_area.w; y++) {
- for (int x = clip_area.x; x < clip_area.z; x++) {
- const int low = max(rect.x, x - f);
- const int high = min(rect.z, x + f + 1);
- float sum = 0.0f;
- for (int x1 = low; x1 < high; x1++) {
- sum += difference_image[y * stride + x1];
- }
- float weight = sum * (1.0f / (high - low));
-
- int storage_ofs = coord_to_local_index(filter_window, x, y);
- float *l_transform = transform + storage_ofs * TRANSFORM_SIZE;
- float *l_XtWX = XtWX + storage_ofs * XTWX_SIZE;
- float3 *l_XtWY = XtWY + storage_ofs * XTWY_SIZE;
- int *l_rank = rank + storage_ofs;
-
- kernel_filter_construct_gramian(x,
- y,
- 1,
- dx,
- dy,
- t,
- stride,
- pass_stride,
- frame_offset,
- use_time,
- buffer,
- l_transform,
- l_rank,
- weight,
- l_XtWX,
- l_XtWY,
- 0);
- }
- }
-}
-
-ccl_device_inline void kernel_filter_nlm_normalize(float *out_image,
- const float *ccl_restrict accum_image,
- int4 rect,
- int w)
-{
- for (int y = rect.y; y < rect.w; y++) {
- for (int x = rect.x; x < rect.z; x++) {
- out_image[y * w + x] /= accum_image[y * w + x];
- }
- }
-}
-
-#undef load4_a
-#undef load4_u
-
-CCL_NAMESPACE_END
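
Taken together, the CPU kernels above are one pass of a standard non-local-means filter: a per-pixel squared difference between a pixel p and its neighbor shifted by (dx, dy), a box blur of that difference over the patch radius f, an exponential falloff that turns the blurred distance into a weight, and a weighted accumulation that is normalized at the end. A minimal single-channel sketch of that pipeline in plain C++ (illustrative only; the variance-aware difference term and the SSE load4_a/load4_u helpers are simplified away, and k_2 just controls the falloff as in the kernels above):

#include <algorithm>
#include <cmath>
#include <vector>

/* Single-channel non-local-means sketch: weight(p, q) = exp(-blurred squared difference). */
static void nlm_denoise_sketch(const std::vector<float> &image,
                               std::vector<float> &out,
                               int w, int h, int r, int f, float k_2)
{
  std::vector<float> accum(w * h, 0.0f);
  std::vector<float> result(w * h, 0.0f);

  for (int dy = -r; dy <= r; dy++) {
    for (int dx = -r; dx <= r; dx++) {
      for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
          /* Box-blurred squared difference over the (2f+1)^2 patch, clamped to the image. */
          float sum = 0.0f;
          int n = 0;
          for (int py = std::max(0, y - f); py < std::min(h, y + f + 1); py++) {
            for (int px = std::max(0, x - f); px < std::min(w, x + f + 1); px++) {
              const int qx = std::min(std::max(px + dx, 0), w - 1);
              const int qy = std::min(std::max(py + dy, 0), h - 1);
              const float d = image[py * w + px] - image[qy * w + qx];
              sum += d * d / k_2;
              n++;
            }
          }
          const float weight = std::exp(-std::max(sum / n, 0.0f));

          const int qx = std::min(std::max(x + dx, 0), w - 1);
          const int qy = std::min(std::max(y + dy, 0), h - 1);
          accum[y * w + x] += weight;
          result[y * w + x] += weight * image[qy * w + qx];
        }
      }
    }
  }

  out.resize(w * h);
  for (int i = 0; i < w * h; i++)
    out[i] = result[i] / accum[i];
}

The real kernels split the blur into separate horizontal and vertical passes (kernel_filter_nlm_blur and nlm_blur_horizontal) and run each stage as its own kernel, which is what allows the GPU variant below to parallelize over offsets as well as pixels.
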
diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h
deleted file mode 100644
index 650c743f34f..00000000000
--- a/intern/cycles/kernel/filter/filter_nlm_gpu.h
+++ /dev/null
@@ -1,255 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* Determines pixel coordinates and offset for the current thread.
- * Returns whether the thread should do any work.
- *
- * All coordinates are relative to the denoising buffer!
- *
- * Window is the rect that should be processed.
- * co is filled with (x, y, dx, dy).
- */
-ccl_device_inline bool get_nlm_coords_window(
- int w, int h, int r, int stride, int4 *rect, int4 *co, int *ofs, int4 window)
-{
- /* Determine the pixel offset that this thread should apply. */
- int s = 2 * r + 1;
- int si = ccl_global_id(1);
- int sx = si % s;
- int sy = si / s;
- if (sy >= s) {
- return false;
- }
-
- /* Pixels still need to lie inside the denoising buffer after applying the offset,
- * so determine the area for which this is the case. */
- int dx = sx - r;
- int dy = sy - r;
-
- *rect = make_int4(max(0, -dx), max(0, -dy), w - max(0, dx), h - max(0, dy));
-
- /* Find the intersection of the area that we want to process (window) and the area
- * that can be processed (rect) to get the final area for this offset. */
- int4 clip_area = rect_clip(window, *rect);
-
- /* If the radius is larger than one of the sides of the window,
- * there will be shifts for which there is no usable pixel at all. */
- if (!rect_is_valid(clip_area)) {
- return false;
- }
-
- /* Map the linear thread index to pixels inside the clip area. */
- int x, y;
- if (!local_index_to_coord(clip_area, ccl_global_id(0), &x, &y)) {
- return false;
- }
-
- *co = make_int4(x, y, dx, dy);
-
- *ofs = (sy * s + sx) * stride;
-
- return true;
-}
-
-ccl_device_inline bool get_nlm_coords(
- int w, int h, int r, int stride, int4 *rect, int4 *co, int *ofs)
-{
- return get_nlm_coords_window(w, h, r, stride, rect, co, ofs, make_int4(0, 0, w, h));
-}
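
To make the index math in get_nlm_coords_window concrete: with radius r = 2 the offset window is s = 2 * r + 1 = 5 entries wide per axis, so a thread whose ccl_global_id(1) is 7 maps to sx = 7 % 5 = 2 and sy = 7 / 5 = 1, i.e. the shift (dx, dy) = (2 - 2, 1 - 2) = (0, -1). The same arithmetic as a standalone snippet (the radius and thread index are made-up values):

#include <cstdio>

int main()
{
  const int r = 2, si = 7;  /* hypothetical radius and linear thread index */
  const int s = 2 * r + 1;  /* 5 offsets per axis */
  const int sx = si % s, sy = si / s;
  const int dx = sx - r, dy = sy - r;
  printf("(dx, dy) = (%d, %d)\n", dx, dy); /* prints (0, -1) */
  return 0;
}
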
-
-ccl_device_inline void kernel_filter_nlm_calc_difference(
- int x,
- int y,
- int dx,
- int dy,
- const ccl_global float *ccl_restrict weight_image,
- const ccl_global float *ccl_restrict variance_image,
- const ccl_global float *ccl_restrict scale_image,
- ccl_global float *difference_image,
- int4 rect,
- int stride,
- int channel_offset,
- int frame_offset,
- float a,
- float k_2)
-{
- int idx_p = y * stride + x, idx_q = (y + dy) * stride + (x + dx) + frame_offset;
- int numChannels = channel_offset ? 3 : 1;
-
- float diff = 0.0f;
- float scale_fac = 1.0f;
- if (scale_image) {
- scale_fac = clamp(scale_image[idx_p] / scale_image[idx_q], 0.25f, 4.0f);
- }
-
- for (int c = 0; c < numChannels; c++, idx_p += channel_offset, idx_q += channel_offset) {
- float cdiff = weight_image[idx_p] - scale_fac * weight_image[idx_q];
- float pvar = variance_image[idx_p];
- float qvar = sqr(scale_fac) * variance_image[idx_q];
- diff += (cdiff * cdiff - a * (pvar + min(pvar, qvar))) / (1e-8f + k_2 * (pvar + qvar));
- }
- if (numChannels > 1) {
- diff *= 1.0f / numChannels;
- }
- difference_image[y * stride + x] = diff;
-}
-
-ccl_device_inline void kernel_filter_nlm_blur(int x,
- int y,
- const ccl_global float *ccl_restrict
- difference_image,
- ccl_global float *out_image,
- int4 rect,
- int stride,
- int f)
-{
- float sum = 0.0f;
- const int low = max(rect.y, y - f);
- const int high = min(rect.w, y + f + 1);
- for (int y1 = low; y1 < high; y1++) {
- sum += difference_image[y1 * stride + x];
- }
- sum *= 1.0f / (high - low);
- out_image[y * stride + x] = sum;
-}
-
-ccl_device_inline void kernel_filter_nlm_calc_weight(int x,
- int y,
- const ccl_global float *ccl_restrict
- difference_image,
- ccl_global float *out_image,
- int4 rect,
- int stride,
- int f)
-{
- float sum = 0.0f;
- const int low = max(rect.x, x - f);
- const int high = min(rect.z, x + f + 1);
- for (int x1 = low; x1 < high; x1++) {
- sum += difference_image[y * stride + x1];
- }
- sum *= 1.0f / (high - low);
- out_image[y * stride + x] = fast_expf(-max(sum, 0.0f));
-}
-
-ccl_device_inline void kernel_filter_nlm_update_output(int x,
- int y,
- int dx,
- int dy,
- const ccl_global float *ccl_restrict
- difference_image,
- const ccl_global float *ccl_restrict image,
- ccl_global float *out_image,
- ccl_global float *accum_image,
- int4 rect,
- int channel_offset,
- int stride,
- int f)
-{
- float sum = 0.0f;
- const int low = max(rect.x, x - f);
- const int high = min(rect.z, x + f + 1);
- for (int x1 = low; x1 < high; x1++) {
- sum += difference_image[y * stride + x1];
- }
- sum *= 1.0f / (high - low);
-
- int idx_p = y * stride + x, idx_q = (y + dy) * stride + (x + dx);
- if (out_image) {
- atomic_add_and_fetch_float(accum_image + idx_p, sum);
-
- float val = image[idx_q];
- if (channel_offset) {
- val += image[idx_q + channel_offset];
- val += image[idx_q + 2 * channel_offset];
- val *= 1.0f / 3.0f;
- }
- atomic_add_and_fetch_float(out_image + idx_p, sum * val);
- }
- else {
- accum_image[idx_p] = sum;
- }
-}
-
-ccl_device_inline void kernel_filter_nlm_construct_gramian(
- int x,
- int y,
- int dx,
- int dy,
- int t,
- const ccl_global float *ccl_restrict difference_image,
- const ccl_global float *ccl_restrict buffer,
- const ccl_global float *ccl_restrict transform,
- ccl_global int *rank,
- ccl_global float *XtWX,
- ccl_global float3 *XtWY,
- int4 rect,
- int4 filter_window,
- int stride,
- int f,
- int pass_stride,
- int frame_offset,
- bool use_time,
- int localIdx)
-{
- const int low = max(rect.x, x - f);
- const int high = min(rect.z, x + f + 1);
- float sum = 0.0f;
- for (int x1 = low; x1 < high; x1++) {
- sum += difference_image[y * stride + x1];
- }
- float weight = sum * (1.0f / (high - low));
-
- /* Reconstruction data is only stored for pixels inside the filter window,
- * so compute the pixel's index in there. */
- int storage_ofs = coord_to_local_index(filter_window, x, y);
- transform += storage_ofs;
- rank += storage_ofs;
- XtWX += storage_ofs;
- XtWY += storage_ofs;
-
- kernel_filter_construct_gramian(x,
- y,
- rect_size(filter_window),
- dx,
- dy,
- t,
- stride,
- pass_stride,
- frame_offset,
- use_time,
- buffer,
- transform,
- rank,
- weight,
- XtWX,
- XtWY,
- localIdx);
-}
-
-ccl_device_inline void kernel_filter_nlm_normalize(int x,
- int y,
- ccl_global float *out_image,
- const ccl_global float *ccl_restrict
- accum_image,
- int stride)
-{
- out_image[y * stride + x] /= accum_image[y * stride + x];
-}
-
-CCL_NAMESPACE_END
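
Unlike the CPU variant, the GPU kernel runs one thread per pixel per (dx, dy) offset (see get_nlm_coords_window above), so several threads accumulate into the same output pixel and kernel_filter_nlm_update_output has to use atomic_add_and_fetch_float. Where hardware float atomics are unavailable, such a helper is commonly built from a compare-and-swap loop on the raw bits; a hedged sketch of that general pattern, not the implementation Cycles actually uses:

#include <atomic>
#include <cstdint>
#include <cstring>

/* Illustrative lock-free float add via compare-and-swap on the bit pattern. */
static float atomic_add_float_sketch(std::atomic<uint32_t> *dest, float value)
{
  uint32_t old_bits = dest->load();
  for (;;) {
    float old_val, new_val;
    std::memcpy(&old_val, &old_bits, sizeof(float));
    new_val = old_val + value;
    uint32_t new_bits;
    std::memcpy(&new_bits, &new_val, sizeof(float));
    /* On failure, old_bits is reloaded with the current value and we retry. */
    if (dest->compare_exchange_weak(old_bits, new_bits))
      return new_val;
  }
}
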
diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h
deleted file mode 100644
index 97cecba190e..00000000000
--- a/intern/cycles/kernel/filter/filter_prefilter.h
+++ /dev/null
@@ -1,303 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/**
- * First step of the shadow prefiltering: performs the shadow division and stores all data
- * in a simple rectangular array that can be passed to the NLM filter.
- *
- * Calculates:
- * \param unfiltered: Contains the two half images of the shadow feature pass
- * \param sampleVariance: The sample-based variance calculated in the kernel.
- * Note: This calculation is biased in general,
- * and especially here since the variance of the ratio can only be approximated.
- * \param sampleVarianceV: Variance of the sample variance estimation, quite noisy
- * (since it's essentially the buffer variance of the two variance halves)
- * \param bufferVariance: The buffer-based variance of the shadow feature.
- * Unbiased, but quite noisy.
- */
-ccl_device void kernel_filter_divide_shadow(int sample,
- CCL_FILTER_TILE_INFO,
- int x,
- int y,
- ccl_global float *unfilteredA,
- ccl_global float *unfilteredB,
- ccl_global float *sampleVariance,
- ccl_global float *sampleVarianceV,
- ccl_global float *bufferVariance,
- int4 rect,
- int buffer_pass_stride,
- int buffer_denoising_offset)
-{
- int xtile = (x < tile_info->x[1]) ? 0 : ((x < tile_info->x[2]) ? 1 : 2);
- int ytile = (y < tile_info->y[1]) ? 0 : ((y < tile_info->y[2]) ? 1 : 2);
- int tile = ytile * 3 + xtile;
-
- int offset = tile_info->offsets[tile];
- int stride = tile_info->strides[tile];
- const ccl_global float *ccl_restrict center_buffer = (ccl_global float *)ccl_get_tile_buffer(
- tile);
- center_buffer += (y * stride + x + offset) * buffer_pass_stride;
- center_buffer += buffer_denoising_offset + 14;
-
- int buffer_w = align_up(rect.z - rect.x, 4);
- int idx = (y - rect.y) * buffer_w + (x - rect.x);
- unfilteredA[idx] = center_buffer[1] / max(center_buffer[0], 1e-7f);
- unfilteredB[idx] = center_buffer[4] / max(center_buffer[3], 1e-7f);
-
- float varA = center_buffer[2];
- float varB = center_buffer[5];
- int odd_sample = (sample + 1) / 2;
- int even_sample = sample / 2;
-
- /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance
- * update does not work efficiently with atomics in the kernel. */
- varA = max(0.0f, varA - unfilteredA[idx] * unfilteredA[idx] * odd_sample);
- varB = max(0.0f, varB - unfilteredB[idx] * unfilteredB[idx] * even_sample);
-
- varA /= max(odd_sample - 1, 1);
- varB /= max(even_sample - 1, 1);
-
- sampleVariance[idx] = 0.5f * (varA + varB) / sample;
- sampleVarianceV[idx] = 0.5f * (varA - varB) * (varA - varB) / (sample * sample);
- bufferVariance[idx] = 0.5f * (unfilteredA[idx] - unfilteredB[idx]) *
- (unfilteredA[idx] - unfilteredB[idx]);
-}
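
The variance approximation spelled out in the comment above is the usual two-pass identity Var(x) = (sum of x_i^2 - N * mean^2) / (N - 1): the render buffer accumulates the sum of squares per half, the mean is already available as the normalized half image, so the kernel can recover the variance without online (Welford-style) updates, which would need atomics. A tiny standalone check of the identity with made-up sample values:

#include <cstdio>

int main()
{
  const float x[4] = {0.2f, 0.5f, 0.4f, 0.3f};
  const int n = 4;

  float sum = 0.0f, sum_sq = 0.0f;
  for (int i = 0; i < n; i++) {
    sum += x[i];
    sum_sq += x[i] * x[i];
  }
  const float mean = sum / n;
  /* Same form as the kernel: (sum of squares - n * mean^2) / (n - 1). */
  const float var = (sum_sq - n * mean * mean) / (n - 1);
  printf("mean = %f, variance = %f\n", mean, var); /* ~0.35 and ~0.0167 */
  return 0;
}
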
-
-/* Load a regular feature from the render buffers into the denoise buffer.
- * Parameters:
- * - sample: The sample amount in the buffer, used to normalize the buffer.
- * - m_offset, v_offset: Render Buffer Pass offsets of mean and variance of the feature.
- * - x, y: Current pixel
- * - mean, variance: Target denoise buffers.
- * - rect: The prefilter area (lower pixels inclusive, upper pixels exclusive).
- */
-ccl_device void kernel_filter_get_feature(int sample,
- CCL_FILTER_TILE_INFO,
- int m_offset,
- int v_offset,
- int x,
- int y,
- ccl_global float *mean,
- ccl_global float *variance,
- float scale,
- int4 rect,
- int buffer_pass_stride,
- int buffer_denoising_offset)
-{
- int xtile = (x < tile_info->x[1]) ? 0 : ((x < tile_info->x[2]) ? 1 : 2);
- int ytile = (y < tile_info->y[1]) ? 0 : ((y < tile_info->y[2]) ? 1 : 2);
- int tile = ytile * 3 + xtile;
- ccl_global float *center_buffer = ((ccl_global float *)ccl_get_tile_buffer(tile)) +
- (tile_info->offsets[tile] + y * tile_info->strides[tile] + x) *
- buffer_pass_stride +
- buffer_denoising_offset;
-
- int buffer_w = align_up(rect.z - rect.x, 4);
- int idx = (y - rect.y) * buffer_w + (x - rect.x);
-
- float val = scale * center_buffer[m_offset];
- mean[idx] = val;
-
- if (v_offset >= 0) {
- if (sample > 1) {
- /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance
- * update does not work efficiently with atomics in the kernel. */
- variance[idx] = max(
- 0.0f, (center_buffer[v_offset] - val * val * sample) / (sample * (sample - 1)));
- }
- else {
- /* Can't compute variance with single sample, just set it very high. */
- variance[idx] = 1e10f;
- }
- }
-}
-
-ccl_device void kernel_filter_write_feature(int sample,
- int x,
- int y,
- int4 buffer_params,
- ccl_global float *from,
- ccl_global float *buffer,
- int out_offset,
- int4 rect)
-{
- ccl_global float *combined_buffer = buffer + (y * buffer_params.y + x + buffer_params.x) *
- buffer_params.z;
-
- int buffer_w = align_up(rect.z - rect.x, 4);
- int idx = (y - rect.y) * buffer_w + (x - rect.x);
-
- combined_buffer[out_offset] = from[idx];
-}
-
-#define GET_COLOR(image) \
- make_float3(image[idx], image[idx + pass_stride], image[idx + 2 * pass_stride])
-#define SET_COLOR(image, color) \
- image[idx] = color.x; \
- image[idx + pass_stride] = color.y; \
- image[idx + 2 * pass_stride] = color.z
-
-ccl_device void kernel_filter_detect_outliers(int x,
- int y,
- ccl_global float *in,
- ccl_global float *variance_out,
- ccl_global float *depth,
- ccl_global float *image_out,
- int4 rect,
- int pass_stride)
-{
- int buffer_w = align_up(rect.z - rect.x, 4);
-
- ccl_global float *image_in = in;
- ccl_global float *variance_in = in + 3 * pass_stride;
-
- int n = 0;
- float values[25];
- float pixel_variance, max_variance = 0.0f;
- for (int y1 = max(y - 2, rect.y); y1 < min(y + 3, rect.w); y1++) {
- for (int x1 = max(x - 2, rect.x); x1 < min(x + 3, rect.z); x1++) {
- int idx = (y1 - rect.y) * buffer_w + (x1 - rect.x);
- float3 color = GET_COLOR(image_in);
- color = max(color, make_float3(0.0f, 0.0f, 0.0f));
- float L = average(color);
-
- /* Find the position of L. */
- int i;
- for (i = 0; i < n; i++) {
- if (values[i] > L)
- break;
- }
- /* Make space for L by shifting all following values to the right. */
- for (int j = n; j > i; j--) {
- values[j] = values[j - 1];
- }
- /* Insert L. */
- values[i] = L;
- n++;
-
- float3 pixel_var = GET_COLOR(variance_in);
- float var = average(pixel_var);
- if ((x1 == x) && (y1 == y)) {
- pixel_variance = (pixel_var.x < 0.0f || pixel_var.y < 0.0f || pixel_var.z < 0.0f) ? -1.0f :
- var;
- }
- else {
- max_variance = max(max_variance, var);
- }
- }
- }
-
- max_variance += 1e-4f;
-
- int idx = (y - rect.y) * buffer_w + (x - rect.x);
-
- float3 color = GET_COLOR(image_in);
- float3 variance = GET_COLOR(variance_in);
- color = max(color, make_float3(0.0f, 0.0f, 0.0f));
- variance = max(variance, make_float3(0.0f, 0.0f, 0.0f));
-
- float L = average(color);
-
- float ref = 2.0f * values[(int)(n * 0.75f)];
-
- /* Slightly offset values to avoid false positives in (almost) black areas. */
- max_variance += 1e-5f;
- ref -= 1e-5f;
-
- if (L > ref) {
- /* The pixel appears to be an outlier.
- * However, it may just be a legitimate highlight. Therefore, check how likely it is
- * that the pixel should actually be at the reference value: if the reference is within the
- * 3-sigma interval, the pixel is assumed to be a statistical outlier. Otherwise, it is very
- * unlikely that the pixel should be darker, which indicates a legitimate highlight.
- */
-
- if (pixel_variance < 0.0f || pixel_variance > 9.0f * max_variance) {
- depth[idx] = -depth[idx];
- color *= ref / L;
- variance = make_float3(max_variance, max_variance, max_variance);
- }
- else {
- float stddev = sqrtf(pixel_variance);
- if (L - 3 * stddev < ref) {
- /* The pixel is an outlier, so negate the depth value to mark it as one.
- * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM
- * weights. */
- depth[idx] = -depth[idx];
- float fac = ref / L;
- color *= fac;
- variance *= sqr(fac);
- }
- }
- }
-
- /* Apply log(1+x) transform to compress highlights and avoid halos in the denoised results.
- * Variance is transformed accordingly - the derivative of the transform is 1/(1+x), so we
- * scale by the square of that (since we have variance instead of standard deviation). */
- color = color_highlight_compress(color, &variance);
-
- SET_COLOR(image_out, color);
- SET_COLOR(variance_out, variance);
-}
-
-#undef GET_COLOR
-#undef SET_COLOR
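
The comment above states the transform exactly: colors are compressed with log(1 + x), and since the derivative of that transform is 1 / (1 + x), the variance is scaled by the square of it. A hedged per-channel sketch of what such a helper amounts to (color_highlight_compress itself lives elsewhere in the Cycles headers and is not reproduced here):

#include <cmath>

struct float3_sketch { float x, y, z; };

/* Illustrative per-channel log(1+x) compression with first-order variance propagation. */
static float3_sketch highlight_compress_sketch(float3_sketch color, float3_sketch *variance)
{
  const float3_sketch c = color;
  color.x = std::log1p(c.x);
  color.y = std::log1p(c.y);
  color.z = std::log1p(c.z);
  /* d/dx log(1+x) = 1/(1+x); variance scales with the square of the derivative. */
  variance->x /= (1.0f + c.x) * (1.0f + c.x);
  variance->y /= (1.0f + c.y) * (1.0f + c.y);
  variance->z /= (1.0f + c.z) * (1.0f + c.z);
  return color;
}
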
-
-/* Combine A/B buffers.
- * Calculates the combined mean and the buffer variance. */
-ccl_device void kernel_filter_combine_halves(int x,
- int y,
- ccl_global float *mean,
- ccl_global float *variance,
- ccl_global float *a,
- ccl_global float *b,
- int4 rect,
- int r)
-{
- int buffer_w = align_up(rect.z - rect.x, 4);
- int idx = (y - rect.y) * buffer_w + (x - rect.x);
-
- if (mean)
- mean[idx] = 0.5f * (a[idx] + b[idx]);
- if (variance) {
- if (r == 0)
- variance[idx] = 0.25f * (a[idx] - b[idx]) * (a[idx] - b[idx]);
- else {
- variance[idx] = 0.0f;
- float values[25];
- int numValues = 0;
- for (int py = max(y - r, rect.y); py < min(y + r + 1, rect.w); py++) {
- for (int px = max(x - r, rect.x); px < min(x + r + 1, rect.z); px++) {
- int pidx = (py - rect.y) * buffer_w + (px - rect.x);
- values[numValues++] = 0.25f * (a[pidx] - b[pidx]) * (a[pidx] - b[pidx]);
- }
- }
- /* Insertion-sort the variances (fast enough for 25 elements). */
- for (int i = 1; i < numValues; i++) {
- float v = values[i];
- int j;
- for (j = i - 1; j >= 0 && values[j] > v; j--)
- values[j + 1] = values[j];
- values[j + 1] = v;
- }
- variance[idx] = values[(7 * numValues) / 8];
- }
- }
-}
-
-CCL_NAMESPACE_END
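
When r > 0, kernel_filter_combine_halves does not use the noisy per-pixel squared half-difference directly: it gathers the squared differences from the surrounding window, insertion-sorts them, and keeps a high (7/8) order statistic as a more robust variance estimate. Outside a kernel the same selection is easiest with std::nth_element; a small sketch with made-up values:

#include <algorithm>
#include <cstdio>
#include <vector>

int main()
{
  /* Squared half-buffer differences from a hypothetical 3x3 window. */
  std::vector<float> values = {0.02f, 0.01f, 0.30f, 0.05f, 0.04f, 0.03f, 0.02f, 0.01f, 0.06f};
  const size_t k = (7 * values.size()) / 8; /* index 7 of 9 */
  std::nth_element(values.begin(), values.begin() + k, values.end());
  printf("7/8 quantile variance estimate: %f\n", values[k]); /* 0.06 here */
  return 0;
}
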
diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h
deleted file mode 100644
index 17941689ad5..00000000000
--- a/intern/cycles/kernel/filter/filter_reconstruction.h
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device_inline void kernel_filter_construct_gramian(int x,
- int y,
- int storage_stride,
- int dx,
- int dy,
- int t,
- int buffer_stride,
- int pass_stride,
- int frame_offset,
- bool use_time,
- const ccl_global float *ccl_restrict buffer,
- const ccl_global float *ccl_restrict
- transform,
- ccl_global int *rank,
- float weight,
- ccl_global float *XtWX,
- ccl_global float3 *XtWY,
- int localIdx)
-{
- if (weight < 1e-3f) {
- return;
- }
-
- int p_offset = y * buffer_stride + x;
- int q_offset = (y + dy) * buffer_stride + (x + dx) + frame_offset;
-
-#ifdef __KERNEL_GPU__
- const int stride = storage_stride;
-#else
- const int stride = 1;
- (void)storage_stride;
-#endif
-
-#ifdef __KERNEL_CUDA__
- ccl_local float shared_design_row[(DENOISE_FEATURES + 1) * CCL_MAX_LOCAL_SIZE];
- ccl_local_param float *design_row = shared_design_row + localIdx * (DENOISE_FEATURES + 1);
-#else
- float design_row[DENOISE_FEATURES + 1];
-#endif
-
- float3 q_color = filter_get_color(buffer + q_offset, pass_stride);
-
- /* If the pixel was flagged as an outlier during prefiltering, skip it. */
- if (ccl_get_feature(buffer + q_offset, 0) < 0.0f) {
- return;
- }
-
- filter_get_design_row_transform(make_int3(x, y, t),
- buffer + p_offset,
- make_int3(x + dx, y + dy, t),
- buffer + q_offset,
- pass_stride,
- *rank,
- design_row,
- transform,
- stride,
- use_time);
-
-#ifdef __KERNEL_GPU__
- math_trimatrix_add_gramian_strided(XtWX, (*rank) + 1, design_row, weight, stride);
- math_vec3_add_strided(XtWY, (*rank) + 1, design_row, weight * q_color, stride);
-#else
- math_trimatrix_add_gramian(XtWX, (*rank) + 1, design_row, weight);
- math_vec3_add(XtWY, (*rank) + 1, design_row, weight * q_color);
-#endif
-}
-
-ccl_device_inline void kernel_filter_finalize(int x,
- int y,
- ccl_global float *buffer,
- ccl_global int *rank,
- int storage_stride,
- ccl_global float *XtWX,
- ccl_global float3 *XtWY,
- int4 buffer_params,
- int sample)
-{
-#ifdef __KERNEL_GPU__
- const int stride = storage_stride;
-#else
- const int stride = 1;
- (void)storage_stride;
-#endif
-
- if (XtWX[0] < 1e-3f) {
- /* There is not enough information to determine a denoised result.
- * As a fallback, keep the original value of the pixel. */
- return;
- }
-
- /* The weighted average of pixel colors (essentially, the NLM-filtered image).
- * In case the solution of the linear model fails due to numerical issues or
- * returns nonsensical negative values, fall back to this value. */
- float3 mean_color = XtWY[0] / XtWX[0];
-
- math_trimatrix_vec3_solve(XtWX, XtWY, (*rank) + 1, stride);
-
- float3 final_color = XtWY[0];
- if (!isfinite3_safe(final_color) ||
- (final_color.x < -0.01f || final_color.y < -0.01f || final_color.z < -0.01f)) {
- final_color = mean_color;
- }
-
- /* Clamp pixel value to positive values and reverse the highlight compression transform. */
- final_color = color_highlight_uncompress(max(final_color, make_float3(0.0f, 0.0f, 0.0f)));
-
- ccl_global float *combined_buffer = buffer + (y * buffer_params.y + x + buffer_params.x) *
- buffer_params.z;
- if (buffer_params.w >= 0) {
- final_color *= sample;
- if (buffer_params.w > 0) {
- final_color.x += combined_buffer[buffer_params.w + 0];
- final_color.y += combined_buffer[buffer_params.w + 1];
- final_color.z += combined_buffer[buffer_params.w + 2];
- }
- }
- combined_buffer[0] = final_color.x;
- combined_buffer[1] = final_color.y;
- combined_buffer[2] = final_color.z;
-}
-
-CCL_NAMESPACE_END
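
The construct/finalize pair above performs a weighted linear regression per pixel: every neighbor q contributes w_q * x_q * x_q^T to XtWX and w_q * x_q * color_q to XtWY, where x_q is the design row built from the (transformed) features, and solving XtWX * beta = XtWY yields coefficients whose constant term is written out as the denoised color, with the weighted mean color kept as a fallback. A minimal dense sketch for a scalar color and a two-entry design row (constant plus one feature), solved with the closed 2x2 form instead of the kernel's triangular-matrix solver; all values are made up:

#include <cstdio>

int main()
{
  /* Hypothetical neighbors: feature value f, color c, NLM weight w. */
  const float f[4] = {-1.0f, 0.0f, 0.5f, 1.0f};
  const float c[4] = {0.8f, 1.0f, 1.1f, 1.3f};
  const float w[4] = {0.5f, 1.0f, 0.9f, 0.4f};

  /* Accumulate XtWX (symmetric 2x2) and XtWY for design row x = (1, f). */
  float a00 = 0, a01 = 0, a11 = 0, b0 = 0, b1 = 0;
  for (int q = 0; q < 4; q++) {
    a00 += w[q];
    a01 += w[q] * f[q];
    a11 += w[q] * f[q] * f[q];
    b0 += w[q] * c[q];
    b1 += w[q] * f[q] * c[q];
  }

  /* Solve the 2x2 system; beta0 plays the role of XtWY[0] after the solve. */
  const float det = a00 * a11 - a01 * a01;
  const float beta0 = (a11 * b0 - a01 * b1) / det;
  const float beta1 = (a00 * b1 - a01 * b0) / det;
  printf("denoised value = %f (slope %f)\n", beta0, beta1);
  return 0;
}
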
diff --git a/intern/cycles/kernel/filter/filter_transform.h b/intern/cycles/kernel/filter/filter_transform.h
deleted file mode 100644
index 880a661214e..00000000000
--- a/intern/cycles/kernel/filter/filter_transform.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer,
- CCL_FILTER_TILE_INFO,
- int x,
- int y,
- int4 rect,
- int pass_stride,
- int frame_stride,
- bool use_time,
- float *transform,
- int *rank,
- int radius,
- float pca_threshold)
-{
- int buffer_w = align_up(rect.z - rect.x, 4);
-
- float features[DENOISE_FEATURES];
-
- const float *ccl_restrict pixel_buffer;
- int3 pixel;
-
- int num_features = use_time ? 11 : 10;
-
- /* === Calculate denoising window. === */
- int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius));
- int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1));
- int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames;
-
- /* === Shift feature passes to have mean 0. === */
- float feature_means[DENOISE_FEATURES];
- math_vector_zero(feature_means, num_features);
- FOR_PIXEL_WINDOW
- {
- filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride);
- math_vector_add(feature_means, features, num_features);
- }
- END_FOR_PIXEL_WINDOW
-
- math_vector_scale(feature_means, 1.0f / num_pixels, num_features);
-
- /* === Scale the shifted feature passes to a range of [-1; 1] ===
- * Will be baked into the transform later. */
- float feature_scale[DENOISE_FEATURES];
- math_vector_zero(feature_scale, num_features);
-
- FOR_PIXEL_WINDOW
- {
- filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
- math_vector_max(feature_scale, features, num_features);
- }
- END_FOR_PIXEL_WINDOW
-
- filter_calculate_scale(feature_scale, use_time);
-
- /* === Generate the feature transformation. ===
- * This transformation maps the num_features-dimensional feature space to a reduced feature
- * (r-feature) space which generally has fewer dimensions.
- * This mainly helps to prevent over-fitting. */
- float feature_matrix[DENOISE_FEATURES * DENOISE_FEATURES];
- math_matrix_zero(feature_matrix, num_features);
- FOR_PIXEL_WINDOW
- {
- filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
- math_vector_mul(features, feature_scale, num_features);
- math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f);
- }
- END_FOR_PIXEL_WINDOW
-
- math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1);
- *rank = 0;
- /* Prevent over-fitting when a small window is used. */
- int max_rank = min(num_features, num_pixels / 3);
- if (pca_threshold < 0.0f) {
- float threshold_energy = 0.0f;
- for (int i = 0; i < num_features; i++) {
- threshold_energy += feature_matrix[i * num_features + i];
- }
- threshold_energy *= 1.0f - (-pca_threshold);
-
- float reduced_energy = 0.0f;
- for (int i = 0; i < max_rank; i++, (*rank)++) {
- if (i >= 2 && reduced_energy >= threshold_energy)
- break;
- float s = feature_matrix[i * num_features + i];
- reduced_energy += s;
- }
- }
- else {
- for (int i = 0; i < max_rank; i++, (*rank)++) {
- float s = feature_matrix[i * num_features + i];
- if (i >= 2 && sqrtf(s) < pca_threshold)
- break;
- }
- }
-
- /* Bake the feature scaling into the transformation matrix. */
- for (int i = 0; i < (*rank); i++) {
- math_vector_mul(transform + i * num_features, feature_scale, num_features);
- }
- math_matrix_transpose(transform, num_features, 1);
-}
-
-CCL_NAMESPACE_END
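
kernel_filter_construct_transform is a per-pixel PCA of the denoising features: the window's features are mean-centered, scaled toward [-1, 1], accumulated into a Gramian matrix, eigendecomposed, and only the leading eigenvectors are kept. The cutoff is driven by pca_threshold: negative values select by retained energy, positive values compare sqrt(eigenvalue) against the threshold, and in both cases at least two dimensions are kept and the rank is capped by the pixel count. A toy two-feature version of the positive-threshold rule, using the closed-form eigenvalues of a symmetric 2x2 matrix (made-up numbers; the at-least-two guard is omitted since there are only two features here):

#include <cmath>
#include <cstdio>

int main()
{
  /* Gramian of two already centered and scaled features. */
  const float g00 = 4.0f, g01 = 1.0f, g11 = 0.5f;

  /* Closed-form eigenvalues of the symmetric 2x2 matrix. */
  const float mean = 0.5f * (g00 + g11);
  const float d = std::sqrt(0.25f * (g00 - g11) * (g00 - g11) + g01 * g01);
  const float eig[2] = {mean + d, mean - d};

  /* Positive pca_threshold: keep eigenvectors whose sqrt(eigenvalue) clears it. */
  const float pca_threshold = 0.8f;
  int rank = 0;
  for (int i = 0; i < 2; i++) {
    if (std::sqrt(eig[i]) < pca_threshold)
      break;
    rank++;
  }
  printf("eigenvalues %.3f %.3f -> rank %d\n", eig[0], eig[1], rank); /* rank 1 */
  return 0;
}
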
diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h
deleted file mode 100644
index ec258a5212a..00000000000
--- a/intern/cycles/kernel/filter/filter_transform_gpu.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_restrict buffer,
- CCL_FILTER_TILE_INFO,
- int x,
- int y,
- int4 rect,
- int pass_stride,
- int frame_stride,
- bool use_time,
- ccl_global float *transform,
- ccl_global int *rank,
- int radius,
- float pca_threshold,
- int transform_stride,
- int localIdx)
-{
- int buffer_w = align_up(rect.z - rect.x, 4);
-
-#ifdef __KERNEL_CUDA__
- ccl_local float shared_features[DENOISE_FEATURES * CCL_MAX_LOCAL_SIZE];
- ccl_local_param float *features = shared_features + localIdx * DENOISE_FEATURES;
-#else
- float features[DENOISE_FEATURES];
-#endif
-
- int num_features = use_time ? 11 : 10;
-
- /* === Calculate denoising window. === */
- int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius));
- int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1));
- int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames;
- const ccl_global float *ccl_restrict pixel_buffer;
- int3 pixel;
-
- /* === Shift feature passes to have mean 0. === */
- float feature_means[DENOISE_FEATURES];
- math_vector_zero(feature_means, num_features);
- FOR_PIXEL_WINDOW
- {
- filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride);
- math_vector_add(feature_means, features, num_features);
- }
- END_FOR_PIXEL_WINDOW
-
- math_vector_scale(feature_means, 1.0f / num_pixels, num_features);
-
- /* === Scale the shifted feature passes to a range of [-1; 1] ===
- * Will be baked into the transform later. */
- float feature_scale[DENOISE_FEATURES];
- math_vector_zero(feature_scale, num_features);
-
- FOR_PIXEL_WINDOW
- {
- filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
- math_vector_max(feature_scale, features, num_features);
- }
- END_FOR_PIXEL_WINDOW
-
- filter_calculate_scale(feature_scale, use_time);
-
- /* === Generate the feature transformation. ===
- * This transformation maps the num_features-dimensional feature space to a reduced feature
- * (r-feature) space which generally has fewer dimensions.
- * This mainly helps to prevent over-fitting. */
- float feature_matrix[DENOISE_FEATURES * DENOISE_FEATURES];
- math_matrix_zero(feature_matrix, num_features);
- FOR_PIXEL_WINDOW
- {
- filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
- math_vector_mul(features, feature_scale, num_features);
- math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f);
- }
- END_FOR_PIXEL_WINDOW
-
- math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, transform_stride);
- *rank = 0;
- /* Prevent over-fitting when a small window is used. */
- int max_rank = min(num_features, num_pixels / 3);
- if (pca_threshold < 0.0f) {
- float threshold_energy = 0.0f;
- for (int i = 0; i < num_features; i++) {
- threshold_energy += feature_matrix[i * num_features + i];
- }
- threshold_energy *= 1.0f - (-pca_threshold);
-
- float reduced_energy = 0.0f;
- for (int i = 0; i < max_rank; i++, (*rank)++) {
- if (i >= 2 && reduced_energy >= threshold_energy)
- break;
- float s = feature_matrix[i * num_features + i];
- reduced_energy += s;
- }
- }
- else {
- for (int i = 0; i < max_rank; i++, (*rank)++) {
- float s = feature_matrix[i * num_features + i];
- if (i >= 2 && sqrtf(s) < pca_threshold)
- break;
- }
- }
-
- math_matrix_transpose(transform, num_features, transform_stride);
-
- /* Bake the feature scaling into the transformation matrix. */
- for (int i = 0; i < num_features; i++) {
- for (int j = 0; j < (*rank); j++) {
- transform[(i * num_features + j) * transform_stride] *= feature_scale[i];
- }
- }
-}
-
-CCL_NAMESPACE_END
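
The GPU variant differs from the CPU one mainly in its storage layout: per-pixel outputs (transform, rank, and later XtWX/XtWY) are interleaved across all pixels of the filter window, so consecutive elements belonging to one pixel sit transform_stride (or storage_stride) apart rather than adjacent, presumably so that neighboring threads touch adjacent memory. A small structure-of-arrays indexing sketch under that assumption:

#include <vector>

/* Strided (structure-of-arrays) storage sketch: n_pixels pixels each own
 * n_elems floats, and element j of pixel p lives at data[p + j * n_pixels].
 * With n_pixels == 1 this degenerates to the contiguous CPU layout. */
inline float soa_element(const std::vector<float> &data, int p, int j, int n_pixels)
{
  return data[p + j * n_pixels];
}
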
diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h
deleted file mode 100644
index 0304d990f9f..00000000000
--- a/intern/cycles/kernel/filter/filter_transform_sse.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer,
- CCL_FILTER_TILE_INFO,
- int x,
- int y,
- int4 rect,
- int pass_stride,
- int frame_stride,
- bool use_time,
- float *transform,
- int *rank,
- int radius,
- float pca_threshold)
-{
- int buffer_w = align_up(rect.z - rect.x, 4);
-
- float4 features[DENOISE_FEATURES];
- const float *ccl_restrict pixel_buffer;
- int3 pixel;
-
- int num_features = use_time ? 11 : 10;
-
- /* === Calculate denoising window. === */
- int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius));
- int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1));
- int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames;
-
- /* === Shift feature passes to have mean 0. === */
- float4 feature_means[DENOISE_FEATURES];
- math_vector_zero_sse(feature_means, num_features);
- FOR_PIXEL_WINDOW_SSE
- {
- filter_get_features_sse(
- x4, y4, t4, active_pixels, pixel_buffer, features, use_time, NULL, pass_stride);
- math_vector_add_sse(feature_means, num_features, features);
- }
- END_FOR_PIXEL_WINDOW_SSE
-
- float4 pixel_scale = make_float4(1.0f / num_pixels);
- for (int i = 0; i < num_features; i++) {
- feature_means[i] = reduce_add(feature_means[i]) * pixel_scale;
- }
-
- /* === Scale the shifted feature passes to a range of [-1; 1] ===
- * Will be baked into the transform later. */
- float4 feature_scale[DENOISE_FEATURES];
- math_vector_zero_sse(feature_scale, num_features);
- FOR_PIXEL_WINDOW_SSE
- {
- filter_get_feature_scales_sse(
- x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride);
- math_vector_max_sse(feature_scale, features, num_features);
- }
- END_FOR_PIXEL_WINDOW_SSE
-
- filter_calculate_scale_sse(feature_scale, use_time);
-
- /* === Generate the feature transformation. ===
- * This transformation maps the num_features-dimensional feature space to a reduced feature
- * (r-feature) space which generally has fewer dimensions.
- * This mainly helps to prevent over-fitting. */
- float4 feature_matrix_sse[DENOISE_FEATURES * DENOISE_FEATURES];
- math_matrix_zero_sse(feature_matrix_sse, num_features);
- FOR_PIXEL_WINDOW_SSE
- {
- filter_get_features_sse(
- x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride);
- math_vector_mul_sse(features, num_features, feature_scale);
- math_matrix_add_gramian_sse(feature_matrix_sse, num_features, features, make_float4(1.0f));
- }
- END_FOR_PIXEL_WINDOW_SSE
-
- float feature_matrix[DENOISE_FEATURES * DENOISE_FEATURES];
- math_matrix_hsum(feature_matrix, num_features, feature_matrix_sse);
-
- math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1);
-
- *rank = 0;
- /* Prevent over-fitting when a small window is used. */
- int max_rank = min(num_features, num_pixels / 3);
- if (pca_threshold < 0.0f) {
- float threshold_energy = 0.0f;
- for (int i = 0; i < num_features; i++) {
- threshold_energy += feature_matrix[i * num_features + i];
- }
- threshold_energy *= 1.0f - (-pca_threshold);
-
- float reduced_energy = 0.0f;
- for (int i = 0; i < max_rank; i++, (*rank)++) {
- if (i >= 2 && reduced_energy >= threshold_energy)
- break;
- float s = feature_matrix[i * num_features + i];
- reduced_energy += s;
- }
- }
- else {
- for (int i = 0; i < max_rank; i++, (*rank)++) {
- float s = feature_matrix[i * num_features + i];
- if (i >= 2 && sqrtf(s) < pca_threshold)
- break;
- }
- }
-
- math_matrix_transpose(transform, num_features, 1);
-
- /* Bake the feature scaling into the transformation matrix. */
- for (int i = 0; i < num_features; i++) {
- math_vector_scale(transform + i * num_features, feature_scale[i][0], *rank);
- }
-}
-
-CCL_NAMESPACE_END
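
The SSE variant accumulates four pixels per iteration into float4 registers and only reduces to scalars at the end (reduce_add, math_matrix_hsum). That reduction is a horizontal add; a hedged standalone sketch with raw SSE intrinsics (in the kernel, reduce_add keeps working in float4, while this sketch just returns the scalar sum):

#include <xmmintrin.h>

/* Horizontal sum of the four lanes of an SSE register. */
static inline float hsum_ps(__m128 v)
{
  __m128 shuf = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1)); /* swap within pairs */
  __m128 sums = _mm_add_ps(v, shuf);                           /* (0+1, 1+0, 2+3, 3+2) */
  shuf = _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(1, 0, 3, 2));  /* swap halves */
  sums = _mm_add_ps(sums, shuf);
  return _mm_cvtss_f32(sums);                                  /* 0+1+2+3 */
}
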
diff --git a/intern/cycles/kernel/geom/geom.h b/intern/cycles/kernel/geom/geom.h
index 5ff4d5f7053..4de824cc277 100644
--- a/intern/cycles/kernel/geom/geom.h
+++ b/intern/cycles/kernel/geom/geom.h
@@ -14,6 +14,8 @@
* limitations under the License.
*/
+#pragma once
+
// clang-format off
#include "kernel/geom/geom_attribute.h"
#include "kernel/geom/geom_object.h"
@@ -31,4 +33,5 @@
#include "kernel/geom/geom_curve_intersect.h"
#include "kernel/geom/geom_volume.h"
#include "kernel/geom/geom_primitive.h"
+#include "kernel/geom/geom_shader_data.h"
// clang-format on
diff --git a/intern/cycles/kernel/geom/geom_attribute.h b/intern/cycles/kernel/geom/geom_attribute.h
index b37797ac21b..9532a21fec7 100644
--- a/intern/cycles/kernel/geom/geom_attribute.h
+++ b/intern/cycles/kernel/geom/geom_attribute.h
@@ -14,6 +14,8 @@
* limitations under the License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Attributes
@@ -25,9 +27,9 @@ CCL_NAMESPACE_BEGIN
* Lookup of attributes is different between OSL and SVM, as OSL is ustring
* based while for SVM we use integer ids. */
-ccl_device_inline uint subd_triangle_patch(KernelGlobals *kg, const ShaderData *sd);
+ccl_device_inline uint subd_triangle_patch(const KernelGlobals *kg, const ShaderData *sd);
-ccl_device_inline uint attribute_primitive_type(KernelGlobals *kg, const ShaderData *sd)
+ccl_device_inline uint attribute_primitive_type(const KernelGlobals *kg, const ShaderData *sd)
{
if ((sd->type & PRIMITIVE_ALL_TRIANGLE) && subd_triangle_patch(kg, sd) != ~0) {
return ATTR_PRIM_SUBD;
@@ -46,12 +48,12 @@ ccl_device_inline AttributeDescriptor attribute_not_found()
/* Find attribute based on ID */
-ccl_device_inline uint object_attribute_map_offset(KernelGlobals *kg, int object)
+ccl_device_inline uint object_attribute_map_offset(const KernelGlobals *kg, int object)
{
return kernel_tex_fetch(__objects, object).attribute_map_offset;
}
-ccl_device_inline AttributeDescriptor find_attribute(KernelGlobals *kg,
+ccl_device_inline AttributeDescriptor find_attribute(const KernelGlobals *kg,
const ShaderData *sd,
uint id)
{
@@ -98,7 +100,7 @@ ccl_device_inline AttributeDescriptor find_attribute(KernelGlobals *kg,
/* Transform matrix attribute on meshes */
-ccl_device Transform primitive_attribute_matrix(KernelGlobals *kg,
+ccl_device Transform primitive_attribute_matrix(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc)
{
diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h
index b5a62a31ca9..a827a67ce7a 100644
--- a/intern/cycles/kernel/geom/geom_curve.h
+++ b/intern/cycles/kernel/geom/geom_curve.h
@@ -12,6 +12,8 @@
* limitations under the License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Curve Primitive
@@ -25,8 +27,11 @@ CCL_NAMESPACE_BEGIN
/* Reading attributes on various curve elements */
-ccl_device float curve_attribute_float(
- KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy)
+ccl_device float curve_attribute_float(const KernelGlobals *kg,
+ const ShaderData *sd,
+ const AttributeDescriptor desc,
+ float *dx,
+ float *dy)
{
if (desc.element & (ATTR_ELEMENT_CURVE_KEY | ATTR_ELEMENT_CURVE_KEY_MOTION)) {
float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
@@ -64,7 +69,7 @@ ccl_device float curve_attribute_float(
}
}
-ccl_device float2 curve_attribute_float2(KernelGlobals *kg,
+ccl_device float2 curve_attribute_float2(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float2 *dx,
@@ -110,7 +115,7 @@ ccl_device float2 curve_attribute_float2(KernelGlobals *kg,
}
}
-ccl_device float3 curve_attribute_float3(KernelGlobals *kg,
+ccl_device float3 curve_attribute_float3(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float3 *dx,
@@ -152,7 +157,7 @@ ccl_device float3 curve_attribute_float3(KernelGlobals *kg,
}
}
-ccl_device float4 curve_attribute_float4(KernelGlobals *kg,
+ccl_device float4 curve_attribute_float4(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float4 *dx,
@@ -196,7 +201,7 @@ ccl_device float4 curve_attribute_float4(KernelGlobals *kg,
/* Curve thickness */
-ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd)
+ccl_device float curve_thickness(const KernelGlobals *kg, const ShaderData *sd)
{
float r = 0.0f;
@@ -224,7 +229,7 @@ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd)
/* Curve location for motion pass, linear interpolation between keys and
* ignoring radius because we do the same for the motion keys */
-ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 curve_motion_center_location(const KernelGlobals *kg, const ShaderData *sd)
{
float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
@@ -240,7 +245,7 @@ ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd
/* Curve tangent normal */
-ccl_device float3 curve_tangent_normal(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 curve_tangent_normal(const KernelGlobals *kg, const ShaderData *sd)
{
float3 tgN = make_float3(0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/geom/geom_curve_intersect.h b/intern/cycles/kernel/geom/geom_curve_intersect.h
index e25bf5b4660..213f3e62ee0 100644
--- a/intern/cycles/kernel/geom/geom_curve_intersect.h
+++ b/intern/cycles/kernel/geom/geom_curve_intersect.h
@@ -15,6 +15,8 @@
* limitations under the License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Curve primitive intersection functions.
@@ -167,6 +169,7 @@ ccl_device_inline float2 half_plane_intersect(const float3 P, const float3 N, co
}
ccl_device bool curve_intersect_iterative(const float3 ray_dir,
+ float *ray_tfar,
const float dt,
const float4 curve[4],
float u,
@@ -230,7 +233,7 @@ ccl_device bool curve_intersect_iterative(const float3 ray_dir,
if (fabsf(f) < f_err && fabsf(g) < g_err) {
t += dt;
- if (!(0.0f <= t && t <= isect->t)) {
+ if (!(0.0f <= t && t <= *ray_tfar)) {
return false; /* Rejects NaNs */
}
if (!(u >= 0.0f && u <= 1.0f)) {
@@ -247,6 +250,7 @@ ccl_device bool curve_intersect_iterative(const float3 ray_dir,
}
/* Record intersection. */
+ *ray_tfar = t;
isect->t = t;
isect->u = u;
isect->v = 0.0f;
@@ -259,6 +263,7 @@ ccl_device bool curve_intersect_iterative(const float3 ray_dir,
ccl_device bool curve_intersect_recursive(const float3 ray_orig,
const float3 ray_dir,
+ float ray_tfar,
float4 curve[4],
Intersection *isect)
{
@@ -339,7 +344,7 @@ ccl_device bool curve_intersect_recursive(const float3 ray_orig,
}
/* Intersect with cap-planes. */
- float2 tp = make_float2(-dt, isect->t - dt);
+ float2 tp = make_float2(-dt, ray_tfar - dt);
tp = make_float2(max(tp.x, tc_outer.x), min(tp.y, tc_outer.y));
const float2 h0 = half_plane_intersect(
float4_to_float3(P0), float4_to_float3(dP0du), ray_dir);
@@ -402,19 +407,19 @@ ccl_device bool curve_intersect_recursive(const float3 ray_orig,
CURVE_NUM_BEZIER_SUBDIVISIONS;
if (depth >= termDepth) {
found |= curve_intersect_iterative(
- ray_dir, dt, curve, u_outer0, tp0.x, use_backfacing, isect);
+ ray_dir, &ray_tfar, dt, curve, u_outer0, tp0.x, use_backfacing, isect);
}
else {
recurse = true;
}
}
- if (valid1 && (tp1.x + dt <= isect->t)) {
+ if (valid1 && (tp1.x + dt <= ray_tfar)) {
const int termDepth = unstable1 ? CURVE_NUM_BEZIER_SUBDIVISIONS_UNSTABLE :
CURVE_NUM_BEZIER_SUBDIVISIONS;
if (depth >= termDepth) {
found |= curve_intersect_iterative(
- ray_dir, dt, curve, u_outer1, tp1.y, use_backfacing, isect);
+ ray_dir, &ray_tfar, dt, curve, u_outer1, tp1.y, use_backfacing, isect);
}
else {
recurse = true;
@@ -542,7 +547,7 @@ ccl_device_inline float4 ribbon_to_ray_space(const float3 ray_space[3],
ccl_device_inline bool ribbon_intersect(const float3 ray_org,
const float3 ray_dir,
- const float ray_tfar,
+ float ray_tfar,
const int N,
float4 curve[4],
Intersection *isect)
@@ -590,7 +595,7 @@ ccl_device_inline bool ribbon_intersect(const float3 ray_org,
/* Intersect quad. */
float vu, vv, vt;
- bool valid0 = ribbon_intersect_quad(isect->t, lp0, lp1, up1, up0, &vu, &vv, &vt);
+ bool valid0 = ribbon_intersect_quad(ray_tfar, lp0, lp1, up1, up0, &vu, &vv, &vt);
if (valid0) {
/* ignore self intersections */
@@ -604,6 +609,7 @@ ccl_device_inline bool ribbon_intersect(const float3 ray_org,
vv = 2.0f * vv - 1.0f;
/* Record intersection. */
+ ray_tfar = vt;
isect->t = vt;
isect->u = u + vu * step_size;
isect->v = vv;
@@ -619,10 +625,11 @@ ccl_device_inline bool ribbon_intersect(const float3 ray_org,
return false;
}
-ccl_device_forceinline bool curve_intersect(KernelGlobals *kg,
+ccl_device_forceinline bool curve_intersect(const KernelGlobals *kg,
Intersection *isect,
const float3 P,
const float3 dir,
+ const float tmax,
uint visibility,
int object,
int curveAddr,
@@ -672,7 +679,7 @@ ccl_device_forceinline bool curve_intersect(KernelGlobals *kg,
if (type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) {
/* todo: adaptive number of subdivisions could help performance here. */
const int subdivisions = kernel_data.bvh.curve_subdivisions;
- if (ribbon_intersect(P, dir, isect->t, subdivisions, curve, isect)) {
+ if (ribbon_intersect(P, dir, tmax, subdivisions, curve, isect)) {
isect->prim = curveAddr;
isect->object = object;
isect->type = type;
@@ -682,7 +689,7 @@ ccl_device_forceinline bool curve_intersect(KernelGlobals *kg,
return false;
}
else {
- if (curve_intersect_recursive(P, dir, curve, isect)) {
+ if (curve_intersect_recursive(P, dir, tmax, curve, isect)) {
isect->prim = curveAddr;
isect->object = object;
isect->type = type;
@@ -693,28 +700,23 @@ ccl_device_forceinline bool curve_intersect(KernelGlobals *kg,
}
}
-ccl_device_inline void curve_shader_setup(KernelGlobals *kg,
+ccl_device_inline void curve_shader_setup(const KernelGlobals *kg,
ShaderData *sd,
- const Intersection *isect,
- const Ray *ray)
+ float3 P,
+ float3 D,
+ float t,
+ const int isect_object,
+ const int isect_prim)
{
- float t = isect->t;
- float3 P = ray->P;
- float3 D = ray->D;
-
- if (isect->object != OBJECT_NONE) {
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_itfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-# endif
+ if (isect_object != OBJECT_NONE) {
+ const Transform tfm = object_get_inverse_transform(kg, sd);
P = transform_point(&tfm, P);
D = transform_direction(&tfm, D * t);
D = normalize_len(D, &t);
}
- int prim = kernel_tex_fetch(__prim_index, isect->prim);
+ int prim = kernel_tex_fetch(__prim_index, isect_prim);
float4 v00 = kernel_tex_fetch(__curves, prim);
int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
@@ -735,23 +737,20 @@ ccl_device_inline void curve_shader_setup(KernelGlobals *kg,
motion_curve_keys(kg, sd->object, sd->prim, sd->time, ka, k0, k1, kb, P_curve);
}
- sd->u = isect->u;
-
P = P + D * t;
- const float4 dPdu4 = catmull_rom_basis_derivative(P_curve, isect->u);
+ const float4 dPdu4 = catmull_rom_basis_derivative(P_curve, sd->u);
const float3 dPdu = float4_to_float3(dPdu4);
if (sd->type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) {
/* Rounded smooth normals for ribbons, to approximate thick curve shape. */
const float3 tangent = normalize(dPdu);
const float3 bitangent = normalize(cross(tangent, -D));
- const float sine = isect->v;
+ const float sine = sd->v;
const float cosine = safe_sqrtf(1.0f - sine * sine);
sd->N = normalize(sine * bitangent - cosine * normalize(cross(tangent, bitangent)));
sd->Ng = -D;
- sd->v = isect->v;
# if 0
/* This approximates the position and geometric normal of a thick curve too,
@@ -765,7 +764,7 @@ ccl_device_inline void curve_shader_setup(KernelGlobals *kg,
/* Thick curves, compute normal using direction from inside the curve.
* This could be optimized by recording the normal in the intersection,
* however for Optix this would go beyond the size of the payload. */
- const float3 P_inside = float4_to_float3(catmull_rom_basis_eval(P_curve, isect->u));
+ const float3 P_inside = float4_to_float3(catmull_rom_basis_eval(P_curve, sd->u));
const float3 Ng = normalize(P - P_inside);
sd->N = Ng;
@@ -779,13 +778,8 @@ ccl_device_inline void curve_shader_setup(KernelGlobals *kg,
sd->dPdv = cross(dPdu, sd->Ng);
# endif
- if (isect->object != OBJECT_NONE) {
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_tfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-# endif
-
+ if (isect_object != OBJECT_NONE) {
+ const Transform tfm = object_get_transform(kg, sd);
P = transform_point(&tfm, P);
}
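
The substantive change in this file is that the curve intersection routines no longer read and shrink isect->t directly: the caller passes the current ray extent (tmax / ray_tfar) explicitly, and the routines tighten it alongside isect->t whenever a closer hit is recorded. A small self-contained illustration of that closest-hit pattern (names and values are illustrative, not the kernel's):

#include <cstdio>

struct Intersection { float t; int prim; };

/* Each accepted candidate tightens *ray_tfar so later, farther candidates are rejected. */
static bool intersect_candidate(float t, int prim, float *ray_tfar, Intersection *isect)
{
  if (!(0.0f <= t && t <= *ray_tfar))
    return false; /* also rejects NaNs, as in curve_intersect_iterative above */
  *ray_tfar = t;
  isect->t = t;
  isect->prim = prim;
  return true;
}

int main()
{
  Intersection isect = {0.0f, -1};
  float ray_tfar = 100.0f; /* initial tmax from the caller */
  intersect_candidate(40.0f, 0, &ray_tfar, &isect);
  intersect_candidate(75.0f, 1, &ray_tfar, &isect); /* rejected: beyond current tfar */
  intersect_candidate(12.5f, 2, &ray_tfar, &isect);
  printf("closest hit: prim %d at t = %f\n", isect.prim, isect.t); /* prim 2 at 12.5 */
  return 0;
}
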
diff --git a/intern/cycles/kernel/geom/geom_motion_curve.h b/intern/cycles/kernel/geom/geom_motion_curve.h
index 0f66f4af755..5294da03145 100644
--- a/intern/cycles/kernel/geom/geom_motion_curve.h
+++ b/intern/cycles/kernel/geom/geom_motion_curve.h
@@ -12,6 +12,8 @@
* limitations under the License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Motion Curve Primitive
@@ -25,7 +27,7 @@ CCL_NAMESPACE_BEGIN
#ifdef __HAIR__
-ccl_device_inline int find_attribute_curve_motion(KernelGlobals *kg,
+ccl_device_inline int find_attribute_curve_motion(const KernelGlobals *kg,
int object,
uint id,
AttributeElement *elem)
@@ -50,7 +52,7 @@ ccl_device_inline int find_attribute_curve_motion(KernelGlobals *kg,
return (attr_map.y == ATTR_ELEMENT_NONE) ? (int)ATTR_STD_NOT_FOUND : (int)attr_map.z;
}
-ccl_device_inline void motion_curve_keys_for_step_linear(KernelGlobals *kg,
+ccl_device_inline void motion_curve_keys_for_step_linear(const KernelGlobals *kg,
int offset,
int numkeys,
int numsteps,
@@ -78,7 +80,7 @@ ccl_device_inline void motion_curve_keys_for_step_linear(KernelGlobals *kg,
/* return 2 curve key locations */
ccl_device_inline void motion_curve_keys_linear(
- KernelGlobals *kg, int object, int prim, float time, int k0, int k1, float4 keys[2])
+ const KernelGlobals *kg, int object, int prim, float time, int k0, int k1, float4 keys[2])
{
/* get motion info */
int numsteps, numkeys;
@@ -105,7 +107,7 @@ ccl_device_inline void motion_curve_keys_linear(
keys[1] = (1.0f - t) * keys[1] + t * next_keys[1];
}
-ccl_device_inline void motion_curve_keys_for_step(KernelGlobals *kg,
+ccl_device_inline void motion_curve_keys_for_step(const KernelGlobals *kg,
int offset,
int numkeys,
int numsteps,
@@ -138,7 +140,7 @@ ccl_device_inline void motion_curve_keys_for_step(KernelGlobals *kg,
}
/* return 2 curve key locations */
-ccl_device_inline void motion_curve_keys(KernelGlobals *kg,
+ccl_device_inline void motion_curve_keys(const KernelGlobals *kg,
int object,
int prim,
float time,
diff --git a/intern/cycles/kernel/geom/geom_motion_triangle.h b/intern/cycles/kernel/geom/geom_motion_triangle.h
index 53d6b92dd7e..eb4a39e062b 100644
--- a/intern/cycles/kernel/geom/geom_motion_triangle.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle.h
@@ -25,11 +25,13 @@
* and ATTR_STD_MOTION_VERTEX_NORMAL mesh attributes.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Time interpolation of vertex positions and normals */
-ccl_device_inline int find_attribute_motion(KernelGlobals *kg,
+ccl_device_inline int find_attribute_motion(const KernelGlobals *kg,
int object,
uint id,
AttributeElement *elem)
@@ -49,7 +51,7 @@ ccl_device_inline int find_attribute_motion(KernelGlobals *kg,
return (attr_map.y == ATTR_ELEMENT_NONE) ? (int)ATTR_STD_NOT_FOUND : (int)attr_map.z;
}
-ccl_device_inline void motion_triangle_verts_for_step(KernelGlobals *kg,
+ccl_device_inline void motion_triangle_verts_for_step(const KernelGlobals *kg,
uint4 tri_vindex,
int offset,
int numverts,
@@ -76,7 +78,7 @@ ccl_device_inline void motion_triangle_verts_for_step(KernelGlobals *kg,
}
}
-ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals *kg,
+ccl_device_inline void motion_triangle_normals_for_step(const KernelGlobals *kg,
uint4 tri_vindex,
int offset,
int numverts,
@@ -104,7 +106,7 @@ ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals *kg,
}
ccl_device_inline void motion_triangle_vertices(
- KernelGlobals *kg, int object, int prim, float time, float3 verts[3])
+ const KernelGlobals *kg, int object, int prim, float time, float3 verts[3])
{
/* get motion info */
int numsteps, numverts;
@@ -134,7 +136,7 @@ ccl_device_inline void motion_triangle_vertices(
}
ccl_device_inline float3 motion_triangle_smooth_normal(
- KernelGlobals *kg, float3 Ng, int object, int prim, float u, float v, float time)
+ const KernelGlobals *kg, float3 Ng, int object, int prim, float u, float v, float time)
{
/* get motion info */
int numsteps, numverts;
diff --git a/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h
index 859d919f0bb..ec7e4b07d76 100644
--- a/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h
@@ -25,6 +25,8 @@
* and ATTR_STD_MOTION_VERTEX_NORMAL mesh attributes.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Refine triangle intersection to more precise hit point. For rays that travel
@@ -32,23 +34,21 @@ CCL_NAMESPACE_BEGIN
* a closer distance.
*/
-ccl_device_inline float3 motion_triangle_refine(
- KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, float3 verts[3])
+ccl_device_inline float3 motion_triangle_refine(const KernelGlobals *kg,
+ ShaderData *sd,
+ float3 P,
+ float3 D,
+ float t,
+ const int isect_object,
+ const int isect_prim,
+ float3 verts[3])
{
- float3 P = ray->P;
- float3 D = ray->D;
- float t = isect->t;
-
#ifdef __INTERSECTION_REFINE__
- if (isect->object != OBJECT_NONE) {
+ if (isect_object != OBJECT_NONE) {
if (UNLIKELY(t == 0.0f)) {
return P;
}
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_itfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-# endif
+ const Transform tfm = object_get_inverse_transform(kg, sd);
P = transform_point(&tfm, P);
D = transform_direction(&tfm, D * t);
@@ -70,13 +70,8 @@ ccl_device_inline float3 motion_triangle_refine(
/* Compute refined position. */
P = P + D * rt;
- if (isect->object != OBJECT_NONE) {
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_tfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-# endif
-
+ if (isect_object != OBJECT_NONE) {
+ const Transform tfm = object_get_transform(kg, sd);
P = transform_point(&tfm, P);
}
@@ -86,7 +81,7 @@ ccl_device_inline float3 motion_triangle_refine(
#endif
}
-/* Same as above, except that isect->t is assumed to be in object space
+/* Same as above, except that t is assumed to be in object space
* for instancing.
*/
@@ -97,27 +92,22 @@ ccl_device_noinline
ccl_device_inline
# endif
float3
- motion_triangle_refine_local(KernelGlobals *kg,
+ motion_triangle_refine_local(const KernelGlobals *kg,
ShaderData *sd,
- const Intersection *isect,
- const Ray *ray,
+ float3 P,
+ float3 D,
+ float t,
+ const int isect_object,
+ const int isect_prim,
float3 verts[3])
{
# ifdef __KERNEL_OPTIX__
- /* isect->t is always in world space with OptiX. */
- return motion_triangle_refine(kg, sd, isect, ray, verts);
+ /* t is always in world space with OptiX. */
+ return motion_triangle_refine(kg, sd, P, D, t, isect_object, isect_prim, verts);
# else
- float3 P = ray->P;
- float3 D = ray->D;
- float t = isect->t;
-
# ifdef __INTERSECTION_REFINE__
- if (isect->object != OBJECT_NONE) {
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_itfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-# endif
+ if (isect_object != OBJECT_NONE) {
+ const Transform tfm = object_get_inverse_transform(kg, sd);
P = transform_point(&tfm, P);
D = transform_direction(&tfm, D);
@@ -138,13 +128,8 @@ ccl_device_inline
P = P + D * rt;
- if (isect->object != OBJECT_NONE) {
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_tfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-# endif
-
+ if (isect_object != OBJECT_NONE) {
+ const Transform tfm = object_get_transform(kg, sd);
P = transform_point(&tfm, P);
}
@@ -160,10 +145,11 @@ ccl_device_inline
* time and do a ray intersection with the resulting triangle.
*/
-ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg,
+ccl_device_inline bool motion_triangle_intersect(const KernelGlobals *kg,
Intersection *isect,
float3 P,
float3 dir,
+ float tmax,
float time,
uint visibility,
int object,
@@ -179,7 +165,7 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg,
float t, u, v;
if (ray_triangle_intersect(P,
dir,
- isect->t,
+ tmax,
#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
(ssef *)verts,
#else
@@ -215,7 +201,7 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg,
* Returns whether traversal should be stopped.
*/
#ifdef __BVH_LOCAL__
-ccl_device_inline bool motion_triangle_intersect_local(KernelGlobals *kg,
+ccl_device_inline bool motion_triangle_intersect_local(const KernelGlobals *kg,
LocalIntersection *local_isect,
float3 P,
float3 dir,
diff --git a/intern/cycles/kernel/geom/geom_motion_triangle_shader.h b/intern/cycles/kernel/geom/geom_motion_triangle_shader.h
index 7a91f8041f7..85c4f0ca522 100644
--- a/intern/cycles/kernel/geom/geom_motion_triangle_shader.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle_shader.h
@@ -25,6 +25,8 @@
* and ATTR_STD_MOTION_VERTEX_NORMAL mesh attributes.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Setup of motion triangle specific parts of ShaderData, moved into this one
@@ -32,8 +34,14 @@ CCL_NAMESPACE_BEGIN
* normals */
/* return 3 triangle vertex normals */
-ccl_device_noinline void motion_triangle_shader_setup(
- KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, bool is_local)
+ccl_device_noinline void motion_triangle_shader_setup(const KernelGlobals *kg,
+ ShaderData *sd,
+ const float3 P,
+ const float3 D,
+ const float ray_t,
+ const int isect_object,
+ const int isect_prim,
+ bool is_local)
{
/* Get shader. */
sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
@@ -63,12 +71,12 @@ ccl_device_noinline void motion_triangle_shader_setup(
/* Compute refined position. */
#ifdef __BVH_LOCAL__
if (is_local) {
- sd->P = motion_triangle_refine_local(kg, sd, isect, ray, verts);
+ sd->P = motion_triangle_refine_local(kg, sd, P, D, ray_t, isect_object, isect_prim, verts);
}
else
#endif /* __BVH_LOCAL__*/
{
- sd->P = motion_triangle_refine(kg, sd, isect, ray, verts);
+ sd->P = motion_triangle_refine(kg, sd, P, D, ray_t, isect_object, isect_prim, verts);
}
/* Compute face normal. */
float3 Ng;
diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h
index fe73335a335..7d6ad7b4fe3 100644
--- a/intern/cycles/kernel/geom/geom_object.h
+++ b/intern/cycles/kernel/geom/geom_object.h
@@ -22,6 +22,8 @@
* directly primitives in the BVH with world space locations applied, and the object
* ID is looked up afterwards. */
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Object attributes, for now a fixed size and contents */
@@ -35,7 +37,7 @@ enum ObjectVectorTransform { OBJECT_PASS_MOTION_PRE = 0, OBJECT_PASS_MOTION_POST
/* Object to world space transformation */
-ccl_device_inline Transform object_fetch_transform(KernelGlobals *kg,
+ccl_device_inline Transform object_fetch_transform(const KernelGlobals *kg,
int object,
enum ObjectTransform type)
{
@@ -49,7 +51,7 @@ ccl_device_inline Transform object_fetch_transform(KernelGlobals *kg,
/* Lamp to world space transformation */
-ccl_device_inline Transform lamp_fetch_transform(KernelGlobals *kg, int lamp, bool inverse)
+ccl_device_inline Transform lamp_fetch_transform(const KernelGlobals *kg, int lamp, bool inverse)
{
if (inverse) {
return kernel_tex_fetch(__lights, lamp).itfm;
@@ -61,7 +63,7 @@ ccl_device_inline Transform lamp_fetch_transform(KernelGlobals *kg, int lamp, bo
/* Object to world space transformation for motion vectors */
-ccl_device_inline Transform object_fetch_motion_pass_transform(KernelGlobals *kg,
+ccl_device_inline Transform object_fetch_motion_pass_transform(const KernelGlobals *kg,
int object,
enum ObjectVectorTransform type)
{
@@ -72,7 +74,7 @@ ccl_device_inline Transform object_fetch_motion_pass_transform(KernelGlobals *kg
/* Motion blurred object transformations */
#ifdef __OBJECT_MOTION__
-ccl_device_inline Transform object_fetch_transform_motion(KernelGlobals *kg,
+ccl_device_inline Transform object_fetch_transform_motion(const KernelGlobals *kg,
int object,
float time)
{
@@ -86,7 +88,7 @@ ccl_device_inline Transform object_fetch_transform_motion(KernelGlobals *kg,
return tfm;
}
-ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg,
+ccl_device_inline Transform object_fetch_transform_motion_test(const KernelGlobals *kg,
int object,
float time,
Transform *itfm)
@@ -111,45 +113,79 @@ ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg
}
#endif
+/* Get transform matrix for shading point. */
+
+ccl_device_inline Transform object_get_transform(const KernelGlobals *kg, const ShaderData *sd)
+{
+#ifdef __OBJECT_MOTION__
+ return (sd->object_flag & SD_OBJECT_MOTION) ?
+ sd->ob_tfm_motion :
+ object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
+#else
+ return object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
+#endif
+}
+
+ccl_device_inline Transform object_get_inverse_transform(const KernelGlobals *kg,
+ const ShaderData *sd)
+{
+#ifdef __OBJECT_MOTION__
+ return (sd->object_flag & SD_OBJECT_MOTION) ?
+ sd->ob_itfm_motion :
+ object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
+#else
+ return object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
+#endif
+}
/* Transform position from object to world space */
-ccl_device_inline void object_position_transform(KernelGlobals *kg,
+ccl_device_inline void object_position_transform(const KernelGlobals *kg,
const ShaderData *sd,
float3 *P)
{
#ifdef __OBJECT_MOTION__
- *P = transform_point_auto(&sd->ob_tfm, *P);
-#else
+ if (sd->object_flag & SD_OBJECT_MOTION) {
+ *P = transform_point_auto(&sd->ob_tfm_motion, *P);
+ return;
+ }
+#endif
+
Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
*P = transform_point(&tfm, *P);
-#endif
}
/* Transform position from world to object space */
-ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg,
+ccl_device_inline void object_inverse_position_transform(const KernelGlobals *kg,
const ShaderData *sd,
float3 *P)
{
#ifdef __OBJECT_MOTION__
- *P = transform_point_auto(&sd->ob_itfm, *P);
-#else
+ if (sd->object_flag & SD_OBJECT_MOTION) {
+ *P = transform_point_auto(&sd->ob_itfm_motion, *P);
+ return;
+ }
+#endif
+
Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
*P = transform_point(&tfm, *P);
-#endif
}
/* Transform normal from world to object space */
-ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg,
+ccl_device_inline void object_inverse_normal_transform(const KernelGlobals *kg,
const ShaderData *sd,
float3 *N)
{
#ifdef __OBJECT_MOTION__
- if ((sd->object != OBJECT_NONE) || (sd->type == PRIMITIVE_LAMP)) {
- *N = normalize(transform_direction_transposed_auto(&sd->ob_tfm, *N));
+ if (sd->object_flag & SD_OBJECT_MOTION) {
+ if ((sd->object != OBJECT_NONE) || (sd->type == PRIMITIVE_LAMP)) {
+ *N = normalize(transform_direction_transposed_auto(&sd->ob_tfm_motion, *N));
+ }
+ return;
}
-#else
+#endif
+
if (sd->object != OBJECT_NONE) {
Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
*N = normalize(transform_direction_transposed(&tfm, *N));
@@ -158,65 +194,79 @@ ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg,
Transform tfm = lamp_fetch_transform(kg, sd->lamp, false);
*N = normalize(transform_direction_transposed(&tfm, *N));
}
-#endif
}
/* Transform normal from object to world space */
-ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N)
+ccl_device_inline void object_normal_transform(const KernelGlobals *kg,
+ const ShaderData *sd,
+ float3 *N)
{
#ifdef __OBJECT_MOTION__
- *N = normalize(transform_direction_transposed_auto(&sd->ob_itfm, *N));
-#else
+ if (sd->object_flag & SD_OBJECT_MOTION) {
+ *N = normalize(transform_direction_transposed_auto(&sd->ob_itfm_motion, *N));
+ return;
+ }
+#endif
+
Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
*N = normalize(transform_direction_transposed(&tfm, *N));
-#endif
}
/* Transform direction vector from object to world space */
-ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D)
+ccl_device_inline void object_dir_transform(const KernelGlobals *kg,
+ const ShaderData *sd,
+ float3 *D)
{
#ifdef __OBJECT_MOTION__
- *D = transform_direction_auto(&sd->ob_tfm, *D);
-#else
+ if (sd->object_flag & SD_OBJECT_MOTION) {
+ *D = transform_direction_auto(&sd->ob_tfm_motion, *D);
+ return;
+ }
+#endif
+
Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
*D = transform_direction(&tfm, *D);
-#endif
}
/* Transform direction vector from world to object space */
-ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg,
+ccl_device_inline void object_inverse_dir_transform(const KernelGlobals *kg,
const ShaderData *sd,
float3 *D)
{
#ifdef __OBJECT_MOTION__
- *D = transform_direction_auto(&sd->ob_itfm, *D);
-#else
- Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
- *D = transform_direction(&tfm, *D);
+ if (sd->object_flag & SD_OBJECT_MOTION) {
+ *D = transform_direction_auto(&sd->ob_itfm_motion, *D);
+ return;
+ }
#endif
+
+ const Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
+ *D = transform_direction(&tfm, *D);
}
/* Object center position */
-ccl_device_inline float3 object_location(KernelGlobals *kg, const ShaderData *sd)
+ccl_device_inline float3 object_location(const KernelGlobals *kg, const ShaderData *sd)
{
if (sd->object == OBJECT_NONE)
return make_float3(0.0f, 0.0f, 0.0f);
#ifdef __OBJECT_MOTION__
- return make_float3(sd->ob_tfm.x.w, sd->ob_tfm.y.w, sd->ob_tfm.z.w);
-#else
+ if (sd->object_flag & SD_OBJECT_MOTION) {
+ return make_float3(sd->ob_tfm_motion.x.w, sd->ob_tfm_motion.y.w, sd->ob_tfm_motion.z.w);
+ }
+#endif
+
Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
return make_float3(tfm.x.w, tfm.y.w, tfm.z.w);
-#endif
}
/* Color of the object */
-ccl_device_inline float3 object_color(KernelGlobals *kg, int object)
+ccl_device_inline float3 object_color(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE)
return make_float3(0.0f, 0.0f, 0.0f);
@@ -227,7 +277,7 @@ ccl_device_inline float3 object_color(KernelGlobals *kg, int object)
/* Pass ID number of object */
-ccl_device_inline float object_pass_id(KernelGlobals *kg, int object)
+ccl_device_inline float object_pass_id(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE)
return 0.0f;
@@ -237,7 +287,7 @@ ccl_device_inline float object_pass_id(KernelGlobals *kg, int object)
/* Per lamp random number for shader variation */
-ccl_device_inline float lamp_random_number(KernelGlobals *kg, int lamp)
+ccl_device_inline float lamp_random_number(const KernelGlobals *kg, int lamp)
{
if (lamp == LAMP_NONE)
return 0.0f;
@@ -247,7 +297,7 @@ ccl_device_inline float lamp_random_number(KernelGlobals *kg, int lamp)
/* Per object random number for shader variation */
-ccl_device_inline float object_random_number(KernelGlobals *kg, int object)
+ccl_device_inline float object_random_number(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE)
return 0.0f;
@@ -257,7 +307,7 @@ ccl_device_inline float object_random_number(KernelGlobals *kg, int object)
/* Particle ID from which this object was generated */
-ccl_device_inline int object_particle_id(KernelGlobals *kg, int object)
+ccl_device_inline int object_particle_id(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE)
return 0;
@@ -267,7 +317,7 @@ ccl_device_inline int object_particle_id(KernelGlobals *kg, int object)
/* Generated texture coordinate on surface from where object was instanced */
-ccl_device_inline float3 object_dupli_generated(KernelGlobals *kg, int object)
+ccl_device_inline float3 object_dupli_generated(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE)
return make_float3(0.0f, 0.0f, 0.0f);
@@ -279,7 +329,7 @@ ccl_device_inline float3 object_dupli_generated(KernelGlobals *kg, int object)
/* UV texture coordinate on surface from where object was instanced */
-ccl_device_inline float3 object_dupli_uv(KernelGlobals *kg, int object)
+ccl_device_inline float3 object_dupli_uv(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE)
return make_float3(0.0f, 0.0f, 0.0f);
@@ -291,7 +341,7 @@ ccl_device_inline float3 object_dupli_uv(KernelGlobals *kg, int object)
/* Information about mesh for motion blurred triangles and curves */
ccl_device_inline void object_motion_info(
- KernelGlobals *kg, int object, int *numsteps, int *numverts, int *numkeys)
+ const KernelGlobals *kg, int object, int *numsteps, int *numverts, int *numkeys)
{
if (numkeys) {
*numkeys = kernel_tex_fetch(__objects, object).numkeys;
@@ -305,7 +355,7 @@ ccl_device_inline void object_motion_info(
/* Offset to an objects patch map */
-ccl_device_inline uint object_patch_map_offset(KernelGlobals *kg, int object)
+ccl_device_inline uint object_patch_map_offset(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE)
return 0;
@@ -315,7 +365,7 @@ ccl_device_inline uint object_patch_map_offset(KernelGlobals *kg, int object)
/* Volume step size */
-ccl_device_inline float object_volume_density(KernelGlobals *kg, int object)
+ccl_device_inline float object_volume_density(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE) {
return 1.0f;
@@ -324,7 +374,7 @@ ccl_device_inline float object_volume_density(KernelGlobals *kg, int object)
return kernel_tex_fetch(__objects, object).volume_density;
}
-ccl_device_inline float object_volume_step_size(KernelGlobals *kg, int object)
+ccl_device_inline float object_volume_step_size(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE) {
return kernel_data.background.volume_step_size;
@@ -335,14 +385,14 @@ ccl_device_inline float object_volume_step_size(KernelGlobals *kg, int object)
/* Pass ID for shader */
-ccl_device int shader_pass_id(KernelGlobals *kg, const ShaderData *sd)
+ccl_device int shader_pass_id(const KernelGlobals *kg, const ShaderData *sd)
{
return kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).pass_id;
}
/* Cryptomatte ID */
-ccl_device_inline float object_cryptomatte_id(KernelGlobals *kg, int object)
+ccl_device_inline float object_cryptomatte_id(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE)
return 0.0f;
@@ -350,7 +400,7 @@ ccl_device_inline float object_cryptomatte_id(KernelGlobals *kg, int object)
return kernel_tex_fetch(__objects, object).cryptomatte_object;
}
-ccl_device_inline float object_cryptomatte_asset_id(KernelGlobals *kg, int object)
+ccl_device_inline float object_cryptomatte_asset_id(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE)
return 0;
@@ -360,42 +410,42 @@ ccl_device_inline float object_cryptomatte_asset_id(KernelGlobals *kg, int objec
/* Particle data from which object was instanced */
-ccl_device_inline uint particle_index(KernelGlobals *kg, int particle)
+ccl_device_inline uint particle_index(const KernelGlobals *kg, int particle)
{
return kernel_tex_fetch(__particles, particle).index;
}
-ccl_device float particle_age(KernelGlobals *kg, int particle)
+ccl_device float particle_age(const KernelGlobals *kg, int particle)
{
return kernel_tex_fetch(__particles, particle).age;
}
-ccl_device float particle_lifetime(KernelGlobals *kg, int particle)
+ccl_device float particle_lifetime(const KernelGlobals *kg, int particle)
{
return kernel_tex_fetch(__particles, particle).lifetime;
}
-ccl_device float particle_size(KernelGlobals *kg, int particle)
+ccl_device float particle_size(const KernelGlobals *kg, int particle)
{
return kernel_tex_fetch(__particles, particle).size;
}
-ccl_device float4 particle_rotation(KernelGlobals *kg, int particle)
+ccl_device float4 particle_rotation(const KernelGlobals *kg, int particle)
{
return kernel_tex_fetch(__particles, particle).rotation;
}
-ccl_device float3 particle_location(KernelGlobals *kg, int particle)
+ccl_device float3 particle_location(const KernelGlobals *kg, int particle)
{
return float4_to_float3(kernel_tex_fetch(__particles, particle).location);
}
-ccl_device float3 particle_velocity(KernelGlobals *kg, int particle)
+ccl_device float3 particle_velocity(const KernelGlobals *kg, int particle)
{
return float4_to_float3(kernel_tex_fetch(__particles, particle).velocity);
}
-ccl_device float3 particle_angular_velocity(KernelGlobals *kg, int particle)
+ccl_device float3 particle_angular_velocity(const KernelGlobals *kg, int particle)
{
return float4_to_float3(kernel_tex_fetch(__particles, particle).angular_velocity);
}
@@ -418,7 +468,7 @@ ccl_device_inline float3 bvh_inverse_direction(float3 dir)
/* Transform ray into object space to enter static object in BVH */
ccl_device_inline float bvh_instance_push(
- KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float t)
+ const KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir)
{
Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
@@ -428,17 +478,18 @@ ccl_device_inline float bvh_instance_push(
*dir = bvh_clamp_direction(normalize_len(transform_direction(&tfm, ray->D), &len));
*idir = bvh_inverse_direction(*dir);
- if (t != FLT_MAX) {
- t *= len;
- }
-
- return t;
+ return len;
}
/* Transform ray to exit static object in BVH. */
-ccl_device_inline float bvh_instance_pop(
- KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float t)
+ccl_device_inline float bvh_instance_pop(const KernelGlobals *kg,
+ int object,
+ const Ray *ray,
+ float3 *P,
+ float3 *dir,
+ float3 *idir,
+ float t)
{
if (t != FLT_MAX) {
Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
@@ -454,7 +505,7 @@ ccl_device_inline float bvh_instance_pop(
/* Same as above, but returns scale factor to apply to multiple intersection distances */
-ccl_device_inline void bvh_instance_pop_factor(KernelGlobals *kg,
+ccl_device_inline void bvh_instance_pop_factor(const KernelGlobals *kg,
int object,
const Ray *ray,
float3 *P,
@@ -473,13 +524,12 @@ ccl_device_inline void bvh_instance_pop_factor(KernelGlobals *kg,
#ifdef __OBJECT_MOTION__
/* Transform ray into object space to enter motion blurred object in BVH */
-ccl_device_inline float bvh_instance_motion_push(KernelGlobals *kg,
+ccl_device_inline float bvh_instance_motion_push(const KernelGlobals *kg,
int object,
const Ray *ray,
float3 *P,
float3 *dir,
float3 *idir,
- float t,
Transform *itfm)
{
object_fetch_transform_motion_test(kg, object, ray->time, itfm);
@@ -490,16 +540,12 @@ ccl_device_inline float bvh_instance_motion_push(KernelGlobals *kg,
*dir = bvh_clamp_direction(normalize_len(transform_direction(itfm, ray->D), &len));
*idir = bvh_inverse_direction(*dir);
- if (t != FLT_MAX) {
- t *= len;
- }
-
- return t;
+ return len;
}
/* Transform ray to exit motion blurred object in BVH. */
-ccl_device_inline float bvh_instance_motion_pop(KernelGlobals *kg,
+ccl_device_inline float bvh_instance_motion_pop(const KernelGlobals *kg,
int object,
const Ray *ray,
float3 *P,
@@ -521,7 +567,7 @@ ccl_device_inline float bvh_instance_motion_pop(KernelGlobals *kg,
/* Same as above, but returns scale factor to apply to multiple intersection distances */
-ccl_device_inline void bvh_instance_motion_pop_factor(KernelGlobals *kg,
+ccl_device_inline void bvh_instance_motion_pop_factor(const KernelGlobals *kg,
int object,
const Ray *ray,
float3 *P,
@@ -538,48 +584,11 @@ ccl_device_inline void bvh_instance_motion_pop_factor(KernelGlobals *kg,
#endif
-/* TODO(sergey): This is only for until we've got OpenCL 2.0
- * on all devices we consider supported. It'll be replaced with
- * generic address space.
- */
+/* TODO: This can be removed once we know whether any devices will require explicit
+ * address space qualifiers for this case. */
-#ifdef __KERNEL_OPENCL__
-ccl_device_inline void object_position_transform_addrspace(KernelGlobals *kg,
- const ShaderData *sd,
- ccl_addr_space float3 *P)
-{
- float3 private_P = *P;
- object_position_transform(kg, sd, &private_P);
- *P = private_P;
-}
-
-ccl_device_inline void object_dir_transform_addrspace(KernelGlobals *kg,
- const ShaderData *sd,
- ccl_addr_space float3 *D)
-{
- float3 private_D = *D;
- object_dir_transform(kg, sd, &private_D);
- *D = private_D;
-}
-
-ccl_device_inline void object_normal_transform_addrspace(KernelGlobals *kg,
- const ShaderData *sd,
- ccl_addr_space float3 *N)
-{
- float3 private_N = *N;
- object_normal_transform(kg, sd, &private_N);
- *N = private_N;
-}
-#endif
-
-#ifndef __KERNEL_OPENCL__
-# define object_position_transform_auto object_position_transform
-# define object_dir_transform_auto object_dir_transform
-# define object_normal_transform_auto object_normal_transform
-#else
-# define object_position_transform_auto object_position_transform_addrspace
-# define object_dir_transform_auto object_dir_transform_addrspace
-# define object_normal_transform_auto object_normal_transform_addrspace
-#endif
+#define object_position_transform_auto object_position_transform
+#define object_dir_transform_auto object_dir_transform
+#define object_normal_transform_auto object_normal_transform
CCL_NAMESPACE_END
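The new object_get_transform()/object_get_inverse_transform() helpers replace the per-call-site #ifdef __OBJECT_MOTION__ blocks with a single runtime check of SD_OBJECT_MOTION. A usage sketch (illustrative only; sd and P_world are hypothetical locals, not part of this commit):

const Transform itfm = object_get_inverse_transform(kg, sd);
float3 P_object = transform_point(&itfm, P_world); /* world -> object */
/* ... work in object space ... */
const Transform tfm = object_get_transform(kg, sd);
P_world = transform_point(&tfm, P_object); /* object -> world */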
diff --git a/intern/cycles/kernel/geom/geom_patch.h b/intern/cycles/kernel/geom/geom_patch.h
index 9c1768f05db..ce0fc15f196 100644
--- a/intern/cycles/kernel/geom/geom_patch.h
+++ b/intern/cycles/kernel/geom/geom_patch.h
@@ -24,6 +24,8 @@
* language governing permissions and limitations under the Apache License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
typedef struct PatchHandle {
@@ -60,7 +62,7 @@ ccl_device_inline int patch_map_resolve_quadrant(float median, float *u, float *
/* retrieve PatchHandle from patch coords */
ccl_device_inline PatchHandle
-patch_map_find_patch(KernelGlobals *kg, int object, int patch, float u, float v)
+patch_map_find_patch(const KernelGlobals *kg, int object, int patch, float u, float v)
{
PatchHandle handle;
@@ -191,7 +193,7 @@ ccl_device_inline void patch_eval_normalize_coords(uint patch_bits, float *u, fl
/* retrieve patch control indices */
-ccl_device_inline int patch_eval_indices(KernelGlobals *kg,
+ccl_device_inline int patch_eval_indices(const KernelGlobals *kg,
const PatchHandle *handle,
int channel,
int indices[PATCH_MAX_CONTROL_VERTS])
@@ -208,7 +210,7 @@ ccl_device_inline int patch_eval_indices(KernelGlobals *kg,
/* evaluate patch basis functions */
-ccl_device_inline void patch_eval_basis(KernelGlobals *kg,
+ccl_device_inline void patch_eval_basis(const KernelGlobals *kg,
const PatchHandle *handle,
float u,
float v,
@@ -247,7 +249,7 @@ ccl_device_inline void patch_eval_basis(KernelGlobals *kg,
/* generic function for evaluating indices and weights from patch coords */
-ccl_device_inline int patch_eval_control_verts(KernelGlobals *kg,
+ccl_device_inline int patch_eval_control_verts(const KernelGlobals *kg,
int object,
int patch,
float u,
@@ -269,7 +271,7 @@ ccl_device_inline int patch_eval_control_verts(KernelGlobals *kg,
/* functions for evaluating attributes on patches */
-ccl_device float patch_eval_float(KernelGlobals *kg,
+ccl_device float patch_eval_float(const KernelGlobals *kg,
const ShaderData *sd,
int offset,
int patch,
@@ -306,7 +308,7 @@ ccl_device float patch_eval_float(KernelGlobals *kg,
return val;
}
-ccl_device float2 patch_eval_float2(KernelGlobals *kg,
+ccl_device float2 patch_eval_float2(const KernelGlobals *kg,
const ShaderData *sd,
int offset,
int patch,
@@ -343,7 +345,7 @@ ccl_device float2 patch_eval_float2(KernelGlobals *kg,
return val;
}
-ccl_device float3 patch_eval_float3(KernelGlobals *kg,
+ccl_device float3 patch_eval_float3(const KernelGlobals *kg,
const ShaderData *sd,
int offset,
int patch,
@@ -380,7 +382,7 @@ ccl_device float3 patch_eval_float3(KernelGlobals *kg,
return val;
}
-ccl_device float4 patch_eval_float4(KernelGlobals *kg,
+ccl_device float4 patch_eval_float4(const KernelGlobals *kg,
const ShaderData *sd,
int offset,
int patch,
@@ -417,7 +419,7 @@ ccl_device float4 patch_eval_float4(KernelGlobals *kg,
return val;
}
-ccl_device float4 patch_eval_uchar4(KernelGlobals *kg,
+ccl_device float4 patch_eval_uchar4(const KernelGlobals *kg,
const ShaderData *sd,
int offset,
int patch,
diff --git a/intern/cycles/kernel/geom/geom_primitive.h b/intern/cycles/kernel/geom/geom_primitive.h
index aeb044c9ad3..ba31b12e817 100644
--- a/intern/cycles/kernel/geom/geom_primitive.h
+++ b/intern/cycles/kernel/geom/geom_primitive.h
@@ -19,6 +19,10 @@
* Generic functions to look up mesh, curve and volume primitive attributes for
* shading and render passes. */
+#pragma once
+
+#include "kernel/kernel_projection.h"
+
CCL_NAMESPACE_BEGIN
/* Surface Attributes
@@ -27,8 +31,11 @@ CCL_NAMESPACE_BEGIN
* attributes for performance, mainly for GPU performance to avoid bringing in
* heavy volume interpolation code. */
-ccl_device_inline float primitive_surface_attribute_float(
- KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy)
+ccl_device_inline float primitive_surface_attribute_float(const KernelGlobals *kg,
+ const ShaderData *sd,
+ const AttributeDescriptor desc,
+ float *dx,
+ float *dy)
{
if (sd->type & PRIMITIVE_ALL_TRIANGLE) {
if (subd_triangle_patch(kg, sd) == ~0)
@@ -50,7 +57,7 @@ ccl_device_inline float primitive_surface_attribute_float(
}
}
-ccl_device_inline float2 primitive_surface_attribute_float2(KernelGlobals *kg,
+ccl_device_inline float2 primitive_surface_attribute_float2(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float2 *dx,
@@ -76,7 +83,7 @@ ccl_device_inline float2 primitive_surface_attribute_float2(KernelGlobals *kg,
}
}
-ccl_device_inline float3 primitive_surface_attribute_float3(KernelGlobals *kg,
+ccl_device_inline float3 primitive_surface_attribute_float3(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float3 *dx,
@@ -102,11 +109,11 @@ ccl_device_inline float3 primitive_surface_attribute_float3(KernelGlobals *kg,
}
}
-ccl_device_inline float4 primitive_surface_attribute_float4(KernelGlobals *kg,
- const ShaderData *sd,
- const AttributeDescriptor desc,
- float4 *dx,
- float4 *dy)
+ccl_device_forceinline float4 primitive_surface_attribute_float4(const KernelGlobals *kg,
+ const ShaderData *sd,
+ const AttributeDescriptor desc,
+ float4 *dx,
+ float4 *dy)
{
if (sd->type & PRIMITIVE_ALL_TRIANGLE) {
if (subd_triangle_patch(kg, sd) == ~0)
@@ -141,7 +148,7 @@ ccl_device_inline bool primitive_is_volume_attribute(const ShaderData *sd,
return sd->type == PRIMITIVE_VOLUME;
}
-ccl_device_inline float primitive_volume_attribute_float(KernelGlobals *kg,
+ccl_device_inline float primitive_volume_attribute_float(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc)
{
@@ -153,7 +160,7 @@ ccl_device_inline float primitive_volume_attribute_float(KernelGlobals *kg,
}
}
-ccl_device_inline float3 primitive_volume_attribute_float3(KernelGlobals *kg,
+ccl_device_inline float3 primitive_volume_attribute_float3(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc)
{
@@ -165,7 +172,7 @@ ccl_device_inline float3 primitive_volume_attribute_float3(KernelGlobals *kg,
}
}
-ccl_device_inline float4 primitive_volume_attribute_float4(KernelGlobals *kg,
+ccl_device_inline float4 primitive_volume_attribute_float4(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc)
{
@@ -180,7 +187,7 @@ ccl_device_inline float4 primitive_volume_attribute_float4(KernelGlobals *kg,
/* Default UV coordinate */
-ccl_device_inline float3 primitive_uv(KernelGlobals *kg, ShaderData *sd)
+ccl_device_inline float3 primitive_uv(const KernelGlobals *kg, const ShaderData *sd)
{
const AttributeDescriptor desc = find_attribute(kg, sd, ATTR_STD_UV);
@@ -193,7 +200,7 @@ ccl_device_inline float3 primitive_uv(KernelGlobals *kg, ShaderData *sd)
/* Ptex coordinates */
-ccl_device bool primitive_ptex(KernelGlobals *kg, ShaderData *sd, float2 *uv, int *face_id)
+ccl_device bool primitive_ptex(const KernelGlobals *kg, ShaderData *sd, float2 *uv, int *face_id)
{
/* storing ptex data as attributes is not memory efficient but simple for tests */
const AttributeDescriptor desc_face_id = find_attribute(kg, sd, ATTR_STD_PTEX_FACE_ID);
@@ -213,7 +220,7 @@ ccl_device bool primitive_ptex(KernelGlobals *kg, ShaderData *sd, float2 *uv, in
/* Surface tangent */
-ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 primitive_tangent(const KernelGlobals *kg, ShaderData *sd)
{
#ifdef __HAIR__
if (sd->type & PRIMITIVE_ALL_CURVE)
@@ -245,7 +252,7 @@ ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
/* Motion vector for motion pass */
-ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd)
+ccl_device_inline float4 primitive_motion_vector(const KernelGlobals *kg, const ShaderData *sd)
{
/* center position */
float3 center;
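These accessors are keyed by an AttributeDescriptor looked up once per attribute. A usage sketch (illustrative; sd is the current shading point and the choice of ATTR_STD_UV here is just an example):

const AttributeDescriptor desc = find_attribute(kg, sd, ATTR_STD_UV);
if (desc.offset != ATTR_STD_NOT_FOUND) {
  /* dx/dy derivative outputs are optional; pass NULL when they are not needed */
  const float3 uv = primitive_surface_attribute_float3(kg, sd, desc, NULL, NULL);
}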
diff --git a/intern/cycles/kernel/geom/geom_shader_data.h b/intern/cycles/kernel/geom/geom_shader_data.h
new file mode 100644
index 00000000000..fb2cb5cb1ea
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_shader_data.h
@@ -0,0 +1,373 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Functions to initialize ShaderData, given an incoming ray, intersection or
+ * sampled position. */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* ShaderData setup from incoming ray */
+
+#ifdef __OBJECT_MOTION__
+ccl_device void shader_setup_object_transforms(const KernelGlobals *ccl_restrict kg,
+ ShaderData *ccl_restrict sd,
+ float time)
+{
+ if (sd->object_flag & SD_OBJECT_MOTION) {
+ sd->ob_tfm_motion = object_fetch_transform_motion(kg, sd->object, time);
+ sd->ob_itfm_motion = transform_quick_inverse(sd->ob_tfm_motion);
+ }
+}
+#endif
+
+/* TODO: break this up if it helps reduce register pressure, by loading data from
+ * global memory only as it is written into ShaderData. */
+ccl_device_inline void shader_setup_from_ray(const KernelGlobals *ccl_restrict kg,
+ ShaderData *ccl_restrict sd,
+ const Ray *ccl_restrict ray,
+ const Intersection *ccl_restrict isect)
+{
+ /* Read intersection data into shader globals.
+ *
+ * TODO: this is redundant, could potentially remove some of this from
+ * ShaderData but would need to ensure that it also works for shadow
+ * shader evaluation. */
+ sd->u = isect->u;
+ sd->v = isect->v;
+ sd->ray_length = isect->t;
+ sd->type = isect->type;
+ sd->object = (isect->object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, isect->prim) :
+ isect->object;
+ sd->object_flag = kernel_tex_fetch(__object_flag, sd->object);
+ sd->prim = kernel_tex_fetch(__prim_index, isect->prim);
+ sd->lamp = LAMP_NONE;
+ sd->flag = 0;
+
+ /* Read matrices and time. */
+ sd->time = ray->time;
+
+#ifdef __OBJECT_MOTION__
+ shader_setup_object_transforms(kg, sd, ray->time);
+#endif
+
+ /* Read ray data into shader globals. */
+ sd->I = -ray->D;
+
+#ifdef __HAIR__
+ if (sd->type & PRIMITIVE_ALL_CURVE) {
+ /* curve */
+ curve_shader_setup(kg, sd, ray->P, ray->D, isect->t, isect->object, isect->prim);
+ }
+ else
+#endif
+ if (sd->type & PRIMITIVE_TRIANGLE) {
+ /* static triangle */
+ float3 Ng = triangle_normal(kg, sd);
+ sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
+
+ /* vectors */
+ sd->P = triangle_refine(kg, sd, ray->P, ray->D, isect->t, isect->object, isect->prim);
+ sd->Ng = Ng;
+ sd->N = Ng;
+
+ /* smooth normal */
+ if (sd->shader & SHADER_SMOOTH_NORMAL)
+ sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
+
+#ifdef __DPDU__
+ /* dPdu/dPdv */
+ triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
+#endif
+ }
+ else {
+ /* motion triangle */
+ motion_triangle_shader_setup(
+ kg, sd, ray->P, ray->D, isect->t, isect->object, isect->prim, false);
+ }
+
+ sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
+
+ if (isect->object != OBJECT_NONE) {
+ /* instance transform */
+ object_normal_transform_auto(kg, sd, &sd->N);
+ object_normal_transform_auto(kg, sd, &sd->Ng);
+#ifdef __DPDU__
+ object_dir_transform_auto(kg, sd, &sd->dPdu);
+ object_dir_transform_auto(kg, sd, &sd->dPdv);
+#endif
+ }
+
+ /* backfacing test */
+ bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
+
+ if (backfacing) {
+ sd->flag |= SD_BACKFACING;
+ sd->Ng = -sd->Ng;
+ sd->N = -sd->N;
+#ifdef __DPDU__
+ sd->dPdu = -sd->dPdu;
+ sd->dPdv = -sd->dPdv;
+#endif
+ }
+
+#ifdef __RAY_DIFFERENTIALS__
+ /* differentials */
+ differential_transfer_compact(&sd->dP, ray->dP, ray->D, ray->dD, sd->Ng, sd->ray_length);
+ differential_incoming_compact(&sd->dI, ray->D, ray->dD);
+ differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng);
+#endif
+}
+
+/* ShaderData setup from position sampled on mesh */
+
+ccl_device_inline void shader_setup_from_sample(const KernelGlobals *ccl_restrict kg,
+ ShaderData *ccl_restrict sd,
+ const float3 P,
+ const float3 Ng,
+ const float3 I,
+ int shader,
+ int object,
+ int prim,
+ float u,
+ float v,
+ float t,
+ float time,
+ bool object_space,
+ int lamp)
+{
+ /* vectors */
+ sd->P = P;
+ sd->N = Ng;
+ sd->Ng = Ng;
+ sd->I = I;
+ sd->shader = shader;
+ if (prim != PRIM_NONE)
+ sd->type = PRIMITIVE_TRIANGLE;
+ else if (lamp != LAMP_NONE)
+ sd->type = PRIMITIVE_LAMP;
+ else
+ sd->type = PRIMITIVE_NONE;
+
+ /* primitive */
+ sd->object = object;
+ sd->lamp = LAMP_NONE;
+ /* Currently no access to bvh prim index for strand sd->prim. */
+ sd->prim = prim;
+ sd->u = u;
+ sd->v = v;
+ sd->time = time;
+ sd->ray_length = t;
+
+ sd->flag = kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
+ sd->object_flag = 0;
+ if (sd->object != OBJECT_NONE) {
+ sd->object_flag |= kernel_tex_fetch(__object_flag, sd->object);
+
+#ifdef __OBJECT_MOTION__
+ shader_setup_object_transforms(kg, sd, time);
+#endif
+ }
+ else if (lamp != LAMP_NONE) {
+ sd->lamp = lamp;
+ }
+
+ /* transform into world space */
+ if (object_space) {
+ object_position_transform_auto(kg, sd, &sd->P);
+ object_normal_transform_auto(kg, sd, &sd->Ng);
+ sd->N = sd->Ng;
+ object_dir_transform_auto(kg, sd, &sd->I);
+ }
+
+ if (sd->type & PRIMITIVE_TRIANGLE) {
+ /* smooth normal */
+ if (sd->shader & SHADER_SMOOTH_NORMAL) {
+ sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
+
+ if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
+ object_normal_transform_auto(kg, sd, &sd->N);
+ }
+ }
+
+ /* dPdu/dPdv */
+#ifdef __DPDU__
+ triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
+
+ if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
+ object_dir_transform_auto(kg, sd, &sd->dPdu);
+ object_dir_transform_auto(kg, sd, &sd->dPdv);
+ }
+#endif
+ }
+ else {
+#ifdef __DPDU__
+ sd->dPdu = zero_float3();
+ sd->dPdv = zero_float3();
+#endif
+ }
+
+ /* backfacing test */
+ if (sd->prim != PRIM_NONE) {
+ bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
+
+ if (backfacing) {
+ sd->flag |= SD_BACKFACING;
+ sd->Ng = -sd->Ng;
+ sd->N = -sd->N;
+#ifdef __DPDU__
+ sd->dPdu = -sd->dPdu;
+ sd->dPdv = -sd->dPdv;
+#endif
+ }
+ }
+
+#ifdef __RAY_DIFFERENTIALS__
+ /* no ray differentials here yet */
+ sd->dP = differential3_zero();
+ sd->dI = differential3_zero();
+ sd->du = differential_zero();
+ sd->dv = differential_zero();
+#endif
+}
+
+/* ShaderData setup for displacement */
+
+ccl_device void shader_setup_from_displace(const KernelGlobals *ccl_restrict kg,
+ ShaderData *ccl_restrict sd,
+ int object,
+ int prim,
+ float u,
+ float v)
+{
+ float3 P, Ng, I = zero_float3();
+ int shader;
+
+ triangle_point_normal(kg, object, prim, u, v, &P, &Ng, &shader);
+
+ /* force smooth shading for displacement */
+ shader |= SHADER_SMOOTH_NORMAL;
+
+ shader_setup_from_sample(
+ kg,
+ sd,
+ P,
+ Ng,
+ I,
+ shader,
+ object,
+ prim,
+ u,
+ v,
+ 0.0f,
+ 0.5f,
+ !(kernel_tex_fetch(__object_flag, object) & SD_OBJECT_TRANSFORM_APPLIED),
+ LAMP_NONE);
+}
+
+/* ShaderData setup from ray into background */
+
+ccl_device_inline void shader_setup_from_background(const KernelGlobals *ccl_restrict kg,
+ ShaderData *ccl_restrict sd,
+ const float3 ray_P,
+ const float3 ray_D,
+ const float ray_time)
+{
+ /* for NDC coordinates */
+ sd->ray_P = ray_P;
+
+ /* vectors */
+ sd->P = ray_D;
+ sd->N = -ray_D;
+ sd->Ng = -ray_D;
+ sd->I = -ray_D;
+ sd->shader = kernel_data.background.surface_shader;
+ sd->flag = kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
+ sd->object_flag = 0;
+ sd->time = ray_time;
+ sd->ray_length = 0.0f;
+
+ sd->object = OBJECT_NONE;
+ sd->lamp = LAMP_NONE;
+ sd->prim = PRIM_NONE;
+ sd->u = 0.0f;
+ sd->v = 0.0f;
+
+#ifdef __DPDU__
+ /* dPdu/dPdv */
+ sd->dPdu = zero_float3();
+ sd->dPdv = zero_float3();
+#endif
+
+#ifdef __RAY_DIFFERENTIALS__
+ /* differentials */
+ sd->dP = differential3_zero(); /* TODO: ray->dP */
+ differential_incoming(&sd->dI, sd->dP);
+ sd->du = differential_zero();
+ sd->dv = differential_zero();
+#endif
+}
+
+/* ShaderData setup from point inside volume */
+
+#ifdef __VOLUME__
+ccl_device_inline void shader_setup_from_volume(const KernelGlobals *ccl_restrict kg,
+ ShaderData *ccl_restrict sd,
+ const Ray *ccl_restrict ray)
+{
+
+ /* vectors */
+ sd->P = ray->P;
+ sd->N = -ray->D;
+ sd->Ng = -ray->D;
+ sd->I = -ray->D;
+ sd->shader = SHADER_NONE;
+ sd->flag = 0;
+ sd->object_flag = 0;
+ sd->time = ray->time;
+ sd->ray_length = 0.0f; /* todo: can we set this to some useful value? */
+
+ sd->object = OBJECT_NONE; /* todo: fill this for texture coordinates */
+ sd->lamp = LAMP_NONE;
+ sd->prim = PRIM_NONE;
+ sd->type = PRIMITIVE_VOLUME;
+
+ sd->u = 0.0f;
+ sd->v = 0.0f;
+
+# ifdef __DPDU__
+ /* dPdu/dPdv */
+ sd->dPdu = zero_float3();
+ sd->dPdv = zero_float3();
+# endif
+
+# ifdef __RAY_DIFFERENTIALS__
+ /* differentials */
+ sd->dP = differential3_zero(); /* TODO ray->dD */
+ differential_incoming(&sd->dI, sd->dP);
+ sd->du = differential_zero();
+ sd->dv = differential_zero();
+# endif
+
+ /* for NDC coordinates */
+ sd->ray_P = ray->P;
+ sd->ray_dP = ray->dP;
+}
+#endif /* __VOLUME__ */
+
+CCL_NAMESPACE_END
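A condensed sketch of how these setup entry points are typically driven (illustrative; hit, ray and isect are hypothetical locals filled by BVH traversal, not part of this file):

ShaderData sd;
if (hit) {
  /* surface hit: fills P, Ng, N, I, uv, object/prim and shader flags */
  shader_setup_from_ray(kg, &sd, &ray, &isect);
}
else {
  /* ray escaped the scene: set up for the background shader */
  shader_setup_from_background(kg, &sd, ray.P, ray.D, ray.time);
}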
diff --git a/intern/cycles/kernel/geom/geom_subd_triangle.h b/intern/cycles/kernel/geom/geom_subd_triangle.h
index 9eceb996926..877b2ece15b 100644
--- a/intern/cycles/kernel/geom/geom_subd_triangle.h
+++ b/intern/cycles/kernel/geom/geom_subd_triangle.h
@@ -16,18 +16,20 @@
/* Functions for retrieving attributes on triangles produced from subdivision meshes */
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Patch index for triangle, -1 if not subdivision triangle */
-ccl_device_inline uint subd_triangle_patch(KernelGlobals *kg, const ShaderData *sd)
+ccl_device_inline uint subd_triangle_patch(const KernelGlobals *kg, const ShaderData *sd)
{
return (sd->prim != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, sd->prim) : ~0;
}
/* UV coords of triangle within patch */
-ccl_device_inline void subd_triangle_patch_uv(KernelGlobals *kg,
+ccl_device_inline void subd_triangle_patch_uv(const KernelGlobals *kg,
const ShaderData *sd,
float2 uv[3])
{
@@ -40,7 +42,7 @@ ccl_device_inline void subd_triangle_patch_uv(KernelGlobals *kg,
/* Vertex indices of patch */
-ccl_device_inline uint4 subd_triangle_patch_indices(KernelGlobals *kg, int patch)
+ccl_device_inline uint4 subd_triangle_patch_indices(const KernelGlobals *kg, int patch)
{
uint4 indices;
@@ -54,21 +56,23 @@ ccl_device_inline uint4 subd_triangle_patch_indices(KernelGlobals *kg, int patch
/* Originating face for patch */
-ccl_device_inline uint subd_triangle_patch_face(KernelGlobals *kg, int patch)
+ccl_device_inline uint subd_triangle_patch_face(const KernelGlobals *kg, int patch)
{
return kernel_tex_fetch(__patches, patch + 4);
}
/* Number of corners on originating face */
-ccl_device_inline uint subd_triangle_patch_num_corners(KernelGlobals *kg, int patch)
+ccl_device_inline uint subd_triangle_patch_num_corners(const KernelGlobals *kg, int patch)
{
return kernel_tex_fetch(__patches, patch + 5) & 0xffff;
}
/* Indices of the four corners that are used by the patch */
-ccl_device_inline void subd_triangle_patch_corners(KernelGlobals *kg, int patch, int corners[4])
+ccl_device_inline void subd_triangle_patch_corners(const KernelGlobals *kg,
+ int patch,
+ int corners[4])
{
uint4 data;
@@ -99,8 +103,11 @@ ccl_device_inline void subd_triangle_patch_corners(KernelGlobals *kg, int patch,
/* Reading attributes on various subdivision triangle elements */
-ccl_device_noinline float subd_triangle_attribute_float(
- KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy)
+ccl_device_noinline float subd_triangle_attribute_float(const KernelGlobals *kg,
+ const ShaderData *sd,
+ const AttributeDescriptor desc,
+ float *dx,
+ float *dy)
{
int patch = subd_triangle_patch(kg, sd);
@@ -235,7 +242,7 @@ ccl_device_noinline float subd_triangle_attribute_float(
}
}
-ccl_device_noinline float2 subd_triangle_attribute_float2(KernelGlobals *kg,
+ccl_device_noinline float2 subd_triangle_attribute_float2(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float2 *dx,
@@ -378,7 +385,7 @@ ccl_device_noinline float2 subd_triangle_attribute_float2(KernelGlobals *kg,
}
}
-ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg,
+ccl_device_noinline float3 subd_triangle_attribute_float3(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float3 *dx,
@@ -520,7 +527,7 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg,
}
}
-ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals *kg,
+ccl_device_noinline float4 subd_triangle_attribute_float4(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float4 *dx,
diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h
index ff7909ca425..910fb122c6d 100644
--- a/intern/cycles/kernel/geom/geom_triangle.h
+++ b/intern/cycles/kernel/geom/geom_triangle.h
@@ -20,10 +20,12 @@
* ray intersection we use a precomputed triangle storage to accelerate
* intersection at the cost of more memory usage */
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Normal on triangle. */
-ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd)
+ccl_device_inline float3 triangle_normal(const KernelGlobals *kg, ShaderData *sd)
{
/* load triangle vertices */
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
@@ -41,8 +43,14 @@ ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd)
}
/* Point and normal on triangle. */
-ccl_device_inline void triangle_point_normal(
- KernelGlobals *kg, int object, int prim, float u, float v, float3 *P, float3 *Ng, int *shader)
+ccl_device_inline void triangle_point_normal(const KernelGlobals *kg,
+ int object,
+ int prim,
+ float u,
+ float v,
+ float3 *P,
+ float3 *Ng,
+ int *shader)
{
/* load triangle vertices */
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
@@ -67,7 +75,7 @@ ccl_device_inline void triangle_point_normal(
/* Triangle vertex locations */
-ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3])
+ccl_device_inline void triangle_vertices(const KernelGlobals *kg, int prim, float3 P[3])
{
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
P[0] = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w + 0));
@@ -77,7 +85,7 @@ ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3
/* Triangle vertex locations and vertex normals */
-ccl_device_inline void triangle_vertices_and_normals(KernelGlobals *kg,
+ccl_device_inline void triangle_vertices_and_normals(const KernelGlobals *kg,
int prim,
float3 P[3],
float3 N[3])
@@ -94,7 +102,7 @@ ccl_device_inline void triangle_vertices_and_normals(KernelGlobals *kg,
/* Interpolate smooth vertex normal from vertices */
ccl_device_inline float3
-triangle_smooth_normal(KernelGlobals *kg, float3 Ng, int prim, float u, float v)
+triangle_smooth_normal(const KernelGlobals *kg, float3 Ng, int prim, float u, float v)
{
/* load triangle vertices */
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
@@ -108,7 +116,7 @@ triangle_smooth_normal(KernelGlobals *kg, float3 Ng, int prim, float u, float v)
}
ccl_device_inline float3 triangle_smooth_normal_unnormalized(
- KernelGlobals *kg, ShaderData *sd, float3 Ng, int prim, float u, float v)
+ const KernelGlobals *kg, const ShaderData *sd, float3 Ng, int prim, float u, float v)
{
/* load triangle vertices */
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
@@ -130,7 +138,7 @@ ccl_device_inline float3 triangle_smooth_normal_unnormalized(
/* Ray differentials on triangle */
-ccl_device_inline void triangle_dPdudv(KernelGlobals *kg,
+ccl_device_inline void triangle_dPdudv(const KernelGlobals *kg,
int prim,
ccl_addr_space float3 *dPdu,
ccl_addr_space float3 *dPdv)
@@ -148,8 +156,11 @@ ccl_device_inline void triangle_dPdudv(KernelGlobals *kg,
/* Reading attributes on various triangle elements */
-ccl_device float triangle_attribute_float(
- KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy)
+ccl_device float triangle_attribute_float(const KernelGlobals *kg,
+ const ShaderData *sd,
+ const AttributeDescriptor desc,
+ float *dx,
+ float *dy)
{
if (desc.element & (ATTR_ELEMENT_VERTEX | ATTR_ELEMENT_VERTEX_MOTION | ATTR_ELEMENT_CORNER)) {
float f0, f1, f2;
@@ -195,7 +206,7 @@ ccl_device float triangle_attribute_float(
}
}
-ccl_device float2 triangle_attribute_float2(KernelGlobals *kg,
+ccl_device float2 triangle_attribute_float2(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float2 *dx,
@@ -245,7 +256,7 @@ ccl_device float2 triangle_attribute_float2(KernelGlobals *kg,
}
}
-ccl_device float3 triangle_attribute_float3(KernelGlobals *kg,
+ccl_device float3 triangle_attribute_float3(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float3 *dx,
@@ -295,7 +306,7 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg,
}
}
-ccl_device float4 triangle_attribute_float4(KernelGlobals *kg,
+ccl_device float4 triangle_attribute_float4(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float4 *dx,
diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h
index b0cce274b94..30b77ebd2eb 100644
--- a/intern/cycles/kernel/geom/geom_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h
@@ -20,12 +20,17 @@
* intersection at the cost of more memory usage.
*/
+#pragma once
+
+#include "kernel/kernel_random.h"
+
CCL_NAMESPACE_BEGIN
-ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
+ccl_device_inline bool triangle_intersect(const KernelGlobals *kg,
Intersection *isect,
float3 P,
float3 dir,
+ float tmax,
uint visibility,
int object,
int prim_addr)
@@ -41,7 +46,7 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
float t, u, v;
if (ray_triangle_intersect(P,
dir,
- isect->t,
+ tmax,
#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
ssef_verts,
#else
@@ -78,7 +83,7 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
*/
#ifdef __BVH_LOCAL__
-ccl_device_inline bool triangle_intersect_local(KernelGlobals *kg,
+ccl_device_inline bool triangle_intersect_local(const KernelGlobals *kg,
LocalIntersection *local_isect,
float3 P,
float3 dir,
@@ -192,25 +197,20 @@ ccl_device_inline bool triangle_intersect_local(KernelGlobals *kg,
* http://www.cs.virginia.edu/~gfx/Courses/2003/ImageSynthesis/papers/Acceleration/Fast%20MinimumStorage%20RayTriangle%20Intersection.pdf
*/
-ccl_device_inline float3 triangle_refine(KernelGlobals *kg,
+ccl_device_inline float3 triangle_refine(const KernelGlobals *kg,
ShaderData *sd,
- const Intersection *isect,
- const Ray *ray)
+ float3 P,
+ float3 D,
+ float t,
+ const int isect_object,
+ const int isect_prim)
{
- float3 P = ray->P;
- float3 D = ray->D;
- float t = isect->t;
-
#ifdef __INTERSECTION_REFINE__
- if (isect->object != OBJECT_NONE) {
+ if (isect_object != OBJECT_NONE) {
if (UNLIKELY(t == 0.0f)) {
return P;
}
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_itfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-# endif
+ const Transform tfm = object_get_inverse_transform(kg, sd);
P = transform_point(&tfm, P);
D = transform_direction(&tfm, D * t);
@@ -219,7 +219,7 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg,
P = P + D * t;
- const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect->prim);
+ const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect_prim);
const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 0),
tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 1),
tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 2);
@@ -239,13 +239,8 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg,
P = P + D * rt;
}
- if (isect->object != OBJECT_NONE) {
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_tfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-# endif
-
+ if (isect_object != OBJECT_NONE) {
+ const Transform tfm = object_get_transform(kg, sd);
P = transform_point(&tfm, P);
}
@@ -255,28 +250,23 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg,
#endif
}
-/* Same as above, except that isect->t is assumed to be in object space for
+/* Same as above, except that t is assumed to be in object space for
* instancing.
*/
-ccl_device_inline float3 triangle_refine_local(KernelGlobals *kg,
+ccl_device_inline float3 triangle_refine_local(const KernelGlobals *kg,
ShaderData *sd,
- const Intersection *isect,
- const Ray *ray)
+ float3 P,
+ float3 D,
+ float t,
+ const int isect_object,
+ const int isect_prim)
{
#ifdef __KERNEL_OPTIX__
- /* isect->t is always in world space with OptiX. */
- return triangle_refine(kg, sd, isect, ray);
+ /* t is always in world space with OptiX. */
+ return triangle_refine(kg, sd, P, D, t, isect_object, isect_prim);
#else
- float3 P = ray->P;
- float3 D = ray->D;
- float t = isect->t;
-
- if (isect->object != OBJECT_NONE) {
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_itfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-# endif
+ if (isect_object != OBJECT_NONE) {
+ const Transform tfm = object_get_inverse_transform(kg, sd);
P = transform_point(&tfm, P);
D = transform_direction(&tfm, D);
@@ -286,7 +276,7 @@ ccl_device_inline float3 triangle_refine_local(KernelGlobals *kg,
P = P + D * t;
# ifdef __INTERSECTION_REFINE__
- const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect->prim);
+ const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect_prim);
const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 0),
tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 1),
tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 2);
@@ -307,13 +297,8 @@ ccl_device_inline float3 triangle_refine_local(KernelGlobals *kg,
}
# endif /* __INTERSECTION_REFINE__ */
- if (isect->object != OBJECT_NONE) {
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_tfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-# endif
-
+ if (isect_object != OBJECT_NONE) {
+ const Transform tfm = object_get_transform(kg, sd);
P = transform_point(&tfm, P);
}
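With this change the intersection functions take the current closest distance as an explicit tmax argument instead of reading isect->t. A caller-side sketch of the resulting pattern (illustrative; the loop over leaf primitives and the locals are hypothetical, not part of this commit):

Intersection isect;
float tmax = ray->t; /* start with the full ray extent */
/* for each candidate primitive in a BVH leaf: */
if (triangle_intersect(kg, &isect, P, dir, tmax, visibility, object, prim_addr)) {
  tmax = isect.t; /* shrink the interval so later tests only accept closer hits */
}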
diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h
index 809b76245ba..2bcd7e56b5f 100644
--- a/intern/cycles/kernel/geom/geom_volume.h
+++ b/intern/cycles/kernel/geom/geom_volume.h
@@ -23,13 +23,15 @@
* 3D voxel textures can be assigned as attributes per mesh, which means the
* same shader can be used for volume objects with different densities, etc. */
+#pragma once
+
CCL_NAMESPACE_BEGIN
#ifdef __VOLUME__
/* Return position normalized to 0..1 in mesh bounds */
-ccl_device_inline float3 volume_normalized_position(KernelGlobals *kg,
+ccl_device_inline float3 volume_normalized_position(const KernelGlobals *kg,
const ShaderData *sd,
float3 P)
{
@@ -68,7 +70,7 @@ ccl_device float3 volume_attribute_value_to_float3(const float4 value)
}
}
-ccl_device float4 volume_attribute_float4(KernelGlobals *kg,
+ccl_device float4 volume_attribute_float4(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc)
{
diff --git a/intern/cycles/kernel/integrator/integrator_init_from_bake.h b/intern/cycles/kernel/integrator/integrator_init_from_bake.h
new file mode 100644
index 00000000000..96db606cee1
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_init_from_bake.h
@@ -0,0 +1,182 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/kernel_accumulate.h"
+#include "kernel/kernel_adaptive_sampling.h"
+#include "kernel/kernel_camera.h"
+#include "kernel/kernel_path_state.h"
+#include "kernel/kernel_random.h"
+
+#include "kernel/geom/geom.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* This helps with AA, but it's not the real solution since it does not AA the geometry;
+ * still, it's better than nothing, thus committed. */
+ccl_device_inline float bake_clamp_mirror_repeat(float u, float max)
+{
+ /* Use mirror repeat (like an OpenGL texture) so that if the barycentric
+ * coordinate goes past the end of the triangle it is not always clamped
+ * to the same value, which gives ugly patterns. */
+ u /= max;
+ float fu = floorf(u);
+ u = u - fu;
+
+ return ((((int)fu) & 1) ? 1.0f - u : u) * max;
+}
+
+/* Return false to indicate that this pixel is finished.
+ * Used by the CPU implementation to not attempt to sample the pixel for multiple samples once
+ * it is known that the pixel has converged. */
+ccl_device bool integrator_init_from_bake(INTEGRATOR_STATE_ARGS,
+ const ccl_global KernelWorkTile *ccl_restrict tile,
+ ccl_global float *render_buffer,
+ const int x,
+ const int y,
+ const int scheduled_sample)
+{
+ PROFILING_INIT(kg, PROFILING_RAY_SETUP);
+
+ /* Initialize path state to give basic buffer access and allow early outputs. */
+ path_state_init(INTEGRATOR_STATE_PASS, tile, x, y);
+
+ /* Check whether the pixel has converged and should not be sampled anymore. */
+ if (!kernel_need_sample_pixel(INTEGRATOR_STATE_PASS, render_buffer)) {
+ return false;
+ }
+
+ /* Always count the sample, even if the camera sample will reject the ray. */
+ const int sample = kernel_accum_sample(INTEGRATOR_STATE_PASS, render_buffer, scheduled_sample);
+
+ /* Setup render buffers. */
+ const int index = INTEGRATOR_STATE(path, render_pixel_index);
+ const int pass_stride = kernel_data.film.pass_stride;
+ render_buffer += index * pass_stride;
+
+ ccl_global float *primitive = render_buffer + kernel_data.film.pass_bake_primitive;
+ ccl_global float *differential = render_buffer + kernel_data.film.pass_bake_differential;
+
+ const int seed = __float_as_uint(primitive[0]);
+ int prim = __float_as_uint(primitive[1]);
+ if (prim == -1) {
+ return false;
+ }
+
+ prim += kernel_data.bake.tri_offset;
+
+ /* Random number generator. */
+ const uint rng_hash = hash_uint(seed) ^ kernel_data.integrator.seed;
+
+ float filter_x, filter_y;
+ if (sample == 0) {
+ filter_x = filter_y = 0.5f;
+ }
+ else {
+ path_rng_2D(kg, rng_hash, sample, PRNG_FILTER_U, &filter_x, &filter_y);
+ }
+
+ /* Initialize path state for path integration. */
+ path_state_init_integrator(INTEGRATOR_STATE_PASS, sample, rng_hash);
+
+ /* Barycentric UV with sub-pixel offset. */
+ float u = primitive[2];
+ float v = primitive[3];
+
+ float dudx = differential[0];
+ float dudy = differential[1];
+ float dvdx = differential[2];
+ float dvdy = differential[3];
+
+ if (sample > 0) {
+ u = bake_clamp_mirror_repeat(u + dudx * (filter_x - 0.5f) + dudy * (filter_y - 0.5f), 1.0f);
+ v = bake_clamp_mirror_repeat(v + dvdx * (filter_x - 0.5f) + dvdy * (filter_y - 0.5f),
+ 1.0f - u);
+ }
+
+ /* Position and normal on triangle. */
+ float3 P, Ng;
+ int shader;
+ triangle_point_normal(kg, kernel_data.bake.object_index, prim, u, v, &P, &Ng, &shader);
+ if (kernel_data.film.pass_background != PASS_UNUSED) {
+ /* Environment baking. */
+
+ /* Setup and write ray. */
+ Ray ray ccl_optional_struct_init;
+ ray.P = zero_float3();
+ ray.D = normalize(P);
+ ray.t = FLT_MAX;
+ ray.time = 0.5f;
+ ray.dP = differential_zero_compact();
+ ray.dD = differential_zero_compact();
+ integrator_state_write_ray(INTEGRATOR_STATE_PASS, &ray);
+
+ /* Setup next kernel to execute. */
+ INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+ }
+ else {
+ /* Surface baking. */
+ const float3 N = (shader & SHADER_SMOOTH_NORMAL) ? triangle_smooth_normal(kg, Ng, prim, u, v) :
+ Ng;
+
+ /* Setup ray. */
+ Ray ray ccl_optional_struct_init;
+ ray.P = P + N;
+ ray.D = -N;
+ ray.t = FLT_MAX;
+ ray.time = 0.5f;
+
+ /* Setup differentials. */
+ float3 dPdu, dPdv;
+ triangle_dPdudv(kg, prim, &dPdu, &dPdv);
+ differential3 dP;
+ dP.dx = dPdu * dudx + dPdv * dvdx;
+ dP.dy = dPdu * dudy + dPdv * dvdy;
+ ray.dP = differential_make_compact(dP);
+ ray.dD = differential_zero_compact();
+
+ /* Write ray. */
+ integrator_state_write_ray(INTEGRATOR_STATE_PASS, &ray);
+
+ /* Setup and write intersection. */
+ Intersection isect ccl_optional_struct_init;
+ isect.object = kernel_data.bake.object_index;
+ isect.prim = prim;
+ isect.u = u;
+ isect.v = v;
+ isect.t = 1.0f;
+ isect.type = PRIMITIVE_TRIANGLE;
+#ifdef __EMBREE__
+ isect.Ng = Ng;
+#endif
+ integrator_state_write_isect(INTEGRATOR_STATE_PASS, &isect);
+
+ /* Setup next kernel to execute. */
+ const int shader_index = shader & SHADER_MASK;
+ const int shader_flags = kernel_tex_fetch(__shaders, shader_index).flags;
+ if ((shader_flags & SD_HAS_RAYTRACE) || (kernel_data.film.pass_ao != PASS_UNUSED)) {
+ INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader_index);
+ }
+ else {
+ INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader_index);
+ }
+ }
+
+ return true;
+}
+
+CCL_NAMESPACE_END
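
The mirror-repeat clamp above is easiest to see on concrete values. Below is a minimal standalone C++ sketch, not part of the patch, that reproduces the same folding of barycentric coordinates pushed outside the triangle by the sub-pixel filter offset; the function name is hypothetical.

// Standalone sketch of the mirror-repeat clamp used for bake anti-aliasing.
#include <cmath>
#include <cstdio>

static float clamp_mirror_repeat(float u, float max)
{
  // Normalize, then mirror every other period so offsets that leave the
  // triangle map to different (mirrored) values instead of all saturating
  // at the same edge value.
  u /= max;
  const float fu = std::floor(u);
  u = u - fu;
  return ((static_cast<int>(fu) & 1) ? 1.0f - u : u) * max;
}

int main()
{
  // Values slightly outside [0, 1] fold back inside instead of clamping.
  for (const float u : {-0.1f, 0.25f, 0.9f, 1.1f, 2.3f}) {
    std::printf("u = %5.2f -> %5.2f\n", u, clamp_mirror_repeat(u, 1.0f));
  }
  return 0;
}
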
diff --git a/intern/cycles/kernel/integrator/integrator_init_from_camera.h b/intern/cycles/kernel/integrator/integrator_init_from_camera.h
new file mode 100644
index 00000000000..58e7bde4c94
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_init_from_camera.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/kernel_accumulate.h"
+#include "kernel/kernel_adaptive_sampling.h"
+#include "kernel/kernel_camera.h"
+#include "kernel/kernel_path_state.h"
+#include "kernel/kernel_random.h"
+#include "kernel/kernel_shadow_catcher.h"
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void integrate_camera_sample(const KernelGlobals *ccl_restrict kg,
+ const int sample,
+ const int x,
+ const int y,
+ const uint rng_hash,
+ Ray *ray)
+{
+ /* Filter sampling. */
+ float filter_u, filter_v;
+
+ if (sample == 0) {
+ filter_u = 0.5f;
+ filter_v = 0.5f;
+ }
+ else {
+ path_rng_2D(kg, rng_hash, sample, PRNG_FILTER_U, &filter_u, &filter_v);
+ }
+
+ /* Depth of field sampling. */
+ float lens_u = 0.0f, lens_v = 0.0f;
+ if (kernel_data.cam.aperturesize > 0.0f) {
+ path_rng_2D(kg, rng_hash, sample, PRNG_LENS_U, &lens_u, &lens_v);
+ }
+
+ /* Motion blur time sampling. */
+ float time = 0.0f;
+#ifdef __CAMERA_MOTION__
+ if (kernel_data.cam.shuttertime != -1.0f)
+ time = path_rng_1D(kg, rng_hash, sample, PRNG_TIME);
+#endif
+
+ /* Generate camera ray. */
+ camera_sample(kg, x, y, filter_u, filter_v, lens_u, lens_v, time, ray);
+}
+
+/* Return false to indicate that this pixel is finished.
+ * Used by the CPU implementation to not attempt to sample the pixel for multiple samples once
+ * it is known that the pixel has converged. */
+ccl_device bool integrator_init_from_camera(INTEGRATOR_STATE_ARGS,
+ const ccl_global KernelWorkTile *ccl_restrict tile,
+ ccl_global float *render_buffer,
+ const int x,
+ const int y,
+ const int scheduled_sample)
+{
+ PROFILING_INIT(kg, PROFILING_RAY_SETUP);
+
+ /* Initialize path state to give basic buffer access and allow early outputs. */
+ path_state_init(INTEGRATOR_STATE_PASS, tile, x, y);
+
+ /* Check whether the pixel has converged and should not be sampled anymore. */
+ if (!kernel_need_sample_pixel(INTEGRATOR_STATE_PASS, render_buffer)) {
+ return false;
+ }
+
+ /* Count the sample and get an effective sample for this pixel.
+ *
+ * This logic makes it possible both to count the actual number of samples per pixel, and to add
+ * samples to this pixel after it has converged while samples were being added elsewhere (in
+ * which case `scheduled_sample` will differ from the actual number of samples in this pixel). */
+ const int sample = kernel_accum_sample(INTEGRATOR_STATE_PASS, render_buffer, scheduled_sample);
+
+ /* Initialize random number seed for path. */
+ const uint rng_hash = path_rng_hash_init(kg, sample, x, y);
+
+ {
+ /* Generate camera ray. */
+ Ray ray;
+ integrate_camera_sample(kg, sample, x, y, rng_hash, &ray);
+ if (ray.t == 0.0f) {
+ return true;
+ }
+
+ /* Write camera ray to state. */
+ integrator_state_write_ray(INTEGRATOR_STATE_PASS, &ray);
+ }
+
+ /* Initialize path state for path integration. */
+ path_state_init_integrator(INTEGRATOR_STATE_PASS, sample, rng_hash);
+
+ /* Continue with intersect_closest kernel, optionally initializing volume
+ * stack before that if the camera may be inside a volume. */
+ if (kernel_data.cam.is_inside_volume) {
+ INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
+ }
+ else {
+ INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
+ }
+
+ return true;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_intersect_closest.h b/intern/cycles/kernel/integrator/integrator_intersect_closest.h
new file mode 100644
index 00000000000..34ca6814534
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_intersect_closest.h
@@ -0,0 +1,248 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/kernel_differential.h"
+#include "kernel/kernel_light.h"
+#include "kernel/kernel_path_state.h"
+#include "kernel/kernel_projection.h"
+#include "kernel/kernel_shadow_catcher.h"
+
+#include "kernel/geom/geom.h"
+
+#include "kernel/bvh/bvh.h"
+
+CCL_NAMESPACE_BEGIN
+
+template<uint32_t current_kernel>
+ccl_device_forceinline bool integrator_intersect_terminate(INTEGRATOR_STATE_ARGS,
+ const int shader_flags)
+{
+
+ /* Optional AO bounce termination.
+ * We continue evaluating emissive/transparent surfaces and volumes, similar
+ * to direct lighting. Only if we know there are none can we terminate the
+ * path immediately. */
+ if (path_state_ao_bounce(INTEGRATOR_STATE_PASS)) {
+ if (shader_flags & (SD_HAS_TRANSPARENT_SHADOW | SD_HAS_EMISSION)) {
+ INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
+ }
+ else if (!integrator_state_volume_stack_is_empty(INTEGRATOR_STATE_PASS)) {
+ INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_TERMINATE_AFTER_VOLUME;
+ }
+ else {
+ return true;
+ }
+ }
+
+ /* Load random number state. */
+ RNGState rng_state;
+ path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state);
+
+ /* We perform path termination in this kernel to avoid launching shade_surface
+ * and evaluating the shader when not needed. Only for emission and transparent
+ * surfaces in front of emission do we need to evaluate the shader, since we
+ * perform MIS as part of indirect rays. */
+ const int path_flag = INTEGRATOR_STATE(path, flag);
+ const float probability = path_state_continuation_probability(INTEGRATOR_STATE_PASS, path_flag);
+
+ if (probability != 1.0f) {
+ const float terminate = path_state_rng_1D(kg, &rng_state, PRNG_TERMINATE);
+
+ if (probability == 0.0f || terminate >= probability) {
+ if (shader_flags & SD_HAS_EMISSION) {
+ /* Mark path to be terminated right after shader evaluation on the surface. */
+ INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_TERMINATE_ON_NEXT_SURFACE;
+ }
+ else if (!integrator_state_volume_stack_is_empty(INTEGRATOR_STATE_PASS)) {
+ /* TODO: only do this for emissive volumes. */
+ INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_TERMINATE_IN_NEXT_VOLUME;
+ }
+ else {
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+/* Note that current_kernel is a template value since making this a variable
+ * leads to poor performance with CUDA atomics. */
+template<uint32_t current_kernel>
+ccl_device_forceinline void integrator_intersect_shader_next_kernel(
+ INTEGRATOR_STATE_ARGS,
+ const Intersection *ccl_restrict isect,
+ const int shader,
+ const int shader_flags)
+{
+ /* Note on scheduling.
+ *
+ * When there is no shadow catcher split, the scheduling is simple: schedule surface shading with
+ * or without raytrace support, depending on the shader used.
+ *
+ * When there is a shadow catcher split, the general idea is to have the following configuration:
+ *
+ * - Schedule surface shading kernel (with corresponding raytrace support) for the ray which
+ * will trace shadow catcher object.
+ *
+ * - When no alpha-over of approximate shadow catcher is needed, schedule surface shading for
+ * the matte ray.
+ *
+ * - Otherwise schedule background shading kernel, so that we have a background to alpha-over
+ * on. The background kernel will then schedule surface shading for the matte ray.
+ *
+ * Note that the splitting leaves kernel and sorting counters as-is, so use INIT semantic for
+ * the matte path. */
+
+ const bool use_raytrace_kernel = ((shader_flags & SD_HAS_RAYTRACE) ||
+ (kernel_data.film.pass_ao != PASS_UNUSED));
+
+ if (use_raytrace_kernel) {
+ INTEGRATOR_PATH_NEXT_SORTED(
+ current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
+ }
+ else {
+ INTEGRATOR_PATH_NEXT_SORTED(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
+ }
+
+#ifdef __SHADOW_CATCHER__
+ const int object_flags = intersection_get_object_flags(kg, isect);
+ if (kernel_shadow_catcher_split(INTEGRATOR_STATE_PASS, object_flags)) {
+ if (kernel_data.film.use_approximate_shadow_catcher && !kernel_data.background.transparent) {
+ INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_SHADOW_CATCHER_BACKGROUND;
+
+ INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+ }
+ else if (use_raytrace_kernel) {
+ INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
+ }
+ else {
+ INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
+ }
+ }
+#endif
+}
+
+ccl_device void integrator_intersect_closest(INTEGRATOR_STATE_ARGS)
+{
+ PROFILING_INIT(kg, PROFILING_INTERSECT_CLOSEST);
+
+ /* Read ray from integrator state into local memory. */
+ Ray ray ccl_optional_struct_init;
+ integrator_state_read_ray(INTEGRATOR_STATE_PASS, &ray);
+ kernel_assert(ray.t != 0.0f);
+
+ const uint visibility = path_state_ray_visibility(INTEGRATOR_STATE_PASS);
+ const int last_isect_prim = INTEGRATOR_STATE(isect, prim);
+ const int last_isect_object = INTEGRATOR_STATE(isect, object);
+
+ /* Trick to use short AO rays to approximate indirect light at the end of the path. */
+ if (path_state_ao_bounce(INTEGRATOR_STATE_PASS)) {
+ ray.t = kernel_data.integrator.ao_bounces_distance;
+
+ const int last_object = last_isect_object != OBJECT_NONE ?
+ last_isect_object :
+ kernel_tex_fetch(__prim_object, last_isect_prim);
+ const float object_ao_distance = kernel_tex_fetch(__objects, last_object).ao_distance;
+ if (object_ao_distance != 0.0f) {
+ ray.t = object_ao_distance;
+ }
+ }
+
+ /* Scene Intersection. */
+ Intersection isect ccl_optional_struct_init;
+ bool hit = scene_intersect(kg, &ray, visibility, &isect);
+
+ /* TODO: remove this and do it in the various intersection functions instead. */
+ if (!hit) {
+ isect.prim = PRIM_NONE;
+ }
+
+ /* Light intersection for MIS. */
+ if (kernel_data.integrator.use_lamp_mis) {
+ /* NOTE: if we make lights visible to camera rays, we'll need to initialize
+ * these in path_state_init. */
+ const int last_type = INTEGRATOR_STATE(isect, type);
+ const int path_flag = INTEGRATOR_STATE(path, flag);
+
+ hit = lights_intersect(
+ kg, &ray, &isect, last_isect_prim, last_isect_object, last_type, path_flag) ||
+ hit;
+ }
+
+ /* Write intersection result into global integrator state memory. */
+ integrator_state_write_isect(INTEGRATOR_STATE_PASS, &isect);
+
+#ifdef __VOLUME__
+ if (!integrator_state_volume_stack_is_empty(INTEGRATOR_STATE_PASS)) {
+ const bool hit_surface = hit && !(isect.type & PRIMITIVE_LAMP);
+ const int shader = (hit_surface) ? intersection_get_shader(kg, &isect) : SHADER_NONE;
+ const int flags = (hit_surface) ? kernel_tex_fetch(__shaders, shader).flags : 0;
+
+ if (!integrator_intersect_terminate<DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST>(
+ INTEGRATOR_STATE_PASS, flags)) {
+ /* Continue with the volume kernel if we are inside a volume, regardless
+ * of whether we hit anything. */
+ INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
+ }
+ else {
+ INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
+ }
+ return;
+ }
+#endif
+
+ if (hit) {
+ /* Hit a surface, continue with light or surface kernel. */
+ if (isect.type & PRIMITIVE_LAMP) {
+ INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
+ return;
+ }
+ else {
+ /* Hit a surface, continue with surface kernel unless terminated. */
+ const int shader = intersection_get_shader(kg, &isect);
+ const int flags = kernel_tex_fetch(__shaders, shader).flags;
+
+ if (!integrator_intersect_terminate<DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST>(
+ INTEGRATOR_STATE_PASS, flags)) {
+ integrator_intersect_shader_next_kernel<DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST>(
+ INTEGRATOR_STATE_PASS, &isect, shader, flags);
+ return;
+ }
+ else {
+ INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
+ return;
+ }
+ }
+ }
+ else {
+ /* Nothing hit, continue with background kernel. */
+ INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+ return;
+ }
+}
+
+CCL_NAMESPACE_END
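
The probabilistic termination above (a continuation probability compared against a random terminate value) is standard Russian roulette. The following standalone C++ sketch, not part of the patch and with hypothetical constants, shows the idea in isolation: surviving paths are re-weighted by 1/probability so the estimator stays unbiased (in the kernel itself the re-weighting happens elsewhere, in the shading stage).

// Standalone Russian-roulette sketch, illustration only.
#include <cstdio>
#include <random>

int main()
{
  std::mt19937 rng(42);
  std::uniform_real_distribution<float> uniform(0.0f, 1.0f);

  const float probability = 0.6f; /* Continuation probability per bounce. */
  float throughput = 1.0f;

  for (int bounce = 0; bounce < 16; bounce++) {
    const float terminate = uniform(rng);
    if (probability == 0.0f || terminate >= probability) {
      std::printf("terminated at bounce %d\n", bounce);
      return 0;
    }
    /* Surviving paths compensate for the terminated ones, keeping the expected value intact. */
    throughput /= probability;
    std::printf("bounce %d survives, throughput weight %.3f\n", bounce, throughput);
  }
  return 0;
}
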
diff --git a/intern/cycles/kernel/integrator/integrator_intersect_shadow.h b/intern/cycles/kernel/integrator/integrator_intersect_shadow.h
new file mode 100644
index 00000000000..5bd9cfda4a4
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_intersect_shadow.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Visibility for the shadow ray. */
+ccl_device_forceinline uint integrate_intersect_shadow_visibility(INTEGRATOR_STATE_CONST_ARGS)
+{
+ uint visibility = PATH_RAY_SHADOW;
+
+#ifdef __SHADOW_CATCHER__
+ const uint32_t path_flag = INTEGRATOR_STATE(shadow_path, flag);
+ visibility = SHADOW_CATCHER_PATH_VISIBILITY(path_flag, visibility);
+#endif
+
+ return visibility;
+}
+
+ccl_device bool integrate_intersect_shadow_opaque(INTEGRATOR_STATE_ARGS,
+ const Ray *ray,
+ const uint visibility)
+{
+ /* Mask which will pick only the opaque visibility bits from `visibility`.
+ * Calculate the mask at compile time: the visibility will either be the high bits for the
+ * shadow catcher objects, or the lower bits for the regular objects (there is no need to check
+ * the path state here again). */
+ constexpr const uint opaque_mask = SHADOW_CATCHER_VISIBILITY_SHIFT(PATH_RAY_SHADOW_OPAQUE) |
+ PATH_RAY_SHADOW_OPAQUE;
+
+ Intersection isect;
+ const bool opaque_hit = scene_intersect(kg, ray, visibility & opaque_mask, &isect);
+
+ if (!opaque_hit) {
+ INTEGRATOR_STATE_WRITE(shadow_path, num_hits) = 0;
+ }
+
+ return opaque_hit;
+}
+
+ccl_device_forceinline int integrate_shadow_max_transparent_hits(INTEGRATOR_STATE_CONST_ARGS)
+{
+ const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
+ const int transparent_bounce = INTEGRATOR_STATE(shadow_path, transparent_bounce);
+
+ return max(transparent_max_bounce - transparent_bounce - 1, 0);
+}
+
+#ifdef __TRANSPARENT_SHADOWS__
+ccl_device bool integrate_intersect_shadow_transparent(INTEGRATOR_STATE_ARGS,
+ const Ray *ray,
+ const uint visibility)
+{
+ Intersection isect[INTEGRATOR_SHADOW_ISECT_SIZE];
+
+ /* Limit the number of hits to the maximum transparent bounces allowed and the size that we
+ * have available in the integrator state. */
+ const uint max_transparent_hits = integrate_shadow_max_transparent_hits(INTEGRATOR_STATE_PASS);
+ const uint max_hits = min(max_transparent_hits, (uint)INTEGRATOR_SHADOW_ISECT_SIZE);
+ uint num_hits = 0;
+ bool opaque_hit = scene_intersect_shadow_all(kg, ray, isect, visibility, max_hits, &num_hits);
+
+ /* If the number of hits exceeds the transparent bounces limit, make the shadow opaque. */
+ if (num_hits > max_transparent_hits) {
+ opaque_hit = true;
+ }
+
+ if (!opaque_hit) {
+ uint num_recorded_hits = min(num_hits, max_hits);
+
+ if (num_recorded_hits > 0) {
+ sort_intersections(isect, num_recorded_hits);
+
+ /* Write intersection result into global integrator state memory. */
+ for (int hit = 0; hit < num_recorded_hits; hit++) {
+ integrator_state_write_shadow_isect(INTEGRATOR_STATE_PASS, &isect[hit], hit);
+ }
+ }
+
+ INTEGRATOR_STATE_WRITE(shadow_path, num_hits) = num_hits;
+ }
+ else {
+ INTEGRATOR_STATE_WRITE(shadow_path, num_hits) = 0;
+ }
+
+ return opaque_hit;
+}
+#endif
+
+ccl_device void integrator_intersect_shadow(INTEGRATOR_STATE_ARGS)
+{
+ PROFILING_INIT(kg, PROFILING_INTERSECT_SHADOW);
+
+ /* Read ray from integrator state into local memory. */
+ Ray ray ccl_optional_struct_init;
+ integrator_state_read_shadow_ray(INTEGRATOR_STATE_PASS, &ray);
+
+ /* Compute visibility. */
+ const uint visibility = integrate_intersect_shadow_visibility(INTEGRATOR_STATE_PASS);
+
+#ifdef __TRANSPARENT_SHADOWS__
+ /* TODO: compile different kernels depending on this? Especially for OptiX,
+ * conditional trace calls are bad. */
+ const bool opaque_hit =
+ (kernel_data.integrator.transparent_shadows) ?
+ integrate_intersect_shadow_transparent(INTEGRATOR_STATE_PASS, &ray, visibility) :
+ integrate_intersect_shadow_opaque(INTEGRATOR_STATE_PASS, &ray, visibility);
+#else
+ const bool opaque_hit = integrate_intersect_shadow_opaque(
+ INTEGRATOR_STATE_PASS, &ray, visibility);
+#endif
+
+ if (opaque_hit) {
+ /* Hit an opaque surface, shadow path ends here. */
+ INTEGRATOR_SHADOW_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
+ return;
+ }
+ else {
+ /* Hit nothing or only transparent surfaces; continue to the shadow kernel
+ * for shading and render buffer output.
+ *
+ * TODO: could also write to render buffer directly if no transparent shadows?
+ * Could save a kernel execution for the common case. */
+ INTEGRATOR_SHADOW_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
+ return;
+ }
+}
+
+CCL_NAMESPACE_END
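
To make the interplay of the two limits above concrete, here is a small standalone C++ sketch, not part of the patch; the storage size constant is a hypothetical stand-in for INTEGRATOR_SHADOW_ISECT_SIZE.

// Sketch of bounding recorded transparent shadow hits by remaining bounces and storage.
#include <algorithm>
#include <cstdio>

int main()
{
  const int shadow_isect_size = 4;       /* Hypothetical per-path intersection storage. */
  const int transparent_max_bounce = 8;  /* Hypothetical scene setting. */

  for (int transparent_bounce = 0; transparent_bounce < 9; transparent_bounce++) {
    const int max_transparent_hits = std::max(transparent_max_bounce - transparent_bounce - 1, 0);
    const int max_hits = std::min(max_transparent_hits, shadow_isect_size);
    std::printf("bounce %d: record at most %d hits (bounce limit %d)\n",
                transparent_bounce,
                max_hits,
                max_transparent_hits);
  }
  return 0;
}
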
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl b/intern/cycles/kernel/integrator/integrator_intersect_subsurface.h
index c10ecc426c6..7c090952dc7 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl
+++ b/intern/cycles/kernel/integrator/integrator_intersect_subsurface.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2011-2017 Blender Foundation
+ * Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,16 +14,23 @@
* limitations under the License.
*/
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
+#pragma once
-__kernel void kernel_ocl_path_trace_state_buffer_size(
- ccl_global char *kg,
- ccl_constant KernelData *data,
- uint num_threads,
- ccl_global uint64_t *size)
+#include "kernel/integrator/integrator_subsurface.h"
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void integrator_intersect_subsurface(INTEGRATOR_STATE_ARGS)
{
- ((KernelGlobals*)kg)->data = data;
- *size = split_data_buffer_size((KernelGlobals*)kg, num_threads);
+ PROFILING_INIT(kg, PROFILING_INTERSECT_SUBSURFACE);
+
+#ifdef __SUBSURFACE__
+ if (subsurface_scatter(INTEGRATOR_STATE_PASS)) {
+ return;
+ }
+#endif
+
+ INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE);
}
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_intersect_volume_stack.h b/intern/cycles/kernel/integrator/integrator_intersect_volume_stack.h
new file mode 100644
index 00000000000..60d8a8e3e54
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_intersect_volume_stack.h
@@ -0,0 +1,198 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/bvh/bvh.h"
+#include "kernel/geom/geom.h"
+#include "kernel/integrator/integrator_volume_stack.h"
+#include "kernel/kernel_shader.h"
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void integrator_volume_stack_update_for_subsurface(INTEGRATOR_STATE_ARGS,
+ const float3 from_P,
+ const float3 to_P)
+{
+ PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME_STACK);
+
+ ShaderDataTinyStorage stack_sd_storage;
+ ShaderData *stack_sd = AS_SHADER_DATA(&stack_sd_storage);
+
+ kernel_assert(kernel_data.integrator.use_volumes);
+
+ Ray volume_ray ccl_optional_struct_init;
+ volume_ray.P = from_P;
+ volume_ray.D = normalize_len(to_P - from_P, &volume_ray.t);
+
+#ifdef __VOLUME_RECORD_ALL__
+ Intersection hits[2 * VOLUME_STACK_SIZE + 1];
+ uint num_hits = scene_intersect_volume_all(
+ kg, &volume_ray, hits, 2 * VOLUME_STACK_SIZE, PATH_RAY_ALL_VISIBILITY);
+ if (num_hits > 0) {
+ Intersection *isect = hits;
+
+ qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
+
+ for (uint hit = 0; hit < num_hits; ++hit, ++isect) {
+ shader_setup_from_ray(kg, stack_sd, &volume_ray, isect);
+ volume_stack_enter_exit(INTEGRATOR_STATE_PASS, stack_sd);
+ }
+ }
+#else
+ Intersection isect;
+ int step = 0;
+ while (step < 2 * VOLUME_STACK_SIZE &&
+ scene_intersect_volume(kg, &volume_ray, &isect, PATH_RAY_ALL_VISIBILITY)) {
+ shader_setup_from_ray(kg, stack_sd, &volume_ray, &isect);
+ volume_stack_enter_exit(INTEGRATOR_STATE_PASS, stack_sd);
+
+ /* Move ray forward. */
+ volume_ray.P = ray_offset(stack_sd->P, -stack_sd->Ng);
+ if (volume_ray.t != FLT_MAX) {
+ volume_ray.D = normalize_len(to_P - volume_ray.P, &volume_ray.t);
+ }
+ ++step;
+ }
+#endif
+}
+
+ccl_device void integrator_intersect_volume_stack(INTEGRATOR_STATE_ARGS)
+{
+ PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME_STACK);
+
+ ShaderDataTinyStorage stack_sd_storage;
+ ShaderData *stack_sd = AS_SHADER_DATA(&stack_sd_storage);
+
+ Ray volume_ray ccl_optional_struct_init;
+ integrator_state_read_ray(INTEGRATOR_STATE_PASS, &volume_ray);
+ volume_ray.t = FLT_MAX;
+
+ const uint visibility = (INTEGRATOR_STATE(path, flag) & PATH_RAY_ALL_VISIBILITY);
+ int stack_index = 0, enclosed_index = 0;
+
+ /* Write background shader. */
+ if (kernel_data.background.volume_shader != SHADER_NONE) {
+ const VolumeStack new_entry = {OBJECT_NONE, kernel_data.background.volume_shader};
+ integrator_state_write_volume_stack(INTEGRATOR_STATE_PASS, stack_index, new_entry);
+ stack_index++;
+ }
+
+#ifdef __VOLUME_RECORD_ALL__
+ Intersection hits[2 * VOLUME_STACK_SIZE + 1];
+ uint num_hits = scene_intersect_volume_all(
+ kg, &volume_ray, hits, 2 * VOLUME_STACK_SIZE, visibility);
+ if (num_hits > 0) {
+ int enclosed_volumes[VOLUME_STACK_SIZE];
+ Intersection *isect = hits;
+
+ qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
+
+ for (uint hit = 0; hit < num_hits; ++hit, ++isect) {
+ shader_setup_from_ray(kg, stack_sd, &volume_ray, isect);
+ if (stack_sd->flag & SD_BACKFACING) {
+ bool need_add = true;
+ for (int i = 0; i < enclosed_index && need_add; ++i) {
+ /* If the ray exited a volume that it never entered, it means
+ * the camera is inside that volume.
+ */
+ if (enclosed_volumes[i] == stack_sd->object) {
+ need_add = false;
+ }
+ }
+ for (int i = 0; i < stack_index && need_add; ++i) {
+ /* Don't add intersections twice. */
+ VolumeStack entry = integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i);
+ if (entry.object == stack_sd->object) {
+ need_add = false;
+ break;
+ }
+ }
+ if (need_add && stack_index < VOLUME_STACK_SIZE - 1) {
+ const VolumeStack new_entry = {stack_sd->object, stack_sd->shader};
+ integrator_state_write_volume_stack(INTEGRATOR_STATE_PASS, stack_index, new_entry);
+ ++stack_index;
+ }
+ }
+ else {
+ /* If the ray from the camera enters the volume, this volume shouldn't
+ * be added to the stack on exit.
+ */
+ enclosed_volumes[enclosed_index++] = stack_sd->object;
+ }
+ }
+ }
+#else
+ int enclosed_volumes[VOLUME_STACK_SIZE];
+ int step = 0;
+
+ while (stack_index < VOLUME_STACK_SIZE - 1 && enclosed_index < VOLUME_STACK_SIZE - 1 &&
+ step < 2 * VOLUME_STACK_SIZE) {
+ Intersection isect;
+ if (!scene_intersect_volume(kg, &volume_ray, &isect, visibility)) {
+ break;
+ }
+
+ shader_setup_from_ray(kg, stack_sd, &volume_ray, &isect);
+ if (stack_sd->flag & SD_BACKFACING) {
+ /* If the ray exited a volume that it never entered, it means
+ * the camera is inside that volume.
+ */
+ bool need_add = true;
+ for (int i = 0; i < enclosed_index && need_add; ++i) {
+ /* If the ray exited a volume that it never entered, it means
+ * the camera is inside that volume.
+ */
+ if (enclosed_volumes[i] == stack_sd->object) {
+ need_add = false;
+ }
+ }
+ for (int i = 0; i < stack_index && need_add; ++i) {
+ /* Don't add intersections twice. */
+ VolumeStack entry = integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i);
+ if (entry.object == stack_sd->object) {
+ need_add = false;
+ break;
+ }
+ }
+ if (need_add) {
+ const VolumeStack new_entry = {stack_sd->object, stack_sd->shader};
+ integrator_state_write_volume_stack(INTEGRATOR_STATE_PASS, stack_index, new_entry);
+ ++stack_index;
+ }
+ }
+ else {
+ /* If the ray from the camera enters the volume, this volume shouldn't
+ * be added to the stack on exit.
+ */
+ enclosed_volumes[enclosed_index++] = stack_sd->object;
+ }
+
+ /* Move ray forward. */
+ volume_ray.P = ray_offset(stack_sd->P, -stack_sd->Ng);
+ ++step;
+ }
+#endif
+
+ /* Write terminator. */
+ const VolumeStack new_entry = {OBJECT_NONE, SHADER_NONE};
+ integrator_state_write_volume_stack(INTEGRATOR_STATE_PASS, stack_index, new_entry);
+
+ INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK,
+ DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
+}
+
+CCL_NAMESPACE_END
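
The enclosed-volume bookkeeping above can be summarized as: an exit (backfacing) hit for a volume the ray never entered means the camera started inside that volume. The following standalone C++ sketch, not part of the patch and using a made-up hit list, shows that rule in isolation; the kernel additionally deduplicates stack entries and bounds the stack size.

// Sketch of deciding which volumes enclose the camera from boundary hits along the ray.
#include <cstdio>
#include <vector>

struct BoundaryHit {
  int object;
  bool backfacing; /* true = exiting the volume, false = entering it. */
};

int main()
{
  /* Hits sorted along the ray: enter object 1, exit object 1, exit object 2 (never entered). */
  const std::vector<BoundaryHit> hits = {{1, false}, {1, true}, {2, true}};

  std::vector<int> entered; /* Volumes entered along the ray. */
  std::vector<int> stack;   /* Volumes the camera is inside of. */

  for (const BoundaryHit &hit : hits) {
    if (!hit.backfacing) {
      entered.push_back(hit.object);
      continue;
    }
    bool was_entered = false;
    for (const int object : entered) {
      if (object == hit.object) {
        was_entered = true;
        break;
      }
    }
    if (!was_entered) {
      stack.push_back(hit.object);
    }
  }

  for (const int object : stack) {
    std::printf("camera is inside volume object %d\n", object);
  }
  return 0;
}
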
diff --git a/intern/cycles/kernel/integrator/integrator_megakernel.h b/intern/cycles/kernel/integrator/integrator_megakernel.h
new file mode 100644
index 00000000000..91363ea1c7f
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_megakernel.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/integrator/integrator_init_from_camera.h"
+#include "kernel/integrator/integrator_intersect_closest.h"
+#include "kernel/integrator/integrator_intersect_shadow.h"
+#include "kernel/integrator/integrator_intersect_subsurface.h"
+#include "kernel/integrator/integrator_intersect_volume_stack.h"
+#include "kernel/integrator/integrator_shade_background.h"
+#include "kernel/integrator/integrator_shade_light.h"
+#include "kernel/integrator/integrator_shade_shadow.h"
+#include "kernel/integrator/integrator_shade_surface.h"
+#include "kernel/integrator/integrator_shade_volume.h"
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void integrator_megakernel(INTEGRATOR_STATE_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ /* Each kernel indicates the next kernel to execute, so here we simply
+ * have to check what that kernel is and execute it.
+ *
+ * TODO: investigate whether we can use device-side enqueue for GPUs to avoid
+ * having to compile this big kernel. */
+ while (true) {
+ if (INTEGRATOR_STATE(shadow_path, queued_kernel)) {
+ /* First handle any shadow paths before we potentially create more shadow paths. */
+ switch (INTEGRATOR_STATE(shadow_path, queued_kernel)) {
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
+ integrator_intersect_shadow(INTEGRATOR_STATE_PASS);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
+ integrator_shade_shadow(INTEGRATOR_STATE_PASS, render_buffer);
+ break;
+ default:
+ kernel_assert(0);
+ break;
+ }
+ }
+ else if (INTEGRATOR_STATE(path, queued_kernel)) {
+ /* Then handle regular path kernels. */
+ switch (INTEGRATOR_STATE(path, queued_kernel)) {
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
+ integrator_intersect_closest(INTEGRATOR_STATE_PASS);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
+ integrator_shade_background(INTEGRATOR_STATE_PASS, render_buffer);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
+ integrator_shade_surface(INTEGRATOR_STATE_PASS, render_buffer);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME:
+ integrator_shade_volume(INTEGRATOR_STATE_PASS, render_buffer);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
+ integrator_shade_surface_raytrace(INTEGRATOR_STATE_PASS, render_buffer);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
+ integrator_shade_light(INTEGRATOR_STATE_PASS, render_buffer);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
+ integrator_intersect_subsurface(INTEGRATOR_STATE_PASS);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK:
+ integrator_intersect_volume_stack(INTEGRATOR_STATE_PASS);
+ break;
+ default:
+ kernel_assert(0);
+ break;
+ }
+ }
+ else {
+ break;
+ }
+ }
+}
+
+CCL_NAMESPACE_END
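
The megakernel above is essentially a state machine: every kernel writes the id of the next kernel into the path state, and the loop dispatches until nothing is queued. Here is a minimal standalone C++ sketch of that pattern, not part of the patch; the kernel ids and toy transition rules are hypothetical.

// Sketch of a queued-kernel dispatch loop.
#include <cstdio>

enum QueuedKernel { KERNEL_NONE = 0, KERNEL_INTERSECT, KERNEL_SHADE, KERNEL_BACKGROUND };

struct PathState {
  QueuedKernel queued_kernel = KERNEL_INTERSECT;
  int bounce = 0;
};

static void run_kernel(PathState &state)
{
  switch (state.queued_kernel) {
    case KERNEL_INTERSECT:
      /* Pretend the third bounce escapes to the background. */
      state.queued_kernel = (state.bounce < 3) ? KERNEL_SHADE : KERNEL_BACKGROUND;
      break;
    case KERNEL_SHADE:
      state.bounce++;
      state.queued_kernel = KERNEL_INTERSECT;
      break;
    case KERNEL_BACKGROUND:
      state.queued_kernel = KERNEL_NONE; /* Path terminates. */
      break;
    case KERNEL_NONE:
      break;
  }
}

int main()
{
  PathState state;
  while (state.queued_kernel != KERNEL_NONE) {
    std::printf("running kernel %d at bounce %d\n", int(state.queued_kernel), state.bounce);
    run_kernel(state);
  }
  return 0;
}
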
diff --git a/intern/cycles/kernel/integrator/integrator_shade_background.h b/intern/cycles/kernel/integrator/integrator_shade_background.h
new file mode 100644
index 00000000000..3e4cc837e9b
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_shade_background.h
@@ -0,0 +1,215 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/kernel_accumulate.h"
+#include "kernel/kernel_emission.h"
+#include "kernel/kernel_light.h"
+#include "kernel/kernel_shader.h"
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device float3 integrator_eval_background_shader(INTEGRATOR_STATE_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
+{
+#ifdef __BACKGROUND__
+ const int shader = kernel_data.background.surface_shader;
+ const uint32_t path_flag = INTEGRATOR_STATE(path, flag);
+
+ /* Use visibility flag to skip lights. */
+ if (shader & SHADER_EXCLUDE_ANY) {
+ if (((shader & SHADER_EXCLUDE_DIFFUSE) && (path_flag & PATH_RAY_DIFFUSE)) ||
+ ((shader & SHADER_EXCLUDE_GLOSSY) && ((path_flag & (PATH_RAY_GLOSSY | PATH_RAY_REFLECT)) ==
+ (PATH_RAY_GLOSSY | PATH_RAY_REFLECT))) ||
+ ((shader & SHADER_EXCLUDE_TRANSMIT) && (path_flag & PATH_RAY_TRANSMIT)) ||
+ ((shader & SHADER_EXCLUDE_CAMERA) && (path_flag & PATH_RAY_CAMERA)) ||
+ ((shader & SHADER_EXCLUDE_SCATTER) && (path_flag & PATH_RAY_VOLUME_SCATTER)))
+ return zero_float3();
+ }
+
+ /* Use fast constant background color if available. */
+ float3 L = zero_float3();
+ if (!shader_constant_emission_eval(kg, shader, &L)) {
+ /* Evaluate background shader. */
+
+ /* TODO: does aliasing like this break automatic SoA in CUDA?
+ * Should we instead store closures separately from ShaderData? */
+ ShaderDataTinyStorage emission_sd_storage;
+ ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
+
+ PROFILING_INIT_FOR_SHADER(kg, PROFILING_SHADE_LIGHT_SETUP);
+ shader_setup_from_background(kg,
+ emission_sd,
+ INTEGRATOR_STATE(ray, P),
+ INTEGRATOR_STATE(ray, D),
+ INTEGRATOR_STATE(ray, time));
+
+ PROFILING_SHADER(emission_sd->object, emission_sd->shader);
+ PROFILING_EVENT(PROFILING_SHADE_LIGHT_EVAL);
+ shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_LIGHT>(
+ INTEGRATOR_STATE_PASS, emission_sd, render_buffer, path_flag | PATH_RAY_EMISSION);
+
+ L = shader_background_eval(emission_sd);
+ }
+
+ /* Background MIS weights. */
+# ifdef __BACKGROUND_MIS__
+ /* Check if background light exists or if we should skip pdf. */
+ if (!(INTEGRATOR_STATE(path, flag) & PATH_RAY_MIS_SKIP) && kernel_data.background.use_mis) {
+ const float3 ray_P = INTEGRATOR_STATE(ray, P);
+ const float3 ray_D = INTEGRATOR_STATE(ray, D);
+ const float mis_ray_pdf = INTEGRATOR_STATE(path, mis_ray_pdf);
+ const float mis_ray_t = INTEGRATOR_STATE(path, mis_ray_t);
+
+ /* Multiple importance sampling: get the background light pdf for the ray
+ * direction, and compute the weight with respect to the BSDF pdf. */
+ const float pdf = background_light_pdf(kg, ray_P - ray_D * mis_ray_t, ray_D);
+ const float mis_weight = power_heuristic(mis_ray_pdf, pdf);
+
+ L *= mis_weight;
+ }
+# endif
+
+ return L;
+#else
+ return make_float3(0.8f, 0.8f, 0.8f);
+#endif
+}
+
+ccl_device_inline void integrate_background(INTEGRATOR_STATE_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ /* Accumulate transparency for transparent background. We can skip background
+ * shader evaluation unless a background pass is used. */
+ bool eval_background = true;
+ float transparent = 0.0f;
+
+ const bool is_transparent_background_ray = kernel_data.background.transparent &&
+ (INTEGRATOR_STATE(path, flag) &
+ PATH_RAY_TRANSPARENT_BACKGROUND);
+
+ if (is_transparent_background_ray) {
+ transparent = average(INTEGRATOR_STATE(path, throughput));
+
+#ifdef __PASSES__
+ eval_background = (kernel_data.film.light_pass_flag & PASSMASK(BACKGROUND));
+#else
+ eval_background = false;
+#endif
+ }
+
+ /* Evaluate background shader. */
+ float3 L = (eval_background) ?
+ integrator_eval_background_shader(INTEGRATOR_STATE_PASS, render_buffer) :
+ zero_float3();
+
+ /* When using the AO bounces approximation, adjust the background
+ * shader intensity with the AO factor. */
+ if (path_state_ao_bounce(INTEGRATOR_STATE_PASS)) {
+ L *= kernel_data.integrator.ao_bounces_factor;
+ }
+
+ /* Write to render buffer. */
+ kernel_accum_background(
+ INTEGRATOR_STATE_PASS, L, transparent, is_transparent_background_ray, render_buffer);
+}
+
+ccl_device_inline void integrate_distant_lights(INTEGRATOR_STATE_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ const float3 ray_D = INTEGRATOR_STATE(ray, D);
+ const float ray_time = INTEGRATOR_STATE(ray, time);
+ LightSample ls ccl_optional_struct_init;
+ for (int lamp = 0; lamp < kernel_data.integrator.num_all_lights; lamp++) {
+ if (light_sample_from_distant_ray(kg, ray_D, lamp, &ls)) {
+ /* Use visibility flag to skip lights. */
+#ifdef __PASSES__
+ const uint32_t path_flag = INTEGRATOR_STATE(path, flag);
+
+ if (ls.shader & SHADER_EXCLUDE_ANY) {
+ if (((ls.shader & SHADER_EXCLUDE_DIFFUSE) && (path_flag & PATH_RAY_DIFFUSE)) ||
+ ((ls.shader & SHADER_EXCLUDE_GLOSSY) &&
+ ((path_flag & (PATH_RAY_GLOSSY | PATH_RAY_REFLECT)) ==
+ (PATH_RAY_GLOSSY | PATH_RAY_REFLECT))) ||
+ ((ls.shader & SHADER_EXCLUDE_TRANSMIT) && (path_flag & PATH_RAY_TRANSMIT)) ||
+ ((ls.shader & SHADER_EXCLUDE_CAMERA) && (path_flag & PATH_RAY_CAMERA)) ||
+ ((ls.shader & SHADER_EXCLUDE_SCATTER) && (path_flag & PATH_RAY_VOLUME_SCATTER)))
+ return;
+ }
+#endif
+
+ /* Evaluate light shader. */
+ /* TODO: does aliasing like this break automatic SoA in CUDA? */
+ ShaderDataTinyStorage emission_sd_storage;
+ ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
+ float3 light_eval = light_sample_shader_eval(
+ INTEGRATOR_STATE_PASS, emission_sd, &ls, ray_time);
+ if (is_zero(light_eval)) {
+ return;
+ }
+
+ /* MIS weighting. */
+ if (!(path_flag & PATH_RAY_MIS_SKIP)) {
+ /* Multiple importance sampling: get the regular light pdf,
+ * and compute the weight with respect to the BSDF pdf. */
+ const float mis_ray_pdf = INTEGRATOR_STATE(path, mis_ray_pdf);
+ const float mis_weight = power_heuristic(mis_ray_pdf, ls.pdf);
+ light_eval *= mis_weight;
+ }
+
+ /* Write to render buffer. */
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ kernel_accum_emission(INTEGRATOR_STATE_PASS, throughput, light_eval, render_buffer);
+ }
+ }
+}
+
+ccl_device void integrator_shade_background(INTEGRATOR_STATE_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ PROFILING_INIT(kg, PROFILING_SHADE_LIGHT_SETUP);
+
+ /* TODO: unify these in a single loop to only have a single shader evaluation call. */
+ integrate_distant_lights(INTEGRATOR_STATE_PASS, render_buffer);
+ integrate_background(INTEGRATOR_STATE_PASS, render_buffer);
+
+#ifdef __SHADOW_CATCHER__
+ if (INTEGRATOR_STATE(path, flag) & PATH_RAY_SHADOW_CATCHER_BACKGROUND) {
+ INTEGRATOR_STATE_WRITE(path, flag) &= ~PATH_RAY_SHADOW_CATCHER_BACKGROUND;
+
+ const int isect_prim = INTEGRATOR_STATE(isect, prim);
+ const int shader = intersection_get_shader_from_isect_prim(kg, isect_prim);
+ const int shader_flags = kernel_tex_fetch(__shaders, shader).flags;
+
+ if ((shader_flags & SD_HAS_RAYTRACE) || (kernel_data.film.pass_ao != PASS_UNUSED)) {
+ INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE,
+ shader);
+ }
+ else {
+ INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE,
+ shader);
+ }
+ return;
+ }
+#endif
+
+ INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+}
+
+CCL_NAMESPACE_END
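
The MIS weights above combine the BSDF pdf with the light (or background) pdf. A small standalone C++ sketch of the usual beta = 2 power heuristic, not part of the patch, shows how the weight shifts toward whichever sampling strategy has the higher pdf, with the two complementary weights summing to one.

// Sketch of power-heuristic MIS weighting between BSDF and light sampling.
#include <cstdio>

static float power_heuristic(const float a, const float b)
{
  const float t = a * a;
  return t / (t + b * b);
}

int main()
{
  /* When the BSDF pdf dominates the light pdf, the BSDF-sampled contribution
   * keeps most of the weight, and vice versa. */
  const float bsdf_pdf = 4.0f;
  const float light_pdf = 1.0f;
  std::printf("weight for BSDF sample:  %.3f\n", power_heuristic(bsdf_pdf, light_pdf));
  std::printf("weight for light sample: %.3f\n", power_heuristic(light_pdf, bsdf_pdf));
  return 0;
}
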
diff --git a/intern/cycles/kernel/integrator/integrator_shade_light.h b/intern/cycles/kernel/integrator/integrator_shade_light.h
new file mode 100644
index 00000000000..05b530f9665
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_shade_light.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/kernel_accumulate.h"
+#include "kernel/kernel_emission.h"
+#include "kernel/kernel_light.h"
+#include "kernel/kernel_shader.h"
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void integrate_light(INTEGRATOR_STATE_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ /* Setup light sample. */
+ Intersection isect ccl_optional_struct_init;
+ integrator_state_read_isect(INTEGRATOR_STATE_PASS, &isect);
+
+ float3 ray_P = INTEGRATOR_STATE(ray, P);
+ const float3 ray_D = INTEGRATOR_STATE(ray, D);
+ const float ray_time = INTEGRATOR_STATE(ray, time);
+
+ /* Advance ray beyond light. */
+ /* TODO: can we make this more numerically robust to avoid reintersecting the
+ * same light in some cases? */
+ const float3 new_ray_P = ray_offset(ray_P + ray_D * isect.t, ray_D);
+ INTEGRATOR_STATE_WRITE(ray, P) = new_ray_P;
+ INTEGRATOR_STATE_WRITE(ray, t) -= isect.t;
+
+ /* Set position to where the BSDF was sampled, for correct MIS PDF. */
+ const float mis_ray_t = INTEGRATOR_STATE(path, mis_ray_t);
+ ray_P -= ray_D * mis_ray_t;
+ isect.t += mis_ray_t;
+ INTEGRATOR_STATE_WRITE(path, mis_ray_t) = mis_ray_t + isect.t;
+
+ LightSample ls ccl_optional_struct_init;
+ const bool use_light_sample = light_sample_from_intersection(kg, &isect, ray_P, ray_D, &ls);
+
+ if (!use_light_sample) {
+ return;
+ }
+
+ /* Use visibility flag to skip lights. */
+#ifdef __PASSES__
+ const uint32_t path_flag = INTEGRATOR_STATE(path, flag);
+
+ if (ls.shader & SHADER_EXCLUDE_ANY) {
+ if (((ls.shader & SHADER_EXCLUDE_DIFFUSE) && (path_flag & PATH_RAY_DIFFUSE)) ||
+ ((ls.shader & SHADER_EXCLUDE_GLOSSY) &&
+ ((path_flag & (PATH_RAY_GLOSSY | PATH_RAY_REFLECT)) ==
+ (PATH_RAY_GLOSSY | PATH_RAY_REFLECT))) ||
+ ((ls.shader & SHADER_EXCLUDE_TRANSMIT) && (path_flag & PATH_RAY_TRANSMIT)) ||
+ ((ls.shader & SHADER_EXCLUDE_SCATTER) && (path_flag & PATH_RAY_VOLUME_SCATTER)))
+ return;
+ }
+#endif
+
+ /* Evaluate light shader. */
+ /* TODO: does aliasing like this break automatic SoA in CUDA? */
+ ShaderDataTinyStorage emission_sd_storage;
+ ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
+ float3 light_eval = light_sample_shader_eval(INTEGRATOR_STATE_PASS, emission_sd, &ls, ray_time);
+ if (is_zero(light_eval)) {
+ return;
+ }
+
+ /* MIS weighting. */
+ if (!(path_flag & PATH_RAY_MIS_SKIP)) {
+ /* Multiple importance sampling: get the regular light pdf,
+ * and compute the weight with respect to the BSDF pdf. */
+ const float mis_ray_pdf = INTEGRATOR_STATE(path, mis_ray_pdf);
+ const float mis_weight = power_heuristic(mis_ray_pdf, ls.pdf);
+ light_eval *= mis_weight;
+ }
+
+ /* Write to render buffer. */
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ kernel_accum_emission(INTEGRATOR_STATE_PASS, throughput, light_eval, render_buffer);
+}
+
+ccl_device void integrator_shade_light(INTEGRATOR_STATE_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ PROFILING_INIT(kg, PROFILING_SHADE_LIGHT_SETUP);
+
+ integrate_light(INTEGRATOR_STATE_PASS, render_buffer);
+
+ /* TODO: we could get stuck in an infinite loop if there are precision issues
+ * and the same light is hit again.
+ *
+ * As a workaround, count this as a transparent bounce. It makes some sense
+ * to interpret lights as transparent surfaces (and support making them opaque),
+ * but this needs to be revisited. */
+ uint32_t transparent_bounce = INTEGRATOR_STATE(path, transparent_bounce) + 1;
+ INTEGRATOR_STATE_WRITE(path, transparent_bounce) = transparent_bounce;
+
+ if (transparent_bounce >= kernel_data.integrator.transparent_max_bounce) {
+ INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
+ return;
+ }
+ else {
+ INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT,
+ DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
+ return;
+ }
+
+ /* TODO: in some cases we could continue directly to SHADE_BACKGROUND, but
+ * that optimization is probably not practical if we add lights to
+ * scene geometry. */
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_shade_shadow.h b/intern/cycles/kernel/integrator/integrator_shade_shadow.h
new file mode 100644
index 00000000000..fd3c3ae1653
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_shade_shadow.h
@@ -0,0 +1,182 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/integrator/integrator_shade_volume.h"
+#include "kernel/integrator/integrator_volume_stack.h"
+
+#include "kernel/kernel_shader.h"
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline bool shadow_intersections_has_remaining(const int num_hits)
+{
+ return num_hits >= INTEGRATOR_SHADOW_ISECT_SIZE;
+}
+
+#ifdef __TRANSPARENT_SHADOWS__
+ccl_device_inline float3 integrate_transparent_surface_shadow(INTEGRATOR_STATE_ARGS, const int hit)
+{
+ PROFILING_INIT(kg, PROFILING_SHADE_SHADOW_SURFACE);
+
+ /* TODO: does aliasing like this break automatic SoA in CUDA?
+ * Should we instead store closures separately from ShaderData?
+ *
+ * TODO: is it better to declare this outside the loop or keep it local
+ * so the compiler can see there is no dependency between iterations? */
+ ShaderDataTinyStorage shadow_sd_storage;
+ ShaderData *shadow_sd = AS_SHADER_DATA(&shadow_sd_storage);
+
+ /* Setup shader data at surface. */
+ Intersection isect ccl_optional_struct_init;
+ integrator_state_read_shadow_isect(INTEGRATOR_STATE_PASS, &isect, hit);
+
+ Ray ray ccl_optional_struct_init;
+ integrator_state_read_shadow_ray(INTEGRATOR_STATE_PASS, &ray);
+
+ shader_setup_from_ray(kg, shadow_sd, &ray, &isect);
+
+ /* Evaluate shader. */
+ if (!(shadow_sd->flag & SD_HAS_ONLY_VOLUME)) {
+ shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_SHADOW>(
+ INTEGRATOR_STATE_PASS, shadow_sd, NULL, PATH_RAY_SHADOW);
+ }
+
+# ifdef __VOLUME__
+ /* Exit/enter volume. */
+ shadow_volume_stack_enter_exit(INTEGRATOR_STATE_PASS, shadow_sd);
+# endif
+
+ /* Compute transparency from closures. */
+ return shader_bsdf_transparency(kg, shadow_sd);
+}
+
+# ifdef __VOLUME__
+ccl_device_inline void integrate_transparent_volume_shadow(INTEGRATOR_STATE_ARGS,
+ const int hit,
+ const int num_recorded_hits,
+ float3 *ccl_restrict throughput)
+{
+ PROFILING_INIT(kg, PROFILING_SHADE_SHADOW_VOLUME);
+
+ /* TODO: deduplicate with surface, or does it not matter for memory usage? */
+ ShaderDataTinyStorage shadow_sd_storage;
+ ShaderData *shadow_sd = AS_SHADER_DATA(&shadow_sd_storage);
+
+ /* Setup shader data. */
+ Ray ray ccl_optional_struct_init;
+ integrator_state_read_shadow_ray(INTEGRATOR_STATE_PASS, &ray);
+
+ /* Modify ray position and length to match current segment. */
+ const float start_t = (hit == 0) ? 0.0f : INTEGRATOR_STATE_ARRAY(shadow_isect, hit - 1, t);
+ const float end_t = (hit < num_recorded_hits) ? INTEGRATOR_STATE_ARRAY(shadow_isect, hit, t) :
+ ray.t;
+ ray.P += start_t * ray.D;
+ ray.t = end_t - start_t;
+
+ shader_setup_from_volume(kg, shadow_sd, &ray);
+
+ const float step_size = volume_stack_step_size(INTEGRATOR_STATE_PASS, [=](const int i) {
+ return integrator_state_read_shadow_volume_stack(INTEGRATOR_STATE_PASS, i);
+ });
+
+ volume_shadow_heterogeneous(INTEGRATOR_STATE_PASS, &ray, shadow_sd, throughput, step_size);
+}
+# endif
+
+ccl_device_inline bool integrate_transparent_shadow(INTEGRATOR_STATE_ARGS, const int num_hits)
+{
+ /* Accumulate shadow for transparent surfaces. */
+ const int num_recorded_hits = min(num_hits, INTEGRATOR_SHADOW_ISECT_SIZE);
+
+ for (int hit = 0; hit < num_recorded_hits + 1; hit++) {
+ /* Volume shaders. */
+ if (hit < num_recorded_hits || !shadow_intersections_has_remaining(num_hits)) {
+# ifdef __VOLUME__
+ if (!integrator_state_shadow_volume_stack_is_empty(INTEGRATOR_STATE_PASS)) {
+ float3 throughput = INTEGRATOR_STATE(shadow_path, throughput);
+ integrate_transparent_volume_shadow(
+ INTEGRATOR_STATE_PASS, hit, num_recorded_hits, &throughput);
+ if (is_zero(throughput)) {
+ return true;
+ }
+
+ INTEGRATOR_STATE_WRITE(shadow_path, throughput) = throughput;
+ }
+# endif
+ }
+
+ /* Surface shaders. */
+ if (hit < num_recorded_hits) {
+ const float3 shadow = integrate_transparent_surface_shadow(INTEGRATOR_STATE_PASS, hit);
+ const float3 throughput = INTEGRATOR_STATE(shadow_path, throughput) * shadow;
+ if (is_zero(throughput)) {
+ return true;
+ }
+
+ INTEGRATOR_STATE_WRITE(shadow_path, throughput) = throughput;
+ INTEGRATOR_STATE_WRITE(shadow_path, transparent_bounce) += 1;
+ }
+
+ /* Note that we do not need to check max_transparent_bounce here; the number
+ * of intersections is already limited and made opaque in the
+ * INTERSECT_SHADOW kernel. */
+ }
+
+ if (shadow_intersections_has_remaining(num_hits)) {
+ /* There are more hits that we could not recorded due to memory usage,
+ * adjust ray to intersect again from the last hit. */
+ const float last_hit_t = INTEGRATOR_STATE_ARRAY(shadow_isect, num_recorded_hits - 1, t);
+ const float3 ray_P = INTEGRATOR_STATE(shadow_ray, P);
+ const float3 ray_D = INTEGRATOR_STATE(shadow_ray, D);
+ INTEGRATOR_STATE_WRITE(shadow_ray, P) = ray_offset(ray_P + last_hit_t * ray_D, ray_D);
+ INTEGRATOR_STATE_WRITE(shadow_ray, t) -= last_hit_t;
+ }
+
+ return false;
+}
+#endif /* __TRANSPARENT_SHADOWS__ */
+
+ccl_device void integrator_shade_shadow(INTEGRATOR_STATE_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ PROFILING_INIT(kg, PROFILING_SHADE_SHADOW_SETUP);
+ const int num_hits = INTEGRATOR_STATE(shadow_path, num_hits);
+
+#ifdef __TRANSPARENT_SHADOWS__
+ /* Evaluate transparent shadows. */
+ const bool opaque = integrate_transparent_shadow(INTEGRATOR_STATE_PASS, num_hits);
+ if (opaque) {
+ INTEGRATOR_SHADOW_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
+ return;
+ }
+#endif
+
+ if (shadow_intersections_has_remaining(num_hits)) {
+ /* More intersections to find, continue shadow ray. */
+ INTEGRATOR_SHADOW_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW,
+ DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
+ return;
+ }
+ else {
+ kernel_accum_light(INTEGRATOR_STATE_PASS, render_buffer);
+ INTEGRATOR_SHADOW_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
+ return;
+ }
+}
+
+CCL_NAMESPACE_END
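
The segment clipping in integrate_transparent_volume_shadow above splits the shadow ray at the recorded hits, with the final segment running to the end of the ray. A short standalone C++ sketch, not part of the patch and using made-up distances, prints those segments.

// Sketch of splitting a shadow ray into per-hit segments for volume attenuation.
#include <cstdio>

int main()
{
  const float isect_t[] = {0.4f, 1.1f, 2.0f}; /* Sorted recorded hit distances (hypothetical). */
  const int num_recorded_hits = 3;
  const float ray_t = 3.5f; /* Full shadow ray length. */

  for (int hit = 0; hit < num_recorded_hits + 1; hit++) {
    const float start_t = (hit == 0) ? 0.0f : isect_t[hit - 1];
    const float end_t = (hit < num_recorded_hits) ? isect_t[hit] : ray_t;
    std::printf("segment %d: [%.2f, %.2f], length %.2f\n", hit, start_t, end_t, end_t - start_t);
  }
  return 0;
}
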
diff --git a/intern/cycles/kernel/integrator/integrator_shade_surface.h b/intern/cycles/kernel/integrator/integrator_shade_surface.h
new file mode 100644
index 00000000000..73b7cad32be
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_shade_surface.h
@@ -0,0 +1,502 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/kernel_accumulate.h"
+#include "kernel/kernel_emission.h"
+#include "kernel/kernel_light.h"
+#include "kernel/kernel_passes.h"
+#include "kernel/kernel_path_state.h"
+#include "kernel/kernel_shader.h"
+
+#include "kernel/integrator/integrator_subsurface.h"
+#include "kernel/integrator/integrator_volume_stack.h"
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_forceinline void integrate_surface_shader_setup(INTEGRATOR_STATE_CONST_ARGS,
+ ShaderData *sd)
+{
+ Intersection isect ccl_optional_struct_init;
+ integrator_state_read_isect(INTEGRATOR_STATE_PASS, &isect);
+
+ Ray ray ccl_optional_struct_init;
+ integrator_state_read_ray(INTEGRATOR_STATE_PASS, &ray);
+
+ shader_setup_from_ray(kg, sd, &ray, &isect);
+}
+
+#ifdef __HOLDOUT__
+ccl_device_forceinline bool integrate_surface_holdout(INTEGRATOR_STATE_CONST_ARGS,
+ ShaderData *sd,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ /* Write holdout transparency to render buffer and stop if fully holdout. */
+ const uint32_t path_flag = INTEGRATOR_STATE(path, flag);
+
+ if (((sd->flag & SD_HOLDOUT) || (sd->object_flag & SD_OBJECT_HOLDOUT_MASK)) &&
+ (path_flag & PATH_RAY_TRANSPARENT_BACKGROUND)) {
+ const float3 holdout_weight = shader_holdout_apply(kg, sd);
+ if (kernel_data.background.transparent) {
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ const float transparent = average(holdout_weight * throughput);
+ kernel_accum_transparent(INTEGRATOR_STATE_PASS, transparent, render_buffer);
+ }
+ if (isequal_float3(holdout_weight, one_float3())) {
+ return false;
+ }
+ }
+
+ return true;
+}
+#endif /* __HOLDOUT__ */
+
+#ifdef __EMISSION__
+ccl_device_forceinline void integrate_surface_emission(INTEGRATOR_STATE_CONST_ARGS,
+ const ShaderData *sd,
+ ccl_global float *ccl_restrict
+ render_buffer)
+{
+ const uint32_t path_flag = INTEGRATOR_STATE(path, flag);
+
+ /* Evaluate emissive closure. */
+ float3 L = shader_emissive_eval(sd);
+
+# ifdef __HAIR__
+ if (!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS) &&
+ (sd->type & PRIMITIVE_ALL_TRIANGLE))
+# else
+ if (!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS))
+# endif
+ {
+ const float bsdf_pdf = INTEGRATOR_STATE(path, mis_ray_pdf);
+ const float t = sd->ray_length + INTEGRATOR_STATE(path, mis_ray_t);
+
+ /* Multiple importance sampling, get triangle light pdf,
+ * and compute weight with respect to BSDF pdf. */
+ float pdf = triangle_light_pdf(kg, sd, t);
+ float mis_weight = power_heuristic(bsdf_pdf, pdf);
+
+ L *= mis_weight;
+ }
+
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ kernel_accum_emission(INTEGRATOR_STATE_PASS, throughput, L, render_buffer);
+}
+#endif /* __EMISSION__ */
+
+#ifdef __EMISSION__
+/* Path tracing: sample point on light and evaluate light shader, then
+ * queue shadow ray to be traced. */
+ccl_device_forceinline void integrate_surface_direct_light(INTEGRATOR_STATE_ARGS,
+ ShaderData *sd,
+ const RNGState *rng_state)
+{
+ /* Test if there is a light or BSDF that needs direct light. */
+ if (!(kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL))) {
+ return;
+ }
+
+ /* Sample position on a light. */
+ LightSample ls ccl_optional_struct_init;
+ {
+ const int path_flag = INTEGRATOR_STATE(path, flag);
+ const uint bounce = INTEGRATOR_STATE(path, bounce);
+ float light_u, light_v;
+ path_state_rng_2D(kg, rng_state, PRNG_LIGHT_U, &light_u, &light_v);
+
+ if (!light_distribution_sample_from_position(
+ kg, light_u, light_v, sd->time, sd->P, bounce, path_flag, &ls)) {
+ return;
+ }
+ }
+
+ kernel_assert(ls.pdf != 0.0f);
+
+ /* Evaluate light shader.
+ *
+ * TODO: can we reuse sd memory? In theory we can move this after
+ * integrate_surface_bounce, evaluate the BSDF, and only then evaluate
+ * the light shader. This could also move to its own kernel, for
+ * non-constant light sources. */
+ ShaderDataTinyStorage emission_sd_storage;
+ ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
+ const float3 light_eval = light_sample_shader_eval(
+ INTEGRATOR_STATE_PASS, emission_sd, &ls, sd->time);
+ if (is_zero(light_eval)) {
+ return;
+ }
+
+ /* Evaluate BSDF. */
+ const bool is_transmission = shader_bsdf_is_transmission(sd, ls.D);
+
+ BsdfEval bsdf_eval ccl_optional_struct_init;
+ const float bsdf_pdf = shader_bsdf_eval(kg, sd, ls.D, is_transmission, &bsdf_eval, ls.shader);
+ bsdf_eval_mul3(&bsdf_eval, light_eval / ls.pdf);
+
+ if (ls.shader & SHADER_USE_MIS) {
+ const float mis_weight = power_heuristic(ls.pdf, bsdf_pdf);
+ bsdf_eval_mul(&bsdf_eval, mis_weight);
+ }
+
+ /* Path termination. */
+ const float terminate = path_state_rng_light_termination(kg, rng_state);
+ if (light_sample_terminate(kg, &ls, &bsdf_eval, terminate)) {
+ return;
+ }
+
+ /* Create shadow ray. */
+ Ray ray ccl_optional_struct_init;
+ light_sample_to_surface_shadow_ray(kg, sd, &ls, &ray);
+ const bool is_light = light_sample_is_light(&ls);
+
+ /* Copy volume stack and enter/exit volume. */
+ integrator_state_copy_volume_stack_to_shadow(INTEGRATOR_STATE_PASS);
+
+ if (is_transmission) {
+# ifdef __VOLUME__
+ shadow_volume_stack_enter_exit(INTEGRATOR_STATE_PASS, sd);
+# endif
+ }
+
+ /* Write shadow ray and associated state to global memory. */
+ integrator_state_write_shadow_ray(INTEGRATOR_STATE_PASS, &ray);
+
+ /* Copy state from main path to shadow path. */
+ const uint16_t bounce = INTEGRATOR_STATE(path, bounce);
+ const uint16_t transparent_bounce = INTEGRATOR_STATE(path, transparent_bounce);
+ uint32_t shadow_flag = INTEGRATOR_STATE(path, flag);
+ shadow_flag |= (is_light) ? PATH_RAY_SHADOW_FOR_LIGHT : 0;
+ shadow_flag |= (is_transmission) ? PATH_RAY_TRANSMISSION_PASS : PATH_RAY_REFLECT_PASS;
+ const float3 throughput = INTEGRATOR_STATE(path, throughput) * bsdf_eval_sum(&bsdf_eval);
+
+ if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
+ const float3 diffuse_glossy_ratio = (bounce == 0) ?
+ bsdf_eval_diffuse_glossy_ratio(&bsdf_eval) :
+ INTEGRATOR_STATE(path, diffuse_glossy_ratio);
+ INTEGRATOR_STATE_WRITE(shadow_path, diffuse_glossy_ratio) = diffuse_glossy_ratio;
+ }
+
+ INTEGRATOR_STATE_WRITE(shadow_path, flag) = shadow_flag;
+ INTEGRATOR_STATE_WRITE(shadow_path, bounce) = bounce;
+ INTEGRATOR_STATE_WRITE(shadow_path, transparent_bounce) = transparent_bounce;
+ INTEGRATOR_STATE_WRITE(shadow_path, throughput) = throughput;
+
+ if (kernel_data.kernel_features & KERNEL_FEATURE_SHADOW_PASS) {
+ INTEGRATOR_STATE_WRITE(shadow_path, unshadowed_throughput) = throughput;
+ }
+
+ /* Branch off shadow kernel. */
+ INTEGRATOR_SHADOW_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
+}
+#endif
+
+/* Path tracing: bounce off or through surface with new direction. */
+ccl_device_forceinline int integrate_surface_bsdf_bssrdf_bounce(INTEGRATOR_STATE_ARGS,
+ ShaderData *sd,
+ const RNGState *rng_state)
+{
+ /* Sample BSDF or BSSRDF. */
+ if (!(sd->flag & (SD_BSDF | SD_BSSRDF))) {
+ return LABEL_NONE;
+ }
+
+ float bsdf_u, bsdf_v;
+ path_state_rng_2D(kg, rng_state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+ const ShaderClosure *sc = shader_bsdf_bssrdf_pick(sd, &bsdf_u);
+
+#ifdef __SUBSURFACE__
+ /* BSSRDF closure, we schedule subsurface intersection kernel. */
+ if (CLOSURE_IS_BSSRDF(sc->type)) {
+ return subsurface_bounce(INTEGRATOR_STATE_PASS, sd, sc);
+ }
+#endif
+
+ /* BSDF closure, sample direction. */
+ float bsdf_pdf;
+ BsdfEval bsdf_eval ccl_optional_struct_init;
+ float3 bsdf_omega_in ccl_optional_struct_init;
+ differential3 bsdf_domega_in ccl_optional_struct_init;
+ int label;
+
+ label = shader_bsdf_sample_closure(
+ kg, sd, sc, bsdf_u, bsdf_v, &bsdf_eval, &bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf);
+
+ if (bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval)) {
+ return LABEL_NONE;
+ }
+
+ /* Setup ray. Note that clipping works through transparent bounces. */
+ INTEGRATOR_STATE_WRITE(ray, P) = ray_offset(sd->P, (label & LABEL_TRANSMIT) ? -sd->Ng : sd->Ng);
+ INTEGRATOR_STATE_WRITE(ray, D) = normalize(bsdf_omega_in);
+ INTEGRATOR_STATE_WRITE(ray, t) = (label & LABEL_TRANSPARENT) ?
+ INTEGRATOR_STATE(ray, t) - sd->ray_length :
+ FLT_MAX;
+
+#ifdef __RAY_DIFFERENTIALS__
+ INTEGRATOR_STATE_WRITE(ray, dP) = differential_make_compact(sd->dP);
+ INTEGRATOR_STATE_WRITE(ray, dD) = differential_make_compact(bsdf_domega_in);
+#endif
+
+ /* Update throughput. */
+ float3 throughput = INTEGRATOR_STATE(path, throughput);
+ throughput *= bsdf_eval_sum(&bsdf_eval) / bsdf_pdf;
+ INTEGRATOR_STATE_WRITE(path, throughput) = throughput;
+
+ if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
+ if (INTEGRATOR_STATE(path, bounce) == 0) {
+ INTEGRATOR_STATE_WRITE(path,
+ diffuse_glossy_ratio) = bsdf_eval_diffuse_glossy_ratio(&bsdf_eval);
+ }
+ }
+
+ /* Update path state */
+ if (label & LABEL_TRANSPARENT) {
+ INTEGRATOR_STATE_WRITE(path, mis_ray_t) += sd->ray_length;
+ }
+ else {
+ INTEGRATOR_STATE_WRITE(path, mis_ray_pdf) = bsdf_pdf;
+ INTEGRATOR_STATE_WRITE(path, mis_ray_t) = 0.0f;
+ INTEGRATOR_STATE_WRITE(path, min_ray_pdf) = fminf(bsdf_pdf,
+ INTEGRATOR_STATE(path, min_ray_pdf));
+ }
+
+ path_state_next(INTEGRATOR_STATE_PASS, label);
+ return label;
+}
+
+#ifdef __VOLUME__
+ccl_device_forceinline int integrate_surface_volume_only_bounce(INTEGRATOR_STATE_ARGS,
+ ShaderData *sd)
+{
+ if (!path_state_volume_next(INTEGRATOR_STATE_PASS)) {
+ return LABEL_NONE;
+ }
+
+ /* Setup ray position, direction stays unchanged. */
+ INTEGRATOR_STATE_WRITE(ray, P) = ray_offset(sd->P, -sd->Ng);
+
+ /* Clipping works through transparent bounces. */
+ INTEGRATOR_STATE_WRITE(ray, t) -= sd->ray_length;
+
+# ifdef __RAY_DIFFERENTIALS__
+ INTEGRATOR_STATE_WRITE(ray, dP) = differential_make_compact(sd->dP);
+# endif
+
+ INTEGRATOR_STATE_WRITE(path, mis_ray_t) += sd->ray_length;
+
+ return LABEL_TRANSMIT | LABEL_TRANSPARENT;
+}
+#endif
+
+#if defined(__AO__) && defined(__SHADER_RAYTRACE__)
+ccl_device_forceinline void integrate_surface_ao_pass(INTEGRATOR_STATE_CONST_ARGS,
+ const ShaderData *ccl_restrict sd,
+ const RNGState *ccl_restrict rng_state,
+ ccl_global float *ccl_restrict render_buffer)
+{
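+ /* On OptiX the work below is compiled as a direct callable (__direct_callable__ao_pass)
+ * and invoked through optixDirectCall; on other devices the same body is compiled
+ * directly into this function. */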
+# ifdef __KERNEL_OPTIX__
+ optixDirectCall<void>(2, INTEGRATOR_STATE_PASS, sd, rng_state, render_buffer);
+}
+
+extern "C" __device__ void __direct_callable__ao_pass(INTEGRATOR_STATE_CONST_ARGS,
+ const ShaderData *ccl_restrict sd,
+ const RNGState *ccl_restrict rng_state,
+ ccl_global float *ccl_restrict render_buffer)
+{
+# endif /* __KERNEL_OPTIX__ */
+ float bsdf_u, bsdf_v;
+ path_state_rng_2D(kg, rng_state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+
+ const float3 ao_N = shader_bsdf_ao_normal(kg, sd);
+ float3 ao_D;
+ float ao_pdf;
+ sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
+
+ if (dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
+ Ray ray ccl_optional_struct_init;
+ ray.P = ray_offset(sd->P, sd->Ng);
+ ray.D = ao_D;
+ ray.t = kernel_data.integrator.ao_bounces_distance;
+ ray.time = sd->time;
+ ray.dP = differential_zero_compact();
+ ray.dD = differential_zero_compact();
+
+ Intersection isect ccl_optional_struct_init;
+ if (!scene_intersect(kg, &ray, PATH_RAY_SHADOW_OPAQUE, &isect)) {
+ ccl_global float *buffer = kernel_pass_pixel_render_buffer(INTEGRATOR_STATE_PASS,
+ render_buffer);
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_ao, throughput);
+ }
+ }
+}
+#endif /* defined(__AO__) && defined(__SHADER_RAYTRACE__) */
+
+template<uint node_feature_mask>
+ccl_device bool integrate_surface(INTEGRATOR_STATE_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
+
+{
+ PROFILING_INIT_FOR_SHADER(kg, PROFILING_SHADE_SURFACE_SETUP);
+
+ /* Setup shader data. */
+ ShaderData sd;
+ integrate_surface_shader_setup(INTEGRATOR_STATE_PASS, &sd);
+ PROFILING_SHADER(sd.object, sd.shader);
+
+ int continue_path_label = 0;
+
+ /* Skip most work for volume bounding surface. */
+#ifdef __VOLUME__
+ if (!(sd.flag & SD_HAS_ONLY_VOLUME)) {
+#endif
+
+ {
+ const int path_flag = INTEGRATOR_STATE(path, flag);
+#ifdef __SUBSURFACE__
+ /* Can skip shader evaluation for BSSRDF exit point without bump mapping. */
+ if (!(path_flag & PATH_RAY_SUBSURFACE) || ((sd.flag & SD_HAS_BSSRDF_BUMP)))
+#endif
+ {
+ /* Evaluate shader. */
+ PROFILING_EVENT(PROFILING_SHADE_SURFACE_EVAL);
+ shader_eval_surface<node_feature_mask>(
+ INTEGRATOR_STATE_PASS, &sd, render_buffer, path_flag);
+ }
+ }
+
+#ifdef __SUBSURFACE__
+ if (INTEGRATOR_STATE(path, flag) & PATH_RAY_SUBSURFACE) {
+ /* When coming from inside subsurface scattering, setup a diffuse
+ * closure to perform lighting at the exit point. */
+ INTEGRATOR_STATE_WRITE(path, flag) &= ~PATH_RAY_SUBSURFACE;
+ subsurface_shader_data_setup(INTEGRATOR_STATE_PASS, &sd);
+ }
+#endif
+
+ shader_prepare_surface_closures(INTEGRATOR_STATE_PASS, &sd);
+
+#ifdef __HOLDOUT__
+ /* Evaluate holdout. */
+ if (!integrate_surface_holdout(INTEGRATOR_STATE_PASS, &sd, render_buffer)) {
+ return false;
+ }
+#endif
+
+#ifdef __EMISSION__
+ /* Write emission. */
+ if (sd.flag & SD_EMISSION) {
+ integrate_surface_emission(INTEGRATOR_STATE_PASS, &sd, render_buffer);
+ }
+#endif
+
+#ifdef __PASSES__
+ /* Write render passes. */
+ PROFILING_EVENT(PROFILING_SHADE_SURFACE_PASSES);
+ kernel_write_data_passes(INTEGRATOR_STATE_PASS, &sd, render_buffer);
+#endif
+
+ /* Load random number state. */
+ RNGState rng_state;
+ path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state);
+
+ /* Perform path termination. Most paths have already been terminated in
+ * the intersect_closest kernel; this is just for emission and for dividing
+ * throughput by the probability at the right moment. */
+ const int path_flag = INTEGRATOR_STATE(path, flag);
+ const float probability = (path_flag & PATH_RAY_TERMINATE_ON_NEXT_SURFACE) ?
+ 0.0f :
+ path_state_continuation_probability(INTEGRATOR_STATE_PASS,
+ path_flag);
+ if (probability == 0.0f) {
+ return false;
+ }
+ else if (probability != 1.0f) {
+ INTEGRATOR_STATE_WRITE(path, throughput) /= probability;
+ }
+
+#ifdef __DENOISING_FEATURES__
+ kernel_write_denoising_features_surface(INTEGRATOR_STATE_PASS, &sd, render_buffer);
+#endif
+
+#ifdef __SHADOW_CATCHER__
+ kernel_write_shadow_catcher_bounce_data(INTEGRATOR_STATE_PASS, &sd, render_buffer);
+#endif
+
+ /* Direct light. */
+ PROFILING_EVENT(PROFILING_SHADE_SURFACE_DIRECT_LIGHT);
+ integrate_surface_direct_light(INTEGRATOR_STATE_PASS, &sd, &rng_state);
+
+#if defined(__AO__) && defined(__SHADER_RAYTRACE__)
+ /* Ambient occlusion pass. */
+ if (node_feature_mask & KERNEL_FEATURE_NODE_RAYTRACE) {
+ if ((kernel_data.film.pass_ao != PASS_UNUSED) &&
+ (INTEGRATOR_STATE(path, flag) & PATH_RAY_CAMERA)) {
+ PROFILING_EVENT(PROFILING_SHADE_SURFACE_AO);
+ integrate_surface_ao_pass(INTEGRATOR_STATE_PASS, &sd, &rng_state, render_buffer);
+ }
+ }
+#endif
+
+ PROFILING_EVENT(PROFILING_SHADE_SURFACE_INDIRECT_LIGHT);
+ continue_path_label = integrate_surface_bsdf_bssrdf_bounce(
+ INTEGRATOR_STATE_PASS, &sd, &rng_state);
+#ifdef __VOLUME__
+ }
+ else {
+ PROFILING_EVENT(PROFILING_SHADE_SURFACE_INDIRECT_LIGHT);
+ continue_path_label = integrate_surface_volume_only_bounce(INTEGRATOR_STATE_PASS, &sd);
+ }
+
+ if (continue_path_label & LABEL_TRANSMIT) {
+ /* Enter/Exit volume. */
+ volume_stack_enter_exit(INTEGRATOR_STATE_PASS, &sd);
+ }
+#endif
+
+ return continue_path_label != 0;
+}
+
+template<uint node_feature_mask = KERNEL_FEATURE_NODE_MASK_SURFACE & ~KERNEL_FEATURE_NODE_RAYTRACE,
+ int current_kernel = DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE>
+ccl_device_forceinline void integrator_shade_surface(INTEGRATOR_STATE_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ if (integrate_surface<node_feature_mask>(INTEGRATOR_STATE_PASS, render_buffer)) {
+ if (INTEGRATOR_STATE(path, flag) & PATH_RAY_SUBSURFACE) {
+ INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE);
+ }
+ else {
+ kernel_assert(INTEGRATOR_STATE(ray, t) != 0.0f);
+ INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
+ }
+ }
+ else {
+ INTEGRATOR_PATH_TERMINATE(current_kernel);
+ }
+}
+
+ccl_device_forceinline void integrator_shade_surface_raytrace(
+ INTEGRATOR_STATE_ARGS, ccl_global float *ccl_restrict render_buffer)
+{
+ integrator_shade_surface<KERNEL_FEATURE_NODE_MASK_SURFACE,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE>(INTEGRATOR_STATE_PASS,
+ render_buffer);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_shade_volume.h b/intern/cycles/kernel/integrator/integrator_shade_volume.h
new file mode 100644
index 00000000000..095a28ac505
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_shade_volume.h
@@ -0,0 +1,1019 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/kernel_accumulate.h"
+#include "kernel/kernel_emission.h"
+#include "kernel/kernel_light.h"
+#include "kernel/kernel_passes.h"
+#include "kernel/kernel_path_state.h"
+#include "kernel/kernel_shader.h"
+
+#include "kernel/integrator/integrator_intersect_closest.h"
+#include "kernel/integrator/integrator_volume_stack.h"
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __VOLUME__
+
+/* Events for probabilistic scattering. */
+
+typedef enum VolumeIntegrateEvent {
+ VOLUME_PATH_SCATTERED = 0,
+ VOLUME_PATH_ATTENUATED = 1,
+ VOLUME_PATH_MISSED = 2
+} VolumeIntegrateEvent;
+
+typedef struct VolumeIntegrateResult {
+ /* Throughput and offset for direct light scattering. */
+ bool direct_scatter;
+ float3 direct_throughput;
+ float direct_t;
+ ShaderVolumePhases direct_phases;
+
+ /* Throughput and offset for indirect light scattering. */
+ bool indirect_scatter;
+ float3 indirect_throughput;
+ float indirect_t;
+ ShaderVolumePhases indirect_phases;
+} VolumeIntegrateResult;
+
+/* Ignore paths that have volume throughput below this value, to avoid unnecessary work
+ * and precision issues.
+ * todo: this value could be tweaked or turned into a probability to avoid unnecessary
+ * work in volumes and subsurface scattering. */
+# define VOLUME_THROUGHPUT_EPSILON 1e-6f
+
+/* Volume shader properties
+ *
+ * extinction coefficient = absorption coefficient + scattering coefficient
+ * sigma_t = sigma_a + sigma_s */
+
+typedef struct VolumeShaderCoefficients {
+ float3 sigma_t;
+ float3 sigma_s;
+ float3 emission;
+} VolumeShaderCoefficients;
+
+/* Evaluate shader to get extinction coefficient at P. */
+ccl_device_inline bool shadow_volume_shader_sample(INTEGRATOR_STATE_ARGS,
+ ShaderData *ccl_restrict sd,
+ float3 *ccl_restrict extinction)
+{
+ shader_eval_volume(INTEGRATOR_STATE_PASS, sd, PATH_RAY_SHADOW, [=](const int i) {
+ return integrator_state_read_shadow_volume_stack(INTEGRATOR_STATE_PASS, i);
+ });
+
+ if (!(sd->flag & SD_EXTINCTION)) {
+ return false;
+ }
+
+ const float density = object_volume_density(kg, sd->object);
+ *extinction = sd->closure_transparent_extinction * density;
+ return true;
+}
+
+/* Evaluate shader to get absorption, scattering and emission at P. */
+ccl_device_inline bool volume_shader_sample(INTEGRATOR_STATE_ARGS,
+ ShaderData *ccl_restrict sd,
+ VolumeShaderCoefficients *coeff)
+{
+ const int path_flag = INTEGRATOR_STATE(path, flag);
+ shader_eval_volume(INTEGRATOR_STATE_PASS, sd, path_flag, [=](const int i) {
+ return integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i);
+ });
+
+ if (!(sd->flag & (SD_EXTINCTION | SD_SCATTER | SD_EMISSION))) {
+ return false;
+ }
+
+ coeff->sigma_s = zero_float3();
+ coeff->sigma_t = (sd->flag & SD_EXTINCTION) ? sd->closure_transparent_extinction : zero_float3();
+ coeff->emission = (sd->flag & SD_EMISSION) ? sd->closure_emission_background : zero_float3();
+
+ if (sd->flag & SD_SCATTER) {
+ for (int i = 0; i < sd->num_closure; i++) {
+ const ShaderClosure *sc = &sd->closure[i];
+
+ if (CLOSURE_IS_VOLUME(sc->type)) {
+ coeff->sigma_s += sc->weight;
+ }
+ }
+ }
+
+ const float density = object_volume_density(kg, sd->object);
+ coeff->sigma_s *= density;
+ coeff->sigma_t *= density;
+ coeff->emission *= density;
+
+ return true;
+}
+
+ccl_device_forceinline void volume_step_init(const KernelGlobals *kg,
+ const RNGState *rng_state,
+ const float object_step_size,
+ float t,
+ float *step_size,
+ float *step_shade_offset,
+ float *steps_offset,
+ int *max_steps)
+{
+ if (object_step_size == FLT_MAX) {
+ /* Homogeneous volume. */
+ *step_size = t;
+ *step_shade_offset = 0.0f;
+ *steps_offset = 1.0f;
+ *max_steps = 1;
+ }
+ else {
+ /* Heterogeneous volume. */
+ *max_steps = kernel_data.integrator.volume_max_steps;
+ float step = min(object_step_size, t);
+
+ /* compute exact steps in advance for malloc */
+ if (t > *max_steps * step) {
+ step = t / (float)*max_steps;
+ }
+
+ *step_size = step;
+
+ /* Perform shading at this offset within a step, to integrate over
+ * the entire step segment. */
+ *step_shade_offset = path_state_rng_1D_hash(kg, rng_state, 0x1e31d8a4);
+
+ /* Shift the starting point of all segments by this random amount to avoid
+ * banding artifacts from the volume bounding shape. */
+ *steps_offset = path_state_rng_1D_hash(kg, rng_state, 0x3d22c7b3);
+ }
+}
+
+/* Volume Shadows
+ *
+ * These functions are used to attenuate shadow rays to lights. Both absorption
+ * and scattering will block light, represented by the extinction coefficient. */
+
+# if 0
+/* homogeneous volume: assume shader evaluation at the start gives
+ * the extinction coefficient for the entire line segment */
+ccl_device void volume_shadow_homogeneous(INTEGRATOR_STATE_ARGS,
+ Ray *ccl_restrict ray,
+ ShaderData *ccl_restrict sd,
+ float3 *ccl_restrict throughput)
+{
+ float3 sigma_t = zero_float3();
+
+ if (shadow_volume_shader_sample(INTEGRATOR_STATE_PASS, sd, &sigma_t)) {
+ *throughput *= volume_color_transmittance(sigma_t, ray->t);
+ }
+}
+# endif
+
+/* heterogeneous volume: integrate stepping through the volume until we
+ * reach the end, get absorbed entirely, or run out of iterations */
+ccl_device void volume_shadow_heterogeneous(INTEGRATOR_STATE_ARGS,
+ Ray *ccl_restrict ray,
+ ShaderData *ccl_restrict sd,
+ float3 *ccl_restrict throughput,
+ const float object_step_size)
+{
+ /* Load random number state. */
+ RNGState rng_state;
+ shadow_path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state);
+
+ float3 tp = *throughput;
+
+ /* Prepare for stepping.
+ * For shadows we do not offset all segments, since the starting point is
+ * already a random distance inside the volume. It also appears to create
+ * banding artifacts for unknown reasons. */
+ int max_steps;
+ float step_size, step_shade_offset, unused;
+ volume_step_init(kg,
+ &rng_state,
+ object_step_size,
+ ray->t,
+ &step_size,
+ &step_shade_offset,
+ &unused,
+ &max_steps);
+ const float steps_offset = 1.0f;
+
+ /* compute extinction at the start */
+ float t = 0.0f;
+
+ float3 sum = zero_float3();
+
+ for (int i = 0; i < max_steps; i++) {
+ /* advance to new position */
+ float new_t = min(ray->t, (i + steps_offset) * step_size);
+ float dt = new_t - t;
+
+ float3 new_P = ray->P + ray->D * (t + dt * step_shade_offset);
+ float3 sigma_t = zero_float3();
+
+ /* compute attenuation over segment */
+ sd->P = new_P;
+ if (shadow_volume_shader_sample(INTEGRATOR_STATE_PASS, sd, &sigma_t)) {
+ /* Compute `expf()` only for every Nth step, to save some calculations
+ * because `exp(a)*exp(b) = exp(a+b)`, also do a quick #VOLUME_THROUGHPUT_EPSILON
+ * check then. */
+ sum += (-sigma_t * dt);
+ if ((i & 0x07) == 0) { /* ToDo: Other interval? */
+ tp = *throughput * exp3(sum);
+
+ /* stop if nearly all light is blocked */
+ if (tp.x < VOLUME_THROUGHPUT_EPSILON && tp.y < VOLUME_THROUGHPUT_EPSILON &&
+ tp.z < VOLUME_THROUGHPUT_EPSILON)
+ break;
+ }
+ }
+
+ /* stop if at the end of the volume */
+ t = new_t;
+ if (t == ray->t) {
+ /* Update throughput in case we haven't done it above */
+ tp = *throughput * exp3(sum);
+ break;
+ }
+ }
+
+ *throughput = tp;
+}
+
+/* Equi-angular sampling as in:
+ * "Importance Sampling Techniques for Path Tracing in Participating Media" */
+
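+/* Sketch of the derivation used below: with delta the signed distance along the ray to the
+ * point closest to the light, and D the distance from the light to that point, drawing
+ * t_ = D * tan((1 - xi) * theta_a + xi * theta_b)
+ * and returning delta + t_ gives pdf(t_) = D / ((theta_b - theta_a) * (D^2 + t_^2)),
+ * which is what volume_equiangular_sample/pdf/cdf evaluate. */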
+ccl_device float volume_equiangular_sample(const Ray *ccl_restrict ray,
+ const float3 light_P,
+ const float xi,
+ float *pdf)
+{
+ const float t = ray->t;
+ const float delta = dot((light_P - ray->P), ray->D);
+ const float D = safe_sqrtf(len_squared(light_P - ray->P) - delta * delta);
+ if (UNLIKELY(D == 0.0f)) {
+ *pdf = 0.0f;
+ return 0.0f;
+ }
+ const float theta_a = -atan2f(delta, D);
+ const float theta_b = atan2f(t - delta, D);
+ const float t_ = D * tanf((xi * theta_b) + (1 - xi) * theta_a);
+ if (UNLIKELY(theta_b == theta_a)) {
+ *pdf = 0.0f;
+ return 0.0f;
+ }
+ *pdf = D / ((theta_b - theta_a) * (D * D + t_ * t_));
+
+ return min(t, delta + t_); /* min is only for float precision errors */
+}
+
+ccl_device float volume_equiangular_pdf(const Ray *ccl_restrict ray,
+ const float3 light_P,
+ const float sample_t)
+{
+ const float delta = dot((light_P - ray->P), ray->D);
+ const float D = safe_sqrtf(len_squared(light_P - ray->P) - delta * delta);
+ if (UNLIKELY(D == 0.0f)) {
+ return 0.0f;
+ }
+
+ const float t = ray->t;
+ const float t_ = sample_t - delta;
+
+ const float theta_a = -atan2f(delta, D);
+ const float theta_b = atan2f(t - delta, D);
+ if (UNLIKELY(theta_b == theta_a)) {
+ return 0.0f;
+ }
+
+ const float pdf = D / ((theta_b - theta_a) * (D * D + t_ * t_));
+
+ return pdf;
+}
+
+ccl_device float volume_equiangular_cdf(const Ray *ccl_restrict ray,
+ const float3 light_P,
+ const float sample_t)
+{
+ float delta = dot((light_P - ray->P), ray->D);
+ float D = safe_sqrtf(len_squared(light_P - ray->P) - delta * delta);
+ if (UNLIKELY(D == 0.0f)) {
+ return 0.0f;
+ }
+
+ const float t = ray->t;
+ const float t_ = sample_t - delta;
+
+ const float theta_a = -atan2f(delta, D);
+ const float theta_b = atan2f(t - delta, D);
+ if (UNLIKELY(theta_b == theta_a)) {
+ return 0.0f;
+ }
+
+ const float theta_sample = atan2f(t_, D);
+ const float cdf = (theta_sample - theta_a) / (theta_b - theta_a);
+
+ return cdf;
+}
+
+/* Distance sampling */
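+/* Sketch of the idea: for the chosen color channel, distances are drawn proportional to
+ * sigma_t * exp(-sigma_t * t) on [0, max_t] by inverting the truncated exponential CDF:
+ * t = -log(1 - xi * (1 - exp(-sigma_t * max_t))) / sigma_t
+ * The pdf returned below is normalized by (1 - full_transmittance) accordingly. */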
+
+ccl_device float volume_distance_sample(
+ float max_t, float3 sigma_t, int channel, float xi, float3 *transmittance, float3 *pdf)
+{
+ /* xi is [0, 1[ so log(0) should never happen, division by zero is
+ * avoided because sample_sigma_t > 0 when SD_SCATTER is set */
+ float sample_sigma_t = volume_channel_get(sigma_t, channel);
+ float3 full_transmittance = volume_color_transmittance(sigma_t, max_t);
+ float sample_transmittance = volume_channel_get(full_transmittance, channel);
+
+ float sample_t = min(max_t, -logf(1.0f - xi * (1.0f - sample_transmittance)) / sample_sigma_t);
+
+ *transmittance = volume_color_transmittance(sigma_t, sample_t);
+ *pdf = safe_divide_color(sigma_t * *transmittance, one_float3() - full_transmittance);
+
+ /* todo: optimization: when taken together with the hit/miss decision,
+ * the full_transmittance drops out and xi does not need to be remapped */
+
+ return sample_t;
+}
+
+ccl_device float3 volume_distance_pdf(float max_t, float3 sigma_t, float sample_t)
+{
+ float3 full_transmittance = volume_color_transmittance(sigma_t, max_t);
+ float3 transmittance = volume_color_transmittance(sigma_t, sample_t);
+
+ return safe_divide_color(sigma_t * transmittance, one_float3() - full_transmittance);
+}
+
+/* Emission */
+
+ccl_device float3 volume_emission_integrate(VolumeShaderCoefficients *coeff,
+ int closure_flag,
+ float3 transmittance,
+ float t)
+{
+ /* integral E * exp(-sigma_t * t) from 0 to t = E * (1 - exp(-sigma_t * t))/sigma_t
+ * this goes to E * t as sigma_t goes to zero
+ *
+ * todo: we should use an epsilon to avoid precision issues near zero sigma_t */
+ float3 emission = coeff->emission;
+
+ if (closure_flag & SD_EXTINCTION) {
+ float3 sigma_t = coeff->sigma_t;
+
+ emission.x *= (sigma_t.x > 0.0f) ? (1.0f - transmittance.x) / sigma_t.x : t;
+ emission.y *= (sigma_t.y > 0.0f) ? (1.0f - transmittance.y) / sigma_t.y : t;
+ emission.z *= (sigma_t.z > 0.0f) ? (1.0f - transmittance.z) / sigma_t.z : t;
+ }
+ else
+ emission *= t;
+
+ return emission;
+}
+
+/* Volume Integration */
+
+typedef struct VolumeIntegrateState {
+ /* Volume segment extents. */
+ float start_t;
+ float end_t;
+
+ /* True if the volume is absorption-only up to this point, and no probabilistic
+ * scattering or termination has been used yet. */
+ bool absorption_only;
+
+ /* Random numbers for scattering. */
+ float rscatter;
+ float rphase;
+
+ /* Multiple importance sampling. */
+ VolumeSampleMethod direct_sample_method;
+ bool use_mis;
+ float distance_pdf;
+ float equiangular_pdf;
+} VolumeIntegrateState;
+
+ccl_device_forceinline void volume_integrate_step_scattering(
+ const ShaderData *sd,
+ const Ray *ray,
+ const float3 equiangular_light_P,
+ const VolumeShaderCoefficients &ccl_restrict coeff,
+ const float3 transmittance,
+ VolumeIntegrateState &ccl_restrict vstate,
+ VolumeIntegrateResult &ccl_restrict result)
+{
+ /* Pick random color channel, we use the Veach one-sample
+ * model with balance heuristic for the channels. */
+ const float3 albedo = safe_divide_color(coeff.sigma_s, coeff.sigma_t);
+ float3 channel_pdf;
+ const int channel = volume_sample_channel(
+ albedo, result.indirect_throughput, vstate.rphase, &channel_pdf);
+
+ /* Equiangular sampling for direct lighting. */
+ if (vstate.direct_sample_method == VOLUME_SAMPLE_EQUIANGULAR && !result.direct_scatter) {
+ if (result.direct_t >= vstate.start_t && result.direct_t <= vstate.end_t) {
+ const float new_dt = result.direct_t - vstate.start_t;
+ const float3 new_transmittance = volume_color_transmittance(coeff.sigma_t, new_dt);
+
+ result.direct_scatter = true;
+ result.direct_throughput *= coeff.sigma_s * new_transmittance / vstate.equiangular_pdf;
+ shader_copy_volume_phases(&result.direct_phases, sd);
+
+ /* Multiple importance sampling. */
+ if (vstate.use_mis) {
+ const float distance_pdf = vstate.distance_pdf *
+ dot(channel_pdf, coeff.sigma_t * new_transmittance);
+ const float mis_weight = 2.0f * power_heuristic(vstate.equiangular_pdf, distance_pdf);
+ result.direct_throughput *= mis_weight;
+ }
+ }
+ else {
+ result.direct_throughput *= transmittance;
+ vstate.distance_pdf *= dot(channel_pdf, transmittance);
+ }
+ }
+
+ /* Distance sampling for indirect and optional direct lighting. */
+ if (!result.indirect_scatter) {
+ /* decide if we will scatter or continue */
+ const float sample_transmittance = volume_channel_get(transmittance, channel);
+
+ if (1.0f - vstate.rscatter >= sample_transmittance) {
+ /* compute sampling distance */
+ const float sample_sigma_t = volume_channel_get(coeff.sigma_t, channel);
+ const float new_dt = -logf(1.0f - vstate.rscatter) / sample_sigma_t;
+ const float new_t = vstate.start_t + new_dt;
+
+ /* transmittance and pdf */
+ const float3 new_transmittance = volume_color_transmittance(coeff.sigma_t, new_dt);
+ const float distance_pdf = dot(channel_pdf, coeff.sigma_t * new_transmittance);
+
+ /* throughput */
+ result.indirect_scatter = true;
+ result.indirect_t = new_t;
+ result.indirect_throughput *= coeff.sigma_s * new_transmittance / distance_pdf;
+ shader_copy_volume_phases(&result.indirect_phases, sd);
+
+ if (vstate.direct_sample_method != VOLUME_SAMPLE_EQUIANGULAR) {
+ /* If using distance sampling for direct light, just copy parameters
+ * of indirect light since we scatter at the same point then. */
+ result.direct_scatter = true;
+ result.direct_t = result.indirect_t;
+ result.direct_throughput = result.indirect_throughput;
+ shader_copy_volume_phases(&result.direct_phases, sd);
+
+ /* Multiple importance sampling. */
+ if (vstate.use_mis) {
+ const float equiangular_pdf = volume_equiangular_pdf(ray, equiangular_light_P, new_t);
+ const float mis_weight = power_heuristic(vstate.distance_pdf * distance_pdf,
+ equiangular_pdf);
+ result.direct_throughput *= 2.0f * mis_weight;
+ }
+ }
+ }
+ else {
+ /* throughput */
+ const float pdf = dot(channel_pdf, transmittance);
+ result.indirect_throughput *= transmittance / pdf;
+ if (vstate.direct_sample_method != VOLUME_SAMPLE_EQUIANGULAR) {
+ vstate.distance_pdf *= pdf;
+ }
+
+ /* remap rscatter so we can reuse it and keep things stratified */
+ vstate.rscatter = 1.0f - (1.0f - vstate.rscatter) / sample_transmittance;
+ }
+ }
+}
+
+/* heterogeneous volume distance sampling: integrate stepping through the
+ * volume until we reach the end, get absorbed entirely, or run out of
+ * iterations. this does probabilistically scatter or get transmitted through
+ * for path tracing where we don't want to branch. */
+ccl_device_forceinline void volume_integrate_heterogeneous(
+ INTEGRATOR_STATE_ARGS,
+ Ray *ccl_restrict ray,
+ ShaderData *ccl_restrict sd,
+ const RNGState *rng_state,
+ ccl_global float *ccl_restrict render_buffer,
+ const float object_step_size,
+ const VolumeSampleMethod direct_sample_method,
+ const float3 equiangular_light_P,
+ VolumeIntegrateResult &result)
+{
+ PROFILING_INIT(kg, PROFILING_SHADE_VOLUME_INTEGRATE);
+
+ /* Prepare for stepping.
+ * Using a different step offset for the first step avoids banding artifacts. */
+ int max_steps;
+ float step_size, step_shade_offset, steps_offset;
+ volume_step_init(kg,
+ rng_state,
+ object_step_size,
+ ray->t,
+ &step_size,
+ &step_shade_offset,
+ &steps_offset,
+ &max_steps);
+
+ /* Initialize volume integration state. */
+ VolumeIntegrateState vstate ccl_optional_struct_init;
+ vstate.start_t = 0.0f;
+ vstate.end_t = 0.0f;
+ vstate.absorption_only = true;
+ vstate.rscatter = path_state_rng_1D(kg, rng_state, PRNG_SCATTER_DISTANCE);
+ vstate.rphase = path_state_rng_1D(kg, rng_state, PRNG_PHASE_CHANNEL);
+
+ /* Multiple importance sampling: pick between equiangular and distance sampling strategy. */
+ vstate.direct_sample_method = direct_sample_method;
+ vstate.use_mis = (direct_sample_method == VOLUME_SAMPLE_MIS);
+ if (vstate.use_mis) {
+ if (vstate.rscatter < 0.5f) {
+ vstate.rscatter *= 2.0f;
+ vstate.direct_sample_method = VOLUME_SAMPLE_DISTANCE;
+ }
+ else {
+ vstate.rscatter = (vstate.rscatter - 0.5f) * 2.0f;
+ vstate.direct_sample_method = VOLUME_SAMPLE_EQUIANGULAR;
+ }
+ }
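+ /* Since one of the two strategies is chosen with probability 0.5, the one-sample MIS
+ * weights applied during stepping carry a compensating factor of 2. */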
+ vstate.equiangular_pdf = 0.0f;
+ vstate.distance_pdf = 1.0f;
+
+ /* Initialize volume integration result. */
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ result.direct_throughput = throughput;
+ result.indirect_throughput = throughput;
+
+ /* Equiangular sampling: compute distance and PDF in advance. */
+ if (vstate.direct_sample_method == VOLUME_SAMPLE_EQUIANGULAR) {
+ result.direct_t = volume_equiangular_sample(
+ ray, equiangular_light_P, vstate.rscatter, &vstate.equiangular_pdf);
+ }
+
+# ifdef __DENOISING_FEATURES__
+ const bool write_denoising_features = (INTEGRATOR_STATE(path, flag) &
+ PATH_RAY_DENOISING_FEATURES);
+ float3 accum_albedo = zero_float3();
+# endif
+ float3 accum_emission = zero_float3();
+
+ for (int i = 0; i < max_steps; i++) {
+ /* Advance to new position */
+ vstate.end_t = min(ray->t, (i + steps_offset) * step_size);
+ const float shade_t = vstate.start_t + (vstate.end_t - vstate.start_t) * step_shade_offset;
+ sd->P = ray->P + ray->D * shade_t;
+
+ /* compute segment */
+ VolumeShaderCoefficients coeff ccl_optional_struct_init;
+ if (volume_shader_sample(INTEGRATOR_STATE_PASS, sd, &coeff)) {
+ const int closure_flag = sd->flag;
+
+ /* Evaluate transmittance over segment. */
+ const float dt = (vstate.end_t - vstate.start_t);
+ const float3 transmittance = (closure_flag & SD_EXTINCTION) ?
+ volume_color_transmittance(coeff.sigma_t, dt) :
+ one_float3();
+
+ /* Emission. */
+ if (closure_flag & SD_EMISSION) {
+ /* Only write emission before indirect light scatter position, since we terminate
+ * stepping at that point if we have already found a direct light scatter position. */
+ if (!result.indirect_scatter) {
+ const float3 emission = volume_emission_integrate(
+ &coeff, closure_flag, transmittance, dt);
+ accum_emission += emission;
+ }
+ }
+
+ if (closure_flag & SD_EXTINCTION) {
+ if ((closure_flag & SD_SCATTER) || !vstate.absorption_only) {
+# ifdef __DENOISING_FEATURES__
+ /* Accumulate albedo for denoising features. */
+ if (write_denoising_features && (closure_flag & SD_SCATTER)) {
+ const float3 albedo = safe_divide_color(coeff.sigma_s, coeff.sigma_t);
+ accum_albedo += result.indirect_throughput * albedo * (one_float3() - transmittance);
+ }
+# endif
+
+ /* Scattering and absorption. */
+ volume_integrate_step_scattering(
+ sd, ray, equiangular_light_P, coeff, transmittance, vstate, result);
+ }
+ else {
+ /* Absorption only. */
+ result.indirect_throughput *= transmittance;
+ result.direct_throughput *= transmittance;
+ }
+
+ /* Stop if nearly all light blocked. */
+ if (!result.indirect_scatter) {
+ if (max3(result.indirect_throughput) < VOLUME_THROUGHPUT_EPSILON) {
+ result.indirect_throughput = zero_float3();
+ break;
+ }
+ }
+ else if (!result.direct_scatter) {
+ if (max3(result.direct_throughput) < VOLUME_THROUGHPUT_EPSILON) {
+ break;
+ }
+ }
+ }
+
+ /* If we have scattering data for both direct and indirect, we're done. */
+ if (result.direct_scatter && result.indirect_scatter) {
+ break;
+ }
+ }
+
+ /* Stop if at the end of the volume. */
+ vstate.start_t = vstate.end_t;
+ if (vstate.start_t == ray->t) {
+ break;
+ }
+ }
+
+ /* Write accumulated emission. */
+ if (!is_zero(accum_emission)) {
+ kernel_accum_emission(
+ INTEGRATOR_STATE_PASS, result.indirect_throughput, accum_emission, render_buffer);
+ }
+
+# ifdef __DENOISING_FEATURES__
+ /* Write denoising features. */
+ if (write_denoising_features) {
+ kernel_write_denoising_features_volume(
+ INTEGRATOR_STATE_PASS, accum_albedo, result.indirect_scatter, render_buffer);
+ }
+# endif /* __DENOISING_FEATURES__ */
+}
+
+# ifdef __EMISSION__
+/* Path tracing: sample point on light for equiangular sampling. */
+ccl_device_forceinline bool integrate_volume_sample_light(INTEGRATOR_STATE_ARGS,
+ const ShaderData *ccl_restrict sd,
+ const RNGState *ccl_restrict rng_state,
+ LightSample *ccl_restrict ls)
+{
+ /* Test if there is a light or BSDF that needs direct light. */
+ if (!kernel_data.integrator.use_direct_light) {
+ return false;
+ }
+
+ /* Sample position on a light. */
+ const int path_flag = INTEGRATOR_STATE(path, flag);
+ const uint bounce = INTEGRATOR_STATE(path, bounce);
+ float light_u, light_v;
+ path_state_rng_2D(kg, rng_state, PRNG_LIGHT_U, &light_u, &light_v);
+
+ light_distribution_sample_from_volume_segment(
+ kg, light_u, light_v, sd->time, sd->P, bounce, path_flag, ls);
+
+ if (ls->shader & SHADER_EXCLUDE_SCATTER) {
+ return false;
+ }
+
+ return true;
+}
+
+/* Path tracing: sample point on light and evaluate light shader, then
+ * queue shadow ray to be traced. */
+ccl_device_forceinline void integrate_volume_direct_light(INTEGRATOR_STATE_ARGS,
+ const ShaderData *ccl_restrict sd,
+ const RNGState *ccl_restrict rng_state,
+ const float3 P,
+ const ShaderVolumePhases *ccl_restrict
+ phases,
+ const float3 throughput,
+ LightSample *ccl_restrict ls)
+{
+ PROFILING_INIT(kg, PROFILING_SHADE_VOLUME_DIRECT_LIGHT);
+
+ if (!kernel_data.integrator.use_direct_light) {
+ return;
+ }
+
+ /* Sample position on the same light again, now from the shading
+ * point where we scattered.
+ *
+ * TODO: decorrelate random numbers and use light_sample_new_position to
+ * avoid resampling the CDF. */
+ {
+ const int path_flag = INTEGRATOR_STATE(path, flag);
+ const uint bounce = INTEGRATOR_STATE(path, bounce);
+ float light_u, light_v;
+ path_state_rng_2D(kg, rng_state, PRNG_LIGHT_U, &light_u, &light_v);
+
+ if (!light_distribution_sample_from_position(
+ kg, light_u, light_v, sd->time, P, bounce, path_flag, ls)) {
+ return;
+ }
+ }
+
+ if (ls->shader & SHADER_EXCLUDE_SCATTER) {
+ return;
+ }
+
+ /* Evaluate light shader.
+ *
+ * TODO: can we reuse sd memory? In theory we can move this after
+ * integrate_surface_bounce, evaluate the BSDF, and only then evaluate
+ * the light shader. This could also move to its own kernel, for
+ * non-constant light sources. */
+ ShaderDataTinyStorage emission_sd_storage;
+ ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
+ const float3 light_eval = light_sample_shader_eval(
+ INTEGRATOR_STATE_PASS, emission_sd, ls, sd->time);
+ if (is_zero(light_eval)) {
+ return;
+ }
+
+ /* Evaluate BSDF. */
+ BsdfEval phase_eval ccl_optional_struct_init;
+ const float phase_pdf = shader_volume_phase_eval(kg, sd, phases, ls->D, &phase_eval);
+
+ if (ls->shader & SHADER_USE_MIS) {
+ float mis_weight = power_heuristic(ls->pdf, phase_pdf);
+ bsdf_eval_mul(&phase_eval, mis_weight);
+ }
+
+ bsdf_eval_mul3(&phase_eval, light_eval / ls->pdf);
+
+ /* Path termination. */
+ const float terminate = path_state_rng_light_termination(kg, rng_state);
+ if (light_sample_terminate(kg, ls, &phase_eval, terminate)) {
+ return;
+ }
+
+ /* Create shadow ray. */
+ Ray ray ccl_optional_struct_init;
+ light_sample_to_volume_shadow_ray(kg, sd, ls, P, &ray);
+ const bool is_light = light_sample_is_light(ls);
+
+ /* Write shadow ray and associated state to global memory. */
+ integrator_state_write_shadow_ray(INTEGRATOR_STATE_PASS, &ray);
+
+ /* Copy state from main path to shadow path. */
+ const uint16_t bounce = INTEGRATOR_STATE(path, bounce);
+ const uint16_t transparent_bounce = INTEGRATOR_STATE(path, transparent_bounce);
+ uint32_t shadow_flag = INTEGRATOR_STATE(path, flag);
+ shadow_flag |= (is_light) ? PATH_RAY_SHADOW_FOR_LIGHT : 0;
+ shadow_flag |= PATH_RAY_VOLUME_PASS;
+ const float3 throughput_phase = throughput * bsdf_eval_sum(&phase_eval);
+
+ if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
+ const float3 diffuse_glossy_ratio = (bounce == 0) ?
+ one_float3() :
+ INTEGRATOR_STATE(path, diffuse_glossy_ratio);
+ INTEGRATOR_STATE_WRITE(shadow_path, diffuse_glossy_ratio) = diffuse_glossy_ratio;
+ }
+
+ INTEGRATOR_STATE_WRITE(shadow_path, flag) = shadow_flag;
+ INTEGRATOR_STATE_WRITE(shadow_path, bounce) = bounce;
+ INTEGRATOR_STATE_WRITE(shadow_path, transparent_bounce) = transparent_bounce;
+ INTEGRATOR_STATE_WRITE(shadow_path, throughput) = throughput_phase;
+
+ if (kernel_data.kernel_features & KERNEL_FEATURE_SHADOW_PASS) {
+ INTEGRATOR_STATE_WRITE(shadow_path, unshadowed_throughput) = throughput;
+ }
+
+ integrator_state_copy_volume_stack_to_shadow(INTEGRATOR_STATE_PASS);
+
+ /* Branch off shadow kernel. */
+ INTEGRATOR_SHADOW_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
+}
+# endif
+
+/* Path tracing: scatter in new direction using phase function */
+ccl_device_forceinline bool integrate_volume_phase_scatter(INTEGRATOR_STATE_ARGS,
+ ShaderData *sd,
+ const RNGState *rng_state,
+ const ShaderVolumePhases *phases)
+{
+ PROFILING_INIT(kg, PROFILING_SHADE_VOLUME_INDIRECT_LIGHT);
+
+ float phase_u, phase_v;
+ path_state_rng_2D(kg, rng_state, PRNG_BSDF_U, &phase_u, &phase_v);
+
+ /* Phase closure, sample direction. */
+ float phase_pdf;
+ BsdfEval phase_eval ccl_optional_struct_init;
+ float3 phase_omega_in ccl_optional_struct_init;
+ differential3 phase_domega_in ccl_optional_struct_init;
+
+ const int label = shader_volume_phase_sample(kg,
+ sd,
+ phases,
+ phase_u,
+ phase_v,
+ &phase_eval,
+ &phase_omega_in,
+ &phase_domega_in,
+ &phase_pdf);
+
+ if (phase_pdf == 0.0f || bsdf_eval_is_zero(&phase_eval)) {
+ return false;
+ }
+
+ /* Setup ray. */
+ INTEGRATOR_STATE_WRITE(ray, P) = sd->P;
+ INTEGRATOR_STATE_WRITE(ray, D) = normalize(phase_omega_in);
+ INTEGRATOR_STATE_WRITE(ray, t) = FLT_MAX;
+
+# ifdef __RAY_DIFFERENTIALS__
+ INTEGRATOR_STATE_WRITE(ray, dP) = differential_make_compact(sd->dP);
+ INTEGRATOR_STATE_WRITE(ray, dD) = differential_make_compact(phase_domega_in);
+# endif
+
+ /* Update throughput. */
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ const float3 throughput_phase = throughput * bsdf_eval_sum(&phase_eval) / phase_pdf;
+ INTEGRATOR_STATE_WRITE(path, throughput) = throughput_phase;
+
+ if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
+ INTEGRATOR_STATE_WRITE(path, diffuse_glossy_ratio) = one_float3();
+ }
+
+ /* Update path state */
+ INTEGRATOR_STATE_WRITE(path, mis_ray_pdf) = phase_pdf;
+ INTEGRATOR_STATE_WRITE(path, mis_ray_t) = 0.0f;
+ INTEGRATOR_STATE_WRITE(path, min_ray_pdf) = fminf(phase_pdf,
+ INTEGRATOR_STATE(path, min_ray_pdf));
+
+ path_state_next(INTEGRATOR_STATE_PASS, label);
+ return true;
+}
+
+/* get the volume attenuation and emission over line segment defined by
+ * ray, with the assumption that there are no surfaces blocking light
+ * between the endpoints. distance sampling is used to decide if we will
+ * scatter or not. */
+ccl_device VolumeIntegrateEvent volume_integrate(INTEGRATOR_STATE_ARGS,
+ Ray *ccl_restrict ray,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ ShaderData sd;
+ shader_setup_from_volume(kg, &sd, ray);
+
+ /* Load random number state. */
+ RNGState rng_state;
+ path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state);
+
+ /* Sample light ahead of volume stepping, for equiangular sampling. */
+ /* TODO: distant lights are ignored now, but could instead use even distribution. */
+ LightSample ls ccl_optional_struct_init;
+ const bool need_light_sample = !(INTEGRATOR_STATE(path, flag) & PATH_RAY_TERMINATE);
+ const bool have_equiangular_sample = need_light_sample &&
+ integrate_volume_sample_light(
+ INTEGRATOR_STATE_PASS, &sd, &rng_state, &ls) &&
+ (ls.t != FLT_MAX);
+
+ VolumeSampleMethod direct_sample_method = (have_equiangular_sample) ?
+ volume_stack_sample_method(INTEGRATOR_STATE_PASS) :
+ VOLUME_SAMPLE_DISTANCE;
+
+ /* Step through volume. */
+ const float step_size = volume_stack_step_size(INTEGRATOR_STATE_PASS, [=](const int i) {
+ return integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i);
+ });
+
+ /* TODO: expensive to zero closures? */
+ VolumeIntegrateResult result = {};
+ volume_integrate_heterogeneous(INTEGRATOR_STATE_PASS,
+ ray,
+ &sd,
+ &rng_state,
+ render_buffer,
+ step_size,
+ direct_sample_method,
+ ls.P,
+ result);
+
+ /* Perform path termination. The intersect_closest kernel will have already marked this path
+ * to be terminated. That makes shader evaluation leave out any scattering closures,
+ * but emission and absorption are still handled for multiple importance sampling. */
+ const uint32_t path_flag = INTEGRATOR_STATE(path, flag);
+ const float probability = (path_flag & PATH_RAY_TERMINATE_IN_NEXT_VOLUME) ?
+ 0.0f :
+ path_state_continuation_probability(INTEGRATOR_STATE_PASS,
+ path_flag);
+ if (probability == 0.0f) {
+ return VOLUME_PATH_MISSED;
+ }
+
+ /* Direct light. */
+ if (result.direct_scatter) {
+ const float3 direct_P = ray->P + result.direct_t * ray->D;
+ result.direct_throughput /= probability;
+ integrate_volume_direct_light(INTEGRATOR_STATE_PASS,
+ &sd,
+ &rng_state,
+ direct_P,
+ &result.direct_phases,
+ result.direct_throughput,
+ &ls);
+ }
+
+ /* Indirect light.
+ *
+ * Only divide throughput by probability if we scatter. For the attenuation
+ * case the next surface will already do this division. */
+ if (result.indirect_scatter) {
+ result.indirect_throughput /= probability;
+ }
+ INTEGRATOR_STATE_WRITE(path, throughput) = result.indirect_throughput;
+
+ if (result.indirect_scatter) {
+ sd.P = ray->P + result.indirect_t * ray->D;
+
+ if (integrate_volume_phase_scatter(
+ INTEGRATOR_STATE_PASS, &sd, &rng_state, &result.indirect_phases)) {
+ return VOLUME_PATH_SCATTERED;
+ }
+ else {
+ return VOLUME_PATH_MISSED;
+ }
+ }
+ else {
+ return VOLUME_PATH_ATTENUATED;
+ }
+}
+
+#endif
+
+ccl_device void integrator_shade_volume(INTEGRATOR_STATE_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ PROFILING_INIT(kg, PROFILING_SHADE_VOLUME_SETUP);
+
+#ifdef __VOLUME__
+ /* Setup shader data. */
+ Ray ray ccl_optional_struct_init;
+ integrator_state_read_ray(INTEGRATOR_STATE_PASS, &ray);
+
+ Intersection isect ccl_optional_struct_init;
+ integrator_state_read_isect(INTEGRATOR_STATE_PASS, &isect);
+
+ /* Set ray length to current segment. */
+ ray.t = (isect.prim != PRIM_NONE) ? isect.t : FLT_MAX;
+
+ /* Clean volume stack for background rays. */
+ if (isect.prim == PRIM_NONE) {
+ volume_stack_clean(INTEGRATOR_STATE_PASS);
+ }
+
+ VolumeIntegrateEvent event = volume_integrate(INTEGRATOR_STATE_PASS, &ray, render_buffer);
+
+ if (event == VOLUME_PATH_SCATTERED) {
+ /* Queue intersect_closest kernel. */
+ INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME,
+ DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
+ return;
+ }
+ else if (event == VOLUME_PATH_MISSED) {
+ /* End path. */
+ INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
+ return;
+ }
+ else {
+ /* Continue to background, light or surface. */
+ if (isect.prim == PRIM_NONE) {
+ INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+ return;
+ }
+ else if (isect.type & PRIMITIVE_LAMP) {
+ INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
+ return;
+ }
+ else {
+ /* Hit a surface, continue with surface kernel unless terminated. */
+ const int shader = intersection_get_shader(kg, &isect);
+ const int flags = kernel_tex_fetch(__shaders, shader).flags;
+
+ integrator_intersect_shader_next_kernel<DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME>(
+ INTEGRATOR_STATE_PASS, &isect, shader, flags);
+ return;
+ }
+ }
+#endif /* __VOLUME__ */
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_state.h b/intern/cycles/kernel/integrator/integrator_state.h
new file mode 100644
index 00000000000..094446be02c
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_state.h
@@ -0,0 +1,185 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Integrator State
+ *
+ * This file defines the data structures that define the state of a path. Any state that is
+ * preserved and passed between kernel executions is part of this.
+ *
+ * The size of this state must be kept as small as possible, to reduce cache misses and keep memory
+ * usage under control on GPUs that may execute millions of kernels.
+ *
+ * Memory may be allocated and passed along in different ways depending on the device. There may
+ * be a scalar layout, or AoS or SoA layout for batches. The state may be passed along as a pointer
+ * to every kernel, or the pointer may exist at program scope or in constant memory. To abstract
+ * these differences between devices and experiment with different layouts, macros are used.
+ *
+ * INTEGRATOR_STATE_ARGS: prepend to argument definitions for every function that accesses
+ * path state.
+ * INTEGRATOR_STATE_CONST_ARGS: same as INTEGRATOR_STATE_ARGS, when state is read-only
+ * INTEGRATOR_STATE_PASS: use to pass along state to other functions that access it.
+ *
+ * INTEGRATOR_STATE(x, y): read nested struct member x.y of IntegratorState
+ * INTEGRATOR_STATE_WRITE(x, y): write to nested struct member x.y of IntegratorState
+ *
+ * INTEGRATOR_STATE_ARRAY(x, index, y): read x[index].y
+ * INTEGRATOR_STATE_ARRAY_WRITE(x, index, y): write x[index].y
+ *
+ * INTEGRATOR_STATE_COPY(to_x, from_x): copy contents of one nested struct to another
+ *
+ * INTEGRATOR_STATE_IS_NULL: test if any integrator state is available, for shader evaluation
+ * INTEGRATOR_STATE_PASS_NULL: use to pass empty state to other functions.
+ *
+ * NOTE: if we end up with a device that passes no arguments, the leading comma will be a problem.
+ * Can solve it with more macros if we encounter it, but rather ugly so postpone for now.
+ */
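+
+/* For illustration, a typical kernel reads and writes path state like this (a sketch,
+ * the actual call sites are in the integrator kernels):
+ *
+ *   const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ *   INTEGRATOR_STATE_WRITE(path, throughput) = throughput * 0.5f;
+ *
+ * On CPU this maps to plain struct member access, on GPU to SoA array access indexed
+ * by the path state. */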
+
+#include "kernel/kernel_types.h"
+
+#include "util/util_types.h"
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Constants
+ *
+ * TODO: these could be made dynamic depending on the features used in the scene. */
+
+#define INTEGRATOR_VOLUME_STACK_SIZE VOLUME_STACK_SIZE
+#define INTEGRATOR_SHADOW_ISECT_SIZE 4
+
+/* Data structures */
+
+/* Integrator State
+ *
+ * CPU rendering path state with AoS layout. */
+typedef struct IntegratorStateCPU {
+#define KERNEL_STRUCT_BEGIN(name) struct {
+#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) type name;
+#define KERNEL_STRUCT_ARRAY_MEMBER KERNEL_STRUCT_MEMBER
+#define KERNEL_STRUCT_END(name) \
+ } \
+ name;
+#define KERNEL_STRUCT_END_ARRAY(name, size) \
+ } \
+ name[size];
+#include "kernel/integrator/integrator_state_template.h"
+#undef KERNEL_STRUCT_BEGIN
+#undef KERNEL_STRUCT_MEMBER
+#undef KERNEL_STRUCT_ARRAY_MEMBER
+#undef KERNEL_STRUCT_END
+#undef KERNEL_STRUCT_END_ARRAY
+} IntegratorStateCPU;
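+
+/* For illustration, assuming integrator_state_template.h contains an entry such as
+ * (hypothetical member, shown only to demonstrate the expansion):
+ *
+ *   KERNEL_STRUCT_BEGIN(path)
+ *     KERNEL_STRUCT_MEMBER(path, uint32_t, flag, KERNEL_FEATURE_PATH_TRACING)
+ *   KERNEL_STRUCT_END(path)
+ *
+ * the macros above expand it here to `struct { uint32_t flag; } path;`, while the GPU
+ * variant below expands the same entry into a struct of pointers for the SoA layout. */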
+
+/* Path Queue
+ *
+ * Keep track of which kernels are queued to be executed next in the path
+ * for GPU rendering. */
+typedef struct IntegratorQueueCounter {
+ int num_queued[DEVICE_KERNEL_INTEGRATOR_NUM];
+} IntegratorQueueCounter;
+
+/* Integrator State GPU
+ *
+ * GPU rendering path state with SoA layout. */
+typedef struct IntegratorStateGPU {
+#define KERNEL_STRUCT_BEGIN(name) struct {
+#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) type *name;
+#define KERNEL_STRUCT_ARRAY_MEMBER KERNEL_STRUCT_MEMBER
+#define KERNEL_STRUCT_END(name) \
+ } \
+ name;
+#define KERNEL_STRUCT_END_ARRAY(name, size) \
+ } \
+ name[size];
+#include "kernel/integrator/integrator_state_template.h"
+#undef KERNEL_STRUCT_BEGIN
+#undef KERNEL_STRUCT_MEMBER
+#undef KERNEL_STRUCT_ARRAY_MEMBER
+#undef KERNEL_STRUCT_END
+#undef KERNEL_STRUCT_END_ARRAY
+
+ /* Count number of queued kernels. */
+ IntegratorQueueCounter *queue_counter;
+
+ /* Count number of kernels queued for specific shaders. */
+ int *sort_key_counter[DEVICE_KERNEL_INTEGRATOR_NUM];
+
+ /* Index of path which will be used by a next shadow catcher split. */
+ int *next_shadow_catcher_path_index;
+} IntegratorStateGPU;
+
+/* Abstraction
+ *
+ * Macros to access data structures on different devices.
+ *
+ * Note that there is a special access function for the shadow catcher state. This access is
+ * expected to happen from a kernel which operates on a "main" path. Attempting to use shadow
+ * catcher accessors from a kernel which operates on a shadow catcher state will cause bad
+ * memory access. */
+
+#ifdef __KERNEL_CPU__
+
+/* Scalar access on CPU. */
+
+typedef IntegratorStateCPU *ccl_restrict IntegratorState;
+
+# define INTEGRATOR_STATE_ARGS \
+ ccl_attr_maybe_unused const KernelGlobals *ccl_restrict kg, \
+ IntegratorStateCPU *ccl_restrict state
+# define INTEGRATOR_STATE_CONST_ARGS \
+ ccl_attr_maybe_unused const KernelGlobals *ccl_restrict kg, \
+ const IntegratorStateCPU *ccl_restrict state
+# define INTEGRATOR_STATE_PASS kg, state
+
+# define INTEGRATOR_STATE_PASS_NULL kg, NULL
+# define INTEGRATOR_STATE_IS_NULL (state == NULL)
+
+# define INTEGRATOR_STATE(nested_struct, member) \
+ (((const IntegratorStateCPU *)state)->nested_struct.member)
+# define INTEGRATOR_STATE_WRITE(nested_struct, member) (state->nested_struct.member)
+
+# define INTEGRATOR_STATE_ARRAY(nested_struct, array_index, member) \
+ (((const IntegratorStateCPU *)state)->nested_struct[array_index].member)
+# define INTEGRATOR_STATE_ARRAY_WRITE(nested_struct, array_index, member) \
+ ((state)->nested_struct[array_index].member)
+
+#else /* __KERNEL_CPU__ */
+
+/* Array access on GPU with Structure-of-Arrays. */
+
+typedef int IntegratorState;
+
+# define INTEGRATOR_STATE_ARGS const KernelGlobals *ccl_restrict kg, const IntegratorState state
+# define INTEGRATOR_STATE_CONST_ARGS \
+ const KernelGlobals *ccl_restrict kg, const IntegratorState state
+# define INTEGRATOR_STATE_PASS kg, state
+
+# define INTEGRATOR_STATE_PASS_NULL kg, -1
+# define INTEGRATOR_STATE_IS_NULL (state == -1)
+
+# define INTEGRATOR_STATE(nested_struct, member) \
+ kernel_integrator_state.nested_struct.member[state]
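+/* E.g. INTEGRATOR_STATE(path, throughput) reads kernel_integrator_state.path.throughput[state]:
+ * one array per struct member, with the path index selecting the element. */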
+# define INTEGRATOR_STATE_WRITE(nested_struct, member) INTEGRATOR_STATE(nested_struct, member)
+
+# define INTEGRATOR_STATE_ARRAY(nested_struct, array_index, member) \
+ kernel_integrator_state.nested_struct[array_index].member[state]
+# define INTEGRATOR_STATE_ARRAY_WRITE(nested_struct, array_index, member) \
+ INTEGRATOR_STATE_ARRAY(nested_struct, array_index, member)
+
+#endif /* __KERNEL_CPU__ */
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_state_flow.h b/intern/cycles/kernel/integrator/integrator_state_flow.h
new file mode 100644
index 00000000000..8477efd7b66
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_state_flow.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/kernel_types.h"
+#include "util/util_atomic.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Control Flow
+ *
+ * Utilities for control flow between kernels. The implementation may differ per device
+ * or even be handled on the host side. To abstract such differences, to make it easy to
+ * experiment with different implementations, and for debugging, this is wrapped in macros.
+ *
+ * There is a main path for regular path tracing from the camera. Shadow rays for next
+ * event estimation branch off from this into their own path, which may be computed in
+ * parallel while the main path continues.
+ *
+ * Each kernel on the main path must call one of these functions. These may not be called
+ * multiple times from the same kernel.
+ *
+ * INTEGRATOR_PATH_INIT(next_kernel)
+ * INTEGRATOR_PATH_NEXT(current_kernel, next_kernel)
+ * INTEGRATOR_PATH_TERMINATE(current_kernel)
+ *
+ * For the shadow path similar functions are used, and again each shadow kernel must call
+ * one of them, and only once.
+ */
+
+#define INTEGRATOR_PATH_IS_TERMINATED (INTEGRATOR_STATE(path, queued_kernel) == 0)
+#define INTEGRATOR_SHADOW_PATH_IS_TERMINATED (INTEGRATOR_STATE(shadow_path, queued_kernel) == 0)
+
+#ifdef __KERNEL_GPU__
+
+# define INTEGRATOR_PATH_INIT(next_kernel) \
+ atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \
+ 1); \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel;
+# define INTEGRATOR_PATH_NEXT(current_kernel, next_kernel) \
+ atomic_fetch_and_sub_uint32( \
+ &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
+ atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \
+ 1); \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel;
+# define INTEGRATOR_PATH_TERMINATE(current_kernel) \
+ atomic_fetch_and_sub_uint32( \
+ &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0;
+
+# define INTEGRATOR_SHADOW_PATH_INIT(next_kernel) \
+ atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \
+ 1); \
+ INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = next_kernel;
+# define INTEGRATOR_SHADOW_PATH_NEXT(current_kernel, next_kernel) \
+ atomic_fetch_and_sub_uint32( \
+ &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
+ atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \
+ 1); \
+ INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = next_kernel;
+# define INTEGRATOR_SHADOW_PATH_TERMINATE(current_kernel) \
+ atomic_fetch_and_sub_uint32( \
+ &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
+ INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0;
+
+# define INTEGRATOR_PATH_INIT_SORTED(next_kernel, key) \
+ { \
+ const int key_ = key; \
+ atomic_fetch_and_add_uint32( \
+ &kernel_integrator_state.queue_counter->num_queued[next_kernel], 1); \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel; \
+ INTEGRATOR_STATE_WRITE(path, shader_sort_key) = key_; \
+ atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], \
+ 1); \
+ }
+# define INTEGRATOR_PATH_NEXT_SORTED(current_kernel, next_kernel, key) \
+ { \
+ const int key_ = key; \
+ atomic_fetch_and_sub_uint32( \
+ &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
+ atomic_fetch_and_add_uint32( \
+ &kernel_integrator_state.queue_counter->num_queued[next_kernel], 1); \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel; \
+ INTEGRATOR_STATE_WRITE(path, shader_sort_key) = key_; \
+ atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], \
+ 1); \
+ }
+
+#else
+
+# define INTEGRATOR_PATH_INIT(next_kernel) \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel;
+# define INTEGRATOR_PATH_INIT_SORTED(next_kernel, key) \
+ { \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel; \
+ (void)key; \
+ }
+# define INTEGRATOR_PATH_NEXT(current_kernel, next_kernel) \
+ { \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel; \
+ (void)current_kernel; \
+ }
+# define INTEGRATOR_PATH_TERMINATE(current_kernel) \
+ { \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0; \
+ (void)current_kernel; \
+ }
+# define INTEGRATOR_PATH_NEXT_SORTED(current_kernel, next_kernel, key) \
+ { \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel; \
+ (void)key; \
+ (void)current_kernel; \
+ }
+
+# define INTEGRATOR_SHADOW_PATH_INIT(next_kernel) \
+ INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = next_kernel;
+# define INTEGRATOR_SHADOW_PATH_NEXT(current_kernel, next_kernel) \
+ { \
+ INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = next_kernel; \
+ (void)current_kernel; \
+ }
+# define INTEGRATOR_SHADOW_PATH_TERMINATE(current_kernel) \
+ { \
+ INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0; \
+ (void)current_kernel; \
+ }
+
+#endif
+
+CCL_NAMESPACE_END
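
A usage sketch (not part of the patch) of the scheduling contract described above: each kernel on the main path ends by calling exactly one of these macros. The kernel enum values appear elsewhere in this patch; the `found_hit` condition is a placeholder:

  if (found_hit) {
    /* Move the path from this kernel's queue to the next kernel's queue. */
    INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE,
                         DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE);
  }
  else {
    /* Remove the path from this kernel's queue and mark it as terminated. */
    INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE);
  }

On the GPU these expand to atomic updates of the per-kernel queue counters; on the CPU they only record the queued kernel in the path state.
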
diff --git a/intern/cycles/kernel/integrator/integrator_state_template.h b/intern/cycles/kernel/integrator/integrator_state_template.h
new file mode 100644
index 00000000000..41dd1bfcdbf
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_state_template.h
@@ -0,0 +1,163 @@
+
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/************************************ Path State *****************************/
+
+KERNEL_STRUCT_BEGIN(path)
+/* Index of a pixel within the device render buffer where this path will write its result.
+ * To get an actual offset within the buffer the value needs to be multiplied by the
+ * `kernel_data.film.pass_stride`.
+ *
+ * The multiplication is delayed until later, so that the state can use a 32-bit integer. */
+KERNEL_STRUCT_MEMBER(path, uint32_t, render_pixel_index, KERNEL_FEATURE_PATH_TRACING)
+/* Current sample number. */
+KERNEL_STRUCT_MEMBER(path, uint16_t, sample, KERNEL_FEATURE_PATH_TRACING)
+/* Current ray bounce depth. */
+KERNEL_STRUCT_MEMBER(path, uint16_t, bounce, KERNEL_FEATURE_PATH_TRACING)
+/* Current diffuse ray bounce depth. */
+KERNEL_STRUCT_MEMBER(path, uint16_t, diffuse_bounce, KERNEL_FEATURE_PATH_TRACING)
+/* Current glossy ray bounce depth. */
+KERNEL_STRUCT_MEMBER(path, uint16_t, glossy_bounce, KERNEL_FEATURE_PATH_TRACING)
+/* Current transmission ray bounce depth. */
+KERNEL_STRUCT_MEMBER(path, uint16_t, transmission_bounce, KERNEL_FEATURE_PATH_TRACING)
+/* Current volume ray bounce depth. */
+KERNEL_STRUCT_MEMBER(path, uint16_t, volume_bounce, KERNEL_FEATURE_PATH_TRACING)
+/* Current volume bounds ray bounce depth. */
+KERNEL_STRUCT_MEMBER(path, uint16_t, volume_bounds_bounce, KERNEL_FEATURE_PATH_TRACING)
+/* Current transparent ray bounce depth. */
+KERNEL_STRUCT_MEMBER(path, uint16_t, transparent_bounce, KERNEL_FEATURE_PATH_TRACING)
+/* DeviceKernel bit indicating queued kernels.
+ * TODO: reduce size? */
+KERNEL_STRUCT_MEMBER(path, uint32_t, queued_kernel, KERNEL_FEATURE_PATH_TRACING)
+/* Random number generator seed. */
+KERNEL_STRUCT_MEMBER(path, uint32_t, rng_hash, KERNEL_FEATURE_PATH_TRACING)
+/* Random number dimension offset. */
+KERNEL_STRUCT_MEMBER(path, uint32_t, rng_offset, KERNEL_FEATURE_PATH_TRACING)
+/* enum PathRayFlag */
+KERNEL_STRUCT_MEMBER(path, uint32_t, flag, KERNEL_FEATURE_PATH_TRACING)
+/* Multiple importance sampling
+ * The PDF of BSDF sampling at the last scatter point, and distance to the
+ * last scatter point minus the last ray segment. This distance lets us
+ * compute the complete distance through transparent surfaces and volumes. */
+KERNEL_STRUCT_MEMBER(path, float, mis_ray_pdf, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(path, float, mis_ray_t, KERNEL_FEATURE_PATH_TRACING)
+/* Filter glossy. */
+KERNEL_STRUCT_MEMBER(path, float, min_ray_pdf, KERNEL_FEATURE_PATH_TRACING)
+/* Throughput. */
+KERNEL_STRUCT_MEMBER(path, float3, throughput, KERNEL_FEATURE_PATH_TRACING)
+/* Ratio of throughput to distinguish diffuse and glossy render passes. */
+KERNEL_STRUCT_MEMBER(path, float3, diffuse_glossy_ratio, KERNEL_FEATURE_LIGHT_PASSES)
+/* Denoising. */
+KERNEL_STRUCT_MEMBER(path, float3, denoising_feature_throughput, KERNEL_FEATURE_DENOISING)
+/* Shader sorting. */
+/* TODO: compress as uint16? or leave out entirely and recompute key in sorting code? */
+KERNEL_STRUCT_MEMBER(path, uint32_t, shader_sort_key, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_END(path)
+
+/************************************** Ray ***********************************/
+
+KERNEL_STRUCT_BEGIN(ray)
+KERNEL_STRUCT_MEMBER(ray, float3, P, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(ray, float3, D, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(ray, float, t, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(ray, float, time, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(ray, float, dP, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(ray, float, dD, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_END(ray)
+
+/*************************** Intersection result ******************************/
+
+/* Result from scene intersection. */
+KERNEL_STRUCT_BEGIN(isect)
+KERNEL_STRUCT_MEMBER(isect, float, t, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(isect, float, u, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(isect, float, v, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(isect, int, prim, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(isect, int, object, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(isect, int, type, KERNEL_FEATURE_PATH_TRACING)
+/* TODO: exclude for GPU. */
+KERNEL_STRUCT_MEMBER(isect, float3, Ng, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_END(isect)
+
+/*************** Subsurface closure state for subsurface kernel ***************/
+
+KERNEL_STRUCT_BEGIN(subsurface)
+KERNEL_STRUCT_MEMBER(subsurface, float3, albedo, KERNEL_FEATURE_SUBSURFACE)
+KERNEL_STRUCT_MEMBER(subsurface, float3, radius, KERNEL_FEATURE_SUBSURFACE)
+KERNEL_STRUCT_MEMBER(subsurface, float, anisotropy, KERNEL_FEATURE_SUBSURFACE)
+KERNEL_STRUCT_MEMBER(subsurface, float, roughness, KERNEL_FEATURE_SUBSURFACE)
+KERNEL_STRUCT_END(subsurface)
+
+/********************************** Volume Stack ******************************/
+
+KERNEL_STRUCT_BEGIN(volume_stack)
+KERNEL_STRUCT_ARRAY_MEMBER(volume_stack, int, object, KERNEL_FEATURE_VOLUME)
+KERNEL_STRUCT_ARRAY_MEMBER(volume_stack, int, shader, KERNEL_FEATURE_VOLUME)
+KERNEL_STRUCT_END_ARRAY(volume_stack, INTEGRATOR_VOLUME_STACK_SIZE)
+
+/********************************* Shadow Path State **************************/
+
+KERNEL_STRUCT_BEGIN(shadow_path)
+/* Current ray bounce depth. */
+KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, bounce, KERNEL_FEATURE_PATH_TRACING)
+/* Current transparent ray bounce depth. */
+KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, transparent_bounce, KERNEL_FEATURE_PATH_TRACING)
+/* DeviceKernel bit indicating queued kernels.
+ * TODO: reduce size? */
+KERNEL_STRUCT_MEMBER(shadow_path, uint32_t, queued_kernel, KERNEL_FEATURE_PATH_TRACING)
+/* enum PathRayFlag */
+KERNEL_STRUCT_MEMBER(shadow_path, uint32_t, flag, KERNEL_FEATURE_PATH_TRACING)
+/* Throughput. */
+KERNEL_STRUCT_MEMBER(shadow_path, float3, throughput, KERNEL_FEATURE_PATH_TRACING)
+/* Throughput for shadow pass. */
+KERNEL_STRUCT_MEMBER(shadow_path, float3, unshadowed_throughput, KERNEL_FEATURE_SHADOW_PASS)
+/* Ratio of throughput to distinguish diffuse and glossy render passes. */
+KERNEL_STRUCT_MEMBER(shadow_path, float3, diffuse_glossy_ratio, KERNEL_FEATURE_LIGHT_PASSES)
+/* Number of intersections found by ray-tracing. */
+KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, num_hits, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_END(shadow_path)
+
+/********************************** Shadow Ray *******************************/
+
+KERNEL_STRUCT_BEGIN(shadow_ray)
+KERNEL_STRUCT_MEMBER(shadow_ray, float3, P, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_ray, float3, D, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_ray, float, t, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_ray, float, time, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_ray, float, dP, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_END(shadow_ray)
+
+/*********************** Shadow Intersection result **************************/
+
+/* Result from scene intersection. */
+KERNEL_STRUCT_BEGIN(shadow_isect)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float, t, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float, u, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float, v, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, int, prim, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, int, object, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, int, type, KERNEL_FEATURE_PATH_TRACING)
+/* TODO: exclude for GPU. */
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float3, Ng, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_END_ARRAY(shadow_isect, INTEGRATOR_SHADOW_ISECT_SIZE)
+
+/**************************** Shadow Volume Stack *****************************/
+
+KERNEL_STRUCT_BEGIN(shadow_volume_stack)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_volume_stack, int, object, KERNEL_FEATURE_VOLUME)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_volume_stack, int, shader, KERNEL_FEATURE_VOLUME)
+KERNEL_STRUCT_END_ARRAY(shadow_volume_stack, INTEGRATOR_VOLUME_STACK_SIZE)
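
As a rough illustration (not part of the patch) of how this template is consumed: integrator_state.h defines the KERNEL_STRUCT_* macros before including it, so on the GPU the `ray` section above expands to a structure-of-arrays along the lines of the following (simplified, and assuming KERNEL_STRUCT_BEGIN opens an anonymous struct):

  struct {
    float3 *P;
    float3 *D;
    float *t;
    float *time;
    float *dP;
    float *dD;
  } ray;

On the CPU the same section expands to plain members instead of pointers, giving one self-contained IntegratorStateCPU per path.
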
diff --git a/intern/cycles/kernel/integrator/integrator_state_util.h b/intern/cycles/kernel/integrator/integrator_state_util.h
new file mode 100644
index 00000000000..cdf412fe22f
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_state_util.h
@@ -0,0 +1,273 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/integrator/integrator_state.h"
+#include "kernel/kernel_differential.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Ray */
+
+ccl_device_forceinline void integrator_state_write_ray(INTEGRATOR_STATE_ARGS,
+ const Ray *ccl_restrict ray)
+{
+ INTEGRATOR_STATE_WRITE(ray, P) = ray->P;
+ INTEGRATOR_STATE_WRITE(ray, D) = ray->D;
+ INTEGRATOR_STATE_WRITE(ray, t) = ray->t;
+ INTEGRATOR_STATE_WRITE(ray, time) = ray->time;
+ INTEGRATOR_STATE_WRITE(ray, dP) = ray->dP;
+ INTEGRATOR_STATE_WRITE(ray, dD) = ray->dD;
+}
+
+ccl_device_forceinline void integrator_state_read_ray(INTEGRATOR_STATE_CONST_ARGS,
+ Ray *ccl_restrict ray)
+{
+ ray->P = INTEGRATOR_STATE(ray, P);
+ ray->D = INTEGRATOR_STATE(ray, D);
+ ray->t = INTEGRATOR_STATE(ray, t);
+ ray->time = INTEGRATOR_STATE(ray, time);
+ ray->dP = INTEGRATOR_STATE(ray, dP);
+ ray->dD = INTEGRATOR_STATE(ray, dD);
+}
+
+/* Shadow Ray */
+
+ccl_device_forceinline void integrator_state_write_shadow_ray(INTEGRATOR_STATE_ARGS,
+ const Ray *ccl_restrict ray)
+{
+ INTEGRATOR_STATE_WRITE(shadow_ray, P) = ray->P;
+ INTEGRATOR_STATE_WRITE(shadow_ray, D) = ray->D;
+ INTEGRATOR_STATE_WRITE(shadow_ray, t) = ray->t;
+ INTEGRATOR_STATE_WRITE(shadow_ray, time) = ray->time;
+ INTEGRATOR_STATE_WRITE(shadow_ray, dP) = ray->dP;
+}
+
+ccl_device_forceinline void integrator_state_read_shadow_ray(INTEGRATOR_STATE_CONST_ARGS,
+ Ray *ccl_restrict ray)
+{
+ ray->P = INTEGRATOR_STATE(shadow_ray, P);
+ ray->D = INTEGRATOR_STATE(shadow_ray, D);
+ ray->t = INTEGRATOR_STATE(shadow_ray, t);
+ ray->time = INTEGRATOR_STATE(shadow_ray, time);
+ ray->dP = INTEGRATOR_STATE(shadow_ray, dP);
+ ray->dD = differential_zero_compact();
+}
+
+/* Intersection */
+
+ccl_device_forceinline void integrator_state_write_isect(INTEGRATOR_STATE_ARGS,
+ const Intersection *ccl_restrict isect)
+{
+ INTEGRATOR_STATE_WRITE(isect, t) = isect->t;
+ INTEGRATOR_STATE_WRITE(isect, u) = isect->u;
+ INTEGRATOR_STATE_WRITE(isect, v) = isect->v;
+ INTEGRATOR_STATE_WRITE(isect, object) = isect->object;
+ INTEGRATOR_STATE_WRITE(isect, prim) = isect->prim;
+ INTEGRATOR_STATE_WRITE(isect, type) = isect->type;
+#ifdef __EMBREE__
+ INTEGRATOR_STATE_WRITE(isect, Ng) = isect->Ng;
+#endif
+}
+
+ccl_device_forceinline void integrator_state_read_isect(INTEGRATOR_STATE_CONST_ARGS,
+ Intersection *ccl_restrict isect)
+{
+ isect->prim = INTEGRATOR_STATE(isect, prim);
+ isect->object = INTEGRATOR_STATE(isect, object);
+ isect->type = INTEGRATOR_STATE(isect, type);
+ isect->u = INTEGRATOR_STATE(isect, u);
+ isect->v = INTEGRATOR_STATE(isect, v);
+ isect->t = INTEGRATOR_STATE(isect, t);
+#ifdef __EMBREE__
+ isect->Ng = INTEGRATOR_STATE(isect, Ng);
+#endif
+}
+
+ccl_device_forceinline VolumeStack integrator_state_read_volume_stack(INTEGRATOR_STATE_CONST_ARGS,
+ int i)
+{
+ VolumeStack entry = {INTEGRATOR_STATE_ARRAY(volume_stack, i, object),
+ INTEGRATOR_STATE_ARRAY(volume_stack, i, shader)};
+ return entry;
+}
+
+ccl_device_forceinline void integrator_state_write_volume_stack(INTEGRATOR_STATE_ARGS,
+ int i,
+ VolumeStack entry)
+{
+ INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, i, object) = entry.object;
+ INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, i, shader) = entry.shader;
+}
+
+ccl_device_forceinline bool integrator_state_volume_stack_is_empty(INTEGRATOR_STATE_CONST_ARGS)
+{
+ return (kernel_data.kernel_features & KERNEL_FEATURE_VOLUME) ?
+ INTEGRATOR_STATE_ARRAY(volume_stack, 0, shader) == SHADER_NONE :
+ true;
+}
+
+/* Shadow Intersection */
+
+ccl_device_forceinline void integrator_state_write_shadow_isect(
+ INTEGRATOR_STATE_ARGS, const Intersection *ccl_restrict isect, const int index)
+{
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, t) = isect->t;
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, u) = isect->u;
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, v) = isect->v;
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, object) = isect->object;
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, prim) = isect->prim;
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, type) = isect->type;
+#ifdef __EMBREE__
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, Ng) = isect->Ng;
+#endif
+}
+
+ccl_device_forceinline void integrator_state_read_shadow_isect(INTEGRATOR_STATE_CONST_ARGS,
+ Intersection *ccl_restrict isect,
+ const int index)
+{
+ isect->prim = INTEGRATOR_STATE_ARRAY(shadow_isect, index, prim);
+ isect->object = INTEGRATOR_STATE_ARRAY(shadow_isect, index, object);
+ isect->type = INTEGRATOR_STATE_ARRAY(shadow_isect, index, type);
+ isect->u = INTEGRATOR_STATE_ARRAY(shadow_isect, index, u);
+ isect->v = INTEGRATOR_STATE_ARRAY(shadow_isect, index, v);
+ isect->t = INTEGRATOR_STATE_ARRAY(shadow_isect, index, t);
+#ifdef __EMBREE__
+ isect->Ng = INTEGRATOR_STATE_ARRAY(shadow_isect, index, Ng);
+#endif
+}
+
+ccl_device_forceinline void integrator_state_copy_volume_stack_to_shadow(INTEGRATOR_STATE_ARGS)
+{
+ if (kernel_data.kernel_features & KERNEL_FEATURE_VOLUME) {
+ for (int i = 0; i < INTEGRATOR_VOLUME_STACK_SIZE; i++) {
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_volume_stack, i, object) = INTEGRATOR_STATE_ARRAY(
+ volume_stack, i, object);
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_volume_stack, i, shader) = INTEGRATOR_STATE_ARRAY(
+ volume_stack, i, shader);
+ }
+ }
+}
+
+ccl_device_forceinline VolumeStack
+integrator_state_read_shadow_volume_stack(INTEGRATOR_STATE_CONST_ARGS, int i)
+{
+ VolumeStack entry = {INTEGRATOR_STATE_ARRAY(shadow_volume_stack, i, object),
+ INTEGRATOR_STATE_ARRAY(shadow_volume_stack, i, shader)};
+ return entry;
+}
+
+ccl_device_forceinline bool integrator_state_shadow_volume_stack_is_empty(
+ INTEGRATOR_STATE_CONST_ARGS)
+{
+ return (kernel_data.kernel_features & KERNEL_FEATURE_VOLUME) ?
+ INTEGRATOR_STATE_ARRAY(shadow_volume_stack, 0, shader) == SHADER_NONE :
+ true;
+}
+
+ccl_device_forceinline void integrator_state_write_shadow_volume_stack(INTEGRATOR_STATE_ARGS,
+ int i,
+ VolumeStack entry)
+{
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_volume_stack, i, object) = entry.object;
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_volume_stack, i, shader) = entry.shader;
+}
+
+#if defined(__KERNEL_GPU__)
+ccl_device_inline void integrator_state_copy_only(const IntegratorState to_state,
+ const IntegratorState state)
+{
+ int index;
+
+ /* Rely on the compiler to optimize out unused assignments and `while(false)`'s. */
+
+# define KERNEL_STRUCT_BEGIN(name) \
+ index = 0; \
+ do {
+
+# define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
+ if (kernel_integrator_state.parent_struct.name != nullptr) { \
+ kernel_integrator_state.parent_struct.name[to_state] = \
+ kernel_integrator_state.parent_struct.name[state]; \
+ }
+
+# define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
+ if (kernel_integrator_state.parent_struct[index].name != nullptr) { \
+ kernel_integrator_state.parent_struct[index].name[to_state] = \
+ kernel_integrator_state.parent_struct[index].name[state]; \
+ }
+
+# define KERNEL_STRUCT_END(name) \
+ } \
+ while (false) \
+ ;
+
+# define KERNEL_STRUCT_END_ARRAY(name, array_size) \
+ ++index; \
+ } \
+ while (index < array_size) \
+ ;
+
+# include "kernel/integrator/integrator_state_template.h"
+
+# undef KERNEL_STRUCT_BEGIN
+# undef KERNEL_STRUCT_MEMBER
+# undef KERNEL_STRUCT_ARRAY_MEMBER
+# undef KERNEL_STRUCT_END
+# undef KERNEL_STRUCT_END_ARRAY
+}
+
+ccl_device_inline void integrator_state_move(const IntegratorState to_state,
+ const IntegratorState state)
+{
+ integrator_state_copy_only(to_state, state);
+
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0;
+ INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0;
+}
+
+#endif
+
+/* NOTE: Leaves kernel scheduling information untouched. Use INIT semantic for one of the paths
+ * after this function. */
+ccl_device_inline void integrator_state_shadow_catcher_split(INTEGRATOR_STATE_ARGS)
+{
+#if defined(__KERNEL_GPU__)
+ const IntegratorState to_state = atomic_fetch_and_add_uint32(
+ &kernel_integrator_state.next_shadow_catcher_path_index[0], 1);
+
+ integrator_state_copy_only(to_state, state);
+
+ kernel_integrator_state.path.flag[to_state] |= PATH_RAY_SHADOW_CATCHER_PASS;
+
+ /* Sanity check: expect to split in the intersect-closest kernel, where there is no shadow ray
+ * and no sorting yet. */
+ kernel_assert(INTEGRATOR_STATE(shadow_path, queued_kernel) == 0);
+ kernel_assert(kernel_integrator_state.sort_key_counter[INTEGRATOR_STATE(path, queued_kernel)] ==
+ nullptr);
+#else
+
+ IntegratorStateCPU *ccl_restrict split_state = state + 1;
+
+ *split_state = *state;
+
+ split_state->path.flag |= PATH_RAY_SHADOW_CATCHER_PASS;
+#endif
+}
+
+CCL_NAMESPACE_END
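
A minimal usage sketch (not part of the patch), assuming a kernel body where the INTEGRATOR_STATE_ARGS are in scope; the clamp value is arbitrary:

  Ray ray ccl_optional_struct_init;
  integrator_state_read_ray(INTEGRATOR_STATE_PASS, &ray);

  /* Hypothetical edit: clamp the ray extent before writing the state back. */
  ray.t = fminf(ray.t, 100.0f);
  integrator_state_write_ray(INTEGRATOR_STATE_PASS, &ray);
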
diff --git a/intern/cycles/kernel/integrator/integrator_subsurface.h b/intern/cycles/kernel/integrator/integrator_subsurface.h
new file mode 100644
index 00000000000..9490738404e
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_subsurface.h
@@ -0,0 +1,623 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/kernel_path_state.h"
+#include "kernel/kernel_projection.h"
+#include "kernel/kernel_shader.h"
+
+#include "kernel/bvh/bvh.h"
+
+#include "kernel/closure/alloc.h"
+#include "kernel/closure/bsdf_diffuse.h"
+#include "kernel/closure/bsdf_principled_diffuse.h"
+#include "kernel/closure/bssrdf.h"
+#include "kernel/closure/volume.h"
+
+#include "kernel/integrator/integrator_intersect_volume_stack.h"
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __SUBSURFACE__
+
+ccl_device int subsurface_bounce(INTEGRATOR_STATE_ARGS, ShaderData *sd, const ShaderClosure *sc)
+{
+ /* We should never have two consecutive BSSRDF bounces, the second one should
+ * be converted to a diffuse BSDF to avoid this. */
+ kernel_assert(!(INTEGRATOR_STATE(path, flag) & PATH_RAY_DIFFUSE_ANCESTOR));
+
+ /* Setup path state for intersect_subsurface kernel. */
+ const Bssrdf *bssrdf = (const Bssrdf *)sc;
+
+ /* Setup ray into surface. */
+ INTEGRATOR_STATE_WRITE(ray, P) = sd->P;
+ INTEGRATOR_STATE_WRITE(ray, D) = sd->N;
+ INTEGRATOR_STATE_WRITE(ray, t) = FLT_MAX;
+ INTEGRATOR_STATE_WRITE(ray, dP) = differential_make_compact(sd->dP);
+ INTEGRATOR_STATE_WRITE(ray, dD) = differential_zero_compact();
+
+ /* Pass along object info, reusing isect to save memory. */
+ INTEGRATOR_STATE_WRITE(isect, Ng) = sd->Ng;
+ INTEGRATOR_STATE_WRITE(isect, object) = sd->object;
+
+ /* Pass BSSRDF parameters. */
+ const uint32_t path_flag = INTEGRATOR_STATE_WRITE(path, flag);
+ INTEGRATOR_STATE_WRITE(path, flag) = (path_flag & ~PATH_RAY_CAMERA) | PATH_RAY_SUBSURFACE;
+ INTEGRATOR_STATE_WRITE(path, throughput) *= shader_bssrdf_sample_weight(sd, sc);
+
+ if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
+ if (INTEGRATOR_STATE(path, bounce) == 0) {
+ INTEGRATOR_STATE_WRITE(path, diffuse_glossy_ratio) = one_float3();
+ }
+ }
+
+ INTEGRATOR_STATE_WRITE(subsurface, albedo) = bssrdf->albedo;
+ INTEGRATOR_STATE_WRITE(subsurface, radius) = bssrdf->radius;
+ INTEGRATOR_STATE_WRITE(subsurface, roughness) = bssrdf->roughness;
+ INTEGRATOR_STATE_WRITE(subsurface, anisotropy) = bssrdf->anisotropy;
+
+ return LABEL_SUBSURFACE_SCATTER;
+}
+
+ccl_device void subsurface_shader_data_setup(INTEGRATOR_STATE_ARGS, ShaderData *sd)
+{
+ /* Get bump mapped normal from shader evaluation at exit point. */
+ float3 N = sd->N;
+ if (sd->flag & SD_HAS_BSSRDF_BUMP) {
+ N = shader_bssrdf_normal(sd);
+ }
+
+ /* Setup diffuse BSDF at the exit point. This replaces shader_eval_surface. */
+ sd->flag &= ~SD_CLOSURE_FLAGS;
+ sd->num_closure = 0;
+ sd->num_closure_left = kernel_data.max_closures;
+
+ const float3 weight = one_float3();
+ const float roughness = INTEGRATOR_STATE(subsurface, roughness);
+
+# ifdef __PRINCIPLED__
+ if (roughness != FLT_MAX) {
+ PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf *)bsdf_alloc(
+ sd, sizeof(PrincipledDiffuseBsdf), weight);
+
+ if (bsdf) {
+ bsdf->N = N;
+ bsdf->roughness = roughness;
+ sd->flag |= bsdf_principled_diffuse_setup(bsdf);
+
+ /* replace CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID with this special ID so render passes
+ * can recognize it as not being a regular Disney principled diffuse closure */
+ bsdf->type = CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID;
+ }
+ }
+ else
+# endif /* __PRINCIPLED__ */
+ {
+ DiffuseBsdf *bsdf = (DiffuseBsdf *)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight);
+
+ if (bsdf) {
+ bsdf->N = N;
+ sd->flag |= bsdf_diffuse_setup(bsdf);
+
+ /* replace CLOSURE_BSDF_DIFFUSE_ID with this special ID so render passes
+ * can recognize it as not being a regular diffuse closure */
+ bsdf->type = CLOSURE_BSDF_BSSRDF_ID;
+ }
+ }
+}
+
+/* Random walk subsurface scattering.
+ *
+ * "Practical and Controllable Subsurface Scattering for Production Path
+ * Tracing". Matt Jen-Yuan Chiang, Peter Kutz, Brent Burley. SIGGRAPH 2016. */
+
+/* Support for anisotropy from:
+ * "Path Traced Subsurface Scattering using Anisotropic Phase Functions
+ * and Non-Exponential Free Flights".
+ * Magnus Wrenninge, Ryusuke Villemin, Christophe Hery.
+ * https://graphics.pixar.com/library/PathTracedSubsurface/ */
+
+ccl_device void subsurface_random_walk_remap(
+ const float albedo, const float d, float g, float *sigma_t, float *alpha)
+{
+ /* Compute attenuation and scattering coefficients from albedo. */
+ const float g2 = g * g;
+ const float g3 = g2 * g;
+ const float g4 = g3 * g;
+ const float g5 = g4 * g;
+ const float g6 = g5 * g;
+ const float g7 = g6 * g;
+
+ const float A = 1.8260523782f + -1.28451056436f * g + -1.79904629312f * g2 +
+ 9.19393289202f * g3 + -22.8215585862f * g4 + 32.0234874259f * g5 +
+ -23.6264803333f * g6 + 7.21067002658f * g7;
+ const float B = 4.98511194385f +
+ 0.127355959438f *
+ expf(31.1491581433f * g + -201.847017512f * g2 + 841.576016723f * g3 +
+ -2018.09288505f * g4 + 2731.71560286f * g5 + -1935.41424244f * g6 +
+ 559.009054474f * g7);
+ const float C = 1.09686102424f + -0.394704063468f * g + 1.05258115941f * g2 +
+ -8.83963712726f * g3 + 28.8643230661f * g4 + -46.8802913581f * g5 +
+ 38.5402837518f * g6 + -12.7181042538f * g7;
+ const float D = 0.496310210422f + 0.360146581622f * g + -2.15139309747f * g2 +
+ 17.8896899217f * g3 + -55.2984010333f * g4 + 82.065982243f * g5 +
+ -58.5106008578f * g6 + 15.8478295021f * g7;
+ const float E = 4.23190299701f +
+ 0.00310603949088f *
+ expf(76.7316253952f * g + -594.356773233f * g2 + 2448.8834203f * g3 +
+ -5576.68528998f * g4 + 7116.60171912f * g5 + -4763.54467887f * g6 +
+ 1303.5318055f * g7);
+ const float F = 2.40602999408f + -2.51814844609f * g + 9.18494908356f * g2 +
+ -79.2191708682f * g3 + 259.082868209f * g4 + -403.613804597f * g5 +
+ 302.85712436f * g6 + -87.4370473567f * g7;
+
+ const float blend = powf(albedo, 0.25f);
+
+ *alpha = (1.0f - blend) * A * powf(atanf(B * albedo), C) +
+ blend * D * powf(atanf(E * albedo), F);
+ *alpha = clamp(*alpha, 0.0f, 0.999999f); // because of numerical precision
+
+ float sigma_t_prime = 1.0f / fmaxf(d, 1e-16f);
+ *sigma_t = sigma_t_prime / (1.0f - g);
+}
+
+ccl_device void subsurface_random_walk_coefficients(const float3 albedo,
+ const float3 radius,
+ const float anisotropy,
+ float3 *sigma_t,
+ float3 *alpha,
+ float3 *throughput)
+{
+ float sigma_t_x, sigma_t_y, sigma_t_z;
+ float alpha_x, alpha_y, alpha_z;
+
+ subsurface_random_walk_remap(albedo.x, radius.x, anisotropy, &sigma_t_x, &alpha_x);
+ subsurface_random_walk_remap(albedo.y, radius.y, anisotropy, &sigma_t_y, &alpha_y);
+ subsurface_random_walk_remap(albedo.z, radius.z, anisotropy, &sigma_t_z, &alpha_z);
+
+ /* Throughput already contains closure weight at this point, which includes the
+ * albedo, as well as closure mixing and Fresnel weights. Divide out the albedo
+ * which will be added through scattering. */
+ *throughput = safe_divide_color(*throughput, albedo);
+
+ /* With low albedo values (like 0.025) we get diffusion_length 1.0 and
+ * infinite phase functions. To avoid a sharp discontinuity as we go from
+ * such values to 0.0, increase alpha and reduce the throughput to compensate. */
+ const float min_alpha = 0.2f;
+ if (alpha_x < min_alpha) {
+ (*throughput).x *= alpha_x / min_alpha;
+ alpha_x = min_alpha;
+ }
+ if (alpha_y < min_alpha) {
+ (*throughput).y *= alpha_y / min_alpha;
+ alpha_y = min_alpha;
+ }
+ if (alpha_z < min_alpha) {
+ (*throughput).z *= alpha_z / min_alpha;
+ alpha_z = min_alpha;
+ }
+
+ *sigma_t = make_float3(sigma_t_x, sigma_t_y, sigma_t_z);
+ *alpha = make_float3(alpha_x, alpha_y, alpha_z);
+}
+
+/* References for Dwivedi sampling:
+ *
+ * [1] "A Zero-variance-based Sampling Scheme for Monte Carlo Subsurface Scattering"
+ * by Jaroslav Křivánek and Eugene d'Eon (SIGGRAPH 2014)
+ * https://cgg.mff.cuni.cz/~jaroslav/papers/2014-zerovar/
+ *
+ * [2] "Improving the Dwivedi Sampling Scheme"
+ * by Johannes Meng, Johannes Hanika, and Carsten Dachsbacher (EGSR 2016)
+ * https://cg.ivd.kit.edu/1951.php
+ *
+ * [3] "Zero-Variance Theory for Efficient Subsurface Scattering"
+ * by Eugene d'Eon and Jaroslav Křivánek (SIGGRAPH 2020)
+ * https://iliyan.com/publications/RenderingCourse2020
+ */
+
+ccl_device_forceinline float eval_phase_dwivedi(float v, float phase_log, float cos_theta)
+{
+ /* Eq. 9 from [2] using precomputed log((v + 1) / (v - 1)) */
+ return 1.0f / ((v - cos_theta) * phase_log);
+}
+
+ccl_device_forceinline float sample_phase_dwivedi(float v, float phase_log, float rand)
+{
+ /* Based on Eq. 10 from [2]: `v - (v + 1) * pow((v - 1) / (v + 1), rand)`
+ * Since we're already pre-computing `phase_log = log((v + 1) / (v - 1))` for the evaluation,
+ * we can implement the power function like this. */
+ return v - (v + 1.0f) * expf(-rand * phase_log);
+}
+
+ccl_device_forceinline float diffusion_length_dwivedi(float alpha)
+{
+ /* Eq. 67 from [3] */
+ return 1.0f / sqrtf(1.0f - powf(alpha, 2.44294f - 0.0215813f * alpha + 0.578637f / alpha));
+}
+
+ccl_device_forceinline float3 direction_from_cosine(float3 D, float cos_theta, float randv)
+{
+ float sin_theta = safe_sqrtf(1.0f - cos_theta * cos_theta);
+ float phi = M_2PI_F * randv;
+ float3 dir = make_float3(sin_theta * cosf(phi), sin_theta * sinf(phi), cos_theta);
+
+ float3 T, B;
+ make_orthonormals(D, &T, &B);
+ return dir.x * T + dir.y * B + dir.z * D;
+}
+
+ccl_device_forceinline float3 subsurface_random_walk_pdf(float3 sigma_t,
+ float t,
+ bool hit,
+ float3 *transmittance)
+{
+ float3 T = volume_color_transmittance(sigma_t, t);
+ if (transmittance) {
+ *transmittance = T;
+ }
+ return hit ? T : sigma_t * T;
+}
+
+/* Define the variable below to activate the similarity code;
+ * the value represents the cutoff level. */
+# define SUBSURFACE_RANDOM_WALK_SIMILARITY_LEVEL 9
+
+ccl_device_inline bool subsurface_random_walk(INTEGRATOR_STATE_ARGS,
+ RNGState rng_state,
+ Ray &ray,
+ LocalIntersection &ss_isect)
+{
+ float bssrdf_u, bssrdf_v;
+ path_state_rng_2D(kg, &rng_state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
+
+ const float3 P = INTEGRATOR_STATE(ray, P);
+ const float3 N = INTEGRATOR_STATE(ray, D);
+ const float ray_dP = INTEGRATOR_STATE(ray, dP);
+ const float time = INTEGRATOR_STATE(ray, time);
+ const float3 Ng = INTEGRATOR_STATE(isect, Ng);
+ const int object = INTEGRATOR_STATE(isect, object);
+
+ /* Sample diffuse surface scatter into the object. */
+ float3 D;
+ float pdf;
+ sample_cos_hemisphere(-N, bssrdf_u, bssrdf_v, &D, &pdf);
+ if (dot(-Ng, D) <= 0.0f) {
+ return false;
+ }
+
+ /* Setup ray. */
+ ray.P = ray_offset(P, -Ng);
+ ray.D = D;
+ ray.t = FLT_MAX;
+ ray.time = time;
+ ray.dP = ray_dP;
+ ray.dD = differential_zero_compact();
+
+# ifndef __KERNEL_OPTIX__
+ /* Compute or fetch object transforms. */
+ Transform ob_itfm ccl_optional_struct_init;
+ Transform ob_tfm = object_fetch_transform_motion_test(kg, object, time, &ob_itfm);
+# endif
+
+ /* Convert subsurface to volume coefficients.
+ * The single-scattering albedo is named alpha to avoid confusion with the surface albedo. */
+ const float3 albedo = INTEGRATOR_STATE(subsurface, albedo);
+ const float3 radius = INTEGRATOR_STATE(subsurface, radius);
+ const float anisotropy = INTEGRATOR_STATE(subsurface, anisotropy);
+
+ float3 sigma_t, alpha;
+ float3 throughput = INTEGRATOR_STATE_WRITE(path, throughput);
+ subsurface_random_walk_coefficients(albedo, radius, anisotropy, &sigma_t, &alpha, &throughput);
+ float3 sigma_s = sigma_t * alpha;
+
+ /* Theoretically it should be better to use the exact alpha for the channel we're sampling at
+ * each bounce, but in practice there doesn't seem to be a noticeable difference in exchange
+ * for making the code significantly more complex and slower (if direction sampling depends on
+ * the sampled channel, we need to compute its PDF per-channel and consider it for MIS later on).
+ *
+ * Since the strength of the guided sampling increases as alpha gets lower, using a value that
+ * is too low results in fireflies while one that's too high just gives a bit more noise.
+ * Therefore, the code here uses the highest of the three albedos to be safe. */
+ const float diffusion_length = diffusion_length_dwivedi(max3(alpha));
+
+ if (diffusion_length == 1.0f) {
+ /* With specific values of alpha the length might become 1, which in the asymptotic case makes
+ * the phase function infinite. After the first bounce this would cause the throughput to be 0.
+ * Exit early, avoiding numerical issues and extra unneeded work. */
+ return false;
+ }
+
+ /* Precompute term for phase sampling. */
+ const float phase_log = logf((diffusion_length + 1.0f) / (diffusion_length - 1.0f));
+
+ /* Modify state for RNGs, decorrelated from other paths. */
+ rng_state.rng_hash = cmj_hash(rng_state.rng_hash + rng_state.rng_offset, 0xdeadbeef);
+
+ /* Random walk until we hit the surface again. */
+ bool hit = false;
+ bool have_opposite_interface = false;
+ float opposite_distance = 0.0f;
+
+ /* Todo: Disable for alpha>0.999 or so? */
+ /* Our heuristic, a compromise between guiding and classic. */
+ const float guided_fraction = 1.0f - fmaxf(0.5f, powf(fabsf(anisotropy), 0.125f));
+
+# ifdef SUBSURFACE_RANDOM_WALK_SIMILARITY_LEVEL
+ float3 sigma_s_star = sigma_s * (1.0f - anisotropy);
+ float3 sigma_t_star = sigma_t - sigma_s + sigma_s_star;
+ float3 sigma_t_org = sigma_t;
+ float3 sigma_s_org = sigma_s;
+ const float anisotropy_org = anisotropy;
+ const float guided_fraction_org = guided_fraction;
+# endif
+
+ for (int bounce = 0; bounce < BSSRDF_MAX_BOUNCES; bounce++) {
+ /* Advance random number offset. */
+ rng_state.rng_offset += PRNG_BOUNCE_NUM;
+
+# ifdef SUBSURFACE_RANDOM_WALK_SIMILARITY_LEVEL
+ // shadow with local variables according to depth
+ float anisotropy, guided_fraction;
+ float3 sigma_s, sigma_t;
+ if (bounce <= SUBSURFACE_RANDOM_WALK_SIMILARITY_LEVEL) {
+ anisotropy = anisotropy_org;
+ guided_fraction = guided_fraction_org;
+ sigma_t = sigma_t_org;
+ sigma_s = sigma_s_org;
+ }
+ else {
+ anisotropy = 0.0f;
+ guided_fraction = 0.75f; // back to isotropic heuristic from Blender
+ sigma_t = sigma_t_star;
+ sigma_s = sigma_s_star;
+ }
+# endif
+
+ /* Sample color channel, use MIS with balance heuristic. */
+ float rphase = path_state_rng_1D(kg, &rng_state, PRNG_PHASE_CHANNEL);
+ float3 channel_pdf;
+ int channel = volume_sample_channel(alpha, throughput, rphase, &channel_pdf);
+ float sample_sigma_t = volume_channel_get(sigma_t, channel);
+ float randt = path_state_rng_1D(kg, &rng_state, PRNG_SCATTER_DISTANCE);
+
+ /* We need the result of the raycast to compute the full guided PDF, so just remember the
+ * relevant terms to avoid recomputing them later. */
+ float backward_fraction = 0.0f;
+ float forward_pdf_factor = 0.0f;
+ float forward_stretching = 1.0f;
+ float backward_pdf_factor = 0.0f;
+ float backward_stretching = 1.0f;
+
+ /* For the initial ray, we already know the direction, so just do classic distance sampling. */
+ if (bounce > 0) {
+ /* Decide whether we should use guided or classic sampling. */
+ bool guided = (path_state_rng_1D(kg, &rng_state, PRNG_LIGHT_TERMINATE) < guided_fraction);
+
+ /* Determine if we want to sample away from the incoming interface.
+ * This only happens if we found a nearby opposite interface, and the probability for it
+ * depends on how close we are to it already.
+ * This probability term comes from the recorded presentation of [3]. */
+ bool guide_backward = false;
+ if (have_opposite_interface) {
+ /* Compute distance of the random walk between the tangent plane at the starting point
+ * and the assumed opposite interface (the parallel plane that contains the point we
+ * found in our ray query for the opposite side). */
+ float x = clamp(dot(ray.P - P, -N), 0.0f, opposite_distance);
+ backward_fraction = 1.0f /
+ (1.0f + expf((opposite_distance - 2.0f * x) / diffusion_length));
+ guide_backward = path_state_rng_1D(kg, &rng_state, PRNG_TERMINATE) < backward_fraction;
+ }
+
+ /* Sample scattering direction. */
+ float scatter_u, scatter_v;
+ path_state_rng_2D(kg, &rng_state, PRNG_BSDF_U, &scatter_u, &scatter_v);
+ float cos_theta;
+ float hg_pdf;
+ if (guided) {
+ cos_theta = sample_phase_dwivedi(diffusion_length, phase_log, scatter_u);
+ /* The backwards guiding distribution is just mirrored along sd->N, so swapping the
+ * sign here is enough to sample from that instead. */
+ if (guide_backward) {
+ cos_theta = -cos_theta;
+ }
+ float3 newD = direction_from_cosine(N, cos_theta, scatter_v);
+ hg_pdf = single_peaked_henyey_greenstein(dot(ray.D, newD), anisotropy);
+ ray.D = newD;
+ }
+ else {
+ float3 newD = henyey_greenstrein_sample(ray.D, anisotropy, scatter_u, scatter_v, &hg_pdf);
+ cos_theta = dot(newD, N);
+ ray.D = newD;
+ }
+
+ /* Compute PDF factor caused by phase sampling (as the ratio of guided / classic).
+ * Since phase sampling is channel-independent, we can get away with applying a factor
+ * to the guided PDF, which implicitly means pulling out the classic PDF term and letting
+ * it cancel with an equivalent term in the numerator of the full estimator.
+ * For the backward PDF, we again reuse the same probability distribution with a sign swap.
+ */
+ forward_pdf_factor = M_1_2PI_F * eval_phase_dwivedi(diffusion_length, phase_log, cos_theta) /
+ hg_pdf;
+ backward_pdf_factor = M_1_2PI_F *
+ eval_phase_dwivedi(diffusion_length, phase_log, -cos_theta) / hg_pdf;
+
+ /* Prepare distance sampling.
+ * For the backwards case, this also needs the sign swapped since now directions against
+ * sd->N (and therefore with negative cos_theta) are preferred. */
+ forward_stretching = (1.0f - cos_theta / diffusion_length);
+ backward_stretching = (1.0f + cos_theta / diffusion_length);
+ if (guided) {
+ sample_sigma_t *= guide_backward ? backward_stretching : forward_stretching;
+ }
+ }
+
+ /* Sample direction along ray. */
+ float t = -logf(1.0f - randt) / sample_sigma_t;
+
+ /* On the first bounce, we use the raycast to check if the opposite side is nearby.
+ * If yes, we will later use backwards guided sampling in order to have a decent
+ * chance of connecting to it.
+ * Todo: Maybe use less than 10 times the mean free path? */
+ ray.t = (bounce == 0) ? max(t, 10.0f / (min3(sigma_t))) : t;
+ scene_intersect_local(kg, &ray, &ss_isect, object, NULL, 1);
+ hit = (ss_isect.num_hits > 0);
+
+ if (hit) {
+# ifdef __KERNEL_OPTIX__
+ /* t is always in world space with OptiX. */
+ ray.t = ss_isect.hits[0].t;
+# else
+ /* Compute world space distance to surface hit. */
+ float3 D = transform_direction(&ob_itfm, ray.D);
+ D = normalize(D) * ss_isect.hits[0].t;
+ ray.t = len(transform_direction(&ob_tfm, D));
+# endif
+ }
+
+ if (bounce == 0) {
+ /* Check if we hit the opposite side. */
+ if (hit) {
+ have_opposite_interface = true;
+ opposite_distance = dot(ray.P + ray.t * ray.D - P, -N);
+ }
+ /* Apart from the opposite side check, we were supposed to only trace up to distance t,
+ * so check if there would have been a hit in that case. */
+ hit = ray.t < t;
+ }
+
+ /* Use the distance to the exit point for the throughput update if we found one. */
+ if (hit) {
+ t = ray.t;
+ }
+ else if (bounce == 0) {
+ /* Restore original position if nothing was hit after the first bounce,
+ * without the ray_offset() that was added to avoid self-intersection.
+ * Otherwise if that offset is relatively large compared to the scattering
+ * radius, we never go back up high enough to exit the surface. */
+ ray.P = P;
+ }
+
+ /* Advance to new scatter location. */
+ ray.P += t * ray.D;
+
+ float3 transmittance;
+ float3 pdf = subsurface_random_walk_pdf(sigma_t, t, hit, &transmittance);
+ if (bounce > 0) {
+ /* Compute PDF just like we do for classic sampling, but with the stretched sigma_t. */
+ float3 guided_pdf = subsurface_random_walk_pdf(forward_stretching * sigma_t, t, hit, NULL);
+
+ if (have_opposite_interface) {
+ /* First step of MIS: Depending on geometry we might have two methods for guided
+ * sampling, so perform MIS between them. */
+ float3 back_pdf = subsurface_random_walk_pdf(backward_stretching * sigma_t, t, hit, NULL);
+ guided_pdf = mix(
+ guided_pdf * forward_pdf_factor, back_pdf * backward_pdf_factor, backward_fraction);
+ }
+ else {
+ /* Just include phase sampling factor otherwise. */
+ guided_pdf *= forward_pdf_factor;
+ }
+
+ /* Now we apply the MIS balance heuristic between the classic and guided sampling. */
+ pdf = mix(pdf, guided_pdf, guided_fraction);
+ }
+
+ /* Finally, we're applying MIS again to combine the three color channels.
+ * Altogether, the MIS computation combines up to nine different estimators:
+ * {classic, guided, backward_guided} x {r, g, b} */
+ throughput *= (hit ? transmittance : sigma_s * transmittance) / dot(channel_pdf, pdf);
+
+ if (hit) {
+ /* If we hit the surface, we are done. */
+ break;
+ }
+ else if (throughput.x < VOLUME_THROUGHPUT_EPSILON &&
+ throughput.y < VOLUME_THROUGHPUT_EPSILON &&
+ throughput.z < VOLUME_THROUGHPUT_EPSILON) {
+ /* Avoid unnecessary work and precision issue when throughput gets really small. */
+ break;
+ }
+ }
+
+ if (hit) {
+ kernel_assert(isfinite3_safe(throughput));
+ INTEGRATOR_STATE_WRITE(path, throughput) = throughput;
+ }
+
+ return hit;
+}
+
+ccl_device_inline bool subsurface_scatter(INTEGRATOR_STATE_ARGS)
+{
+ RNGState rng_state;
+ path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state);
+
+ Ray ray ccl_optional_struct_init;
+ LocalIntersection ss_isect ccl_optional_struct_init;
+
+ if (!subsurface_random_walk(INTEGRATOR_STATE_PASS, rng_state, ray, ss_isect)) {
+ return false;
+ }
+
+# ifdef __VOLUME__
+ /* Update volume stack if needed. */
+ if (kernel_data.integrator.use_volumes) {
+ const int object = intersection_get_object(kg, &ss_isect.hits[0]);
+ const int object_flag = kernel_tex_fetch(__object_flag, object);
+
+ if (object_flag & SD_OBJECT_INTERSECTS_VOLUME) {
+ float3 P = INTEGRATOR_STATE(ray, P);
+ const float3 Ng = INTEGRATOR_STATE(isect, Ng);
+ const float3 offset_P = ray_offset(P, -Ng);
+
+ integrator_volume_stack_update_for_subsurface(INTEGRATOR_STATE_PASS, offset_P, ray.P);
+ }
+ }
+# endif /* __VOLUME__ */
+
+ /* Pretend ray is coming from the outside towards the exit point. This ensures
+ * correct front/back facing normals.
+ * TODO: find a more elegant solution? */
+ ray.P += ray.D * ray.t * 2.0f;
+ ray.D = -ray.D;
+
+ integrator_state_write_isect(INTEGRATOR_STATE_PASS, &ss_isect.hits[0]);
+ integrator_state_write_ray(INTEGRATOR_STATE_PASS, &ray);
+
+ /* Advance random number offset for bounce. */
+ INTEGRATOR_STATE_WRITE(path, rng_offset) += PRNG_BOUNCE_NUM;
+
+ const int shader = intersection_get_shader(kg, &ss_isect.hits[0]);
+ const int shader_flags = kernel_tex_fetch(__shaders, shader).flags;
+ if ((shader_flags & SD_HAS_RAYTRACE) || (kernel_data.film.pass_ao != PASS_UNUSED)) {
+ INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE,
+ shader);
+ }
+ else {
+ INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE,
+ shader);
+ }
+
+ return true;
+}
+
+#endif /* __SUBSURFACE__ */
+
+CCL_NAMESPACE_END
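
For intuition (not part of the patch): the Dwivedi guiding above only needs the diffusion length and the precomputed log term, and a sampled cosine is then turned into a world-space direction around the guiding axis. The albedo value, `rand_u`, `rand_v` and `N` below are placeholders:

  const float v = diffusion_length_dwivedi(0.8f);                     /* Illustrative alpha = 0.8. */
  const float phase_log = logf((v + 1.0f) / (v - 1.0f));
  const float cos_theta = sample_phase_dwivedi(v, phase_log, rand_u); /* rand_u in [0, 1). */
  const float3 D = direction_from_cosine(N, cos_theta, rand_v);       /* rand_v in [0, 1). */
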
diff --git a/intern/cycles/kernel/integrator/integrator_volume_stack.h b/intern/cycles/kernel/integrator/integrator_volume_stack.h
new file mode 100644
index 00000000000..d53070095f0
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_volume_stack.h
@@ -0,0 +1,223 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Volume Stack
+ *
+ * This is an array of object/shader IDs that the current segment of the path
+ * is inside of. */
+
+template<typename StackReadOp, typename StackWriteOp>
+ccl_device void volume_stack_enter_exit(INTEGRATOR_STATE_ARGS,
+ const ShaderData *sd,
+ StackReadOp stack_read,
+ StackWriteOp stack_write)
+{
+ /* TODO: We should have some way for objects to indicate if they want the
+ * world shader to work inside them. Excluding it by default is problematic
+ * because non-volume objects can't be assumed to be closed manifolds. */
+ if (!(sd->flag & SD_HAS_VOLUME)) {
+ return;
+ }
+
+ if (sd->flag & SD_BACKFACING) {
+ /* Exit volume object: remove from stack. */
+ for (int i = 0;; i++) {
+ VolumeStack entry = stack_read(i);
+ if (entry.shader == SHADER_NONE) {
+ break;
+ }
+
+ if (entry.object == sd->object) {
+ /* Shift back next stack entries. */
+ do {
+ entry = stack_read(i + 1);
+ stack_write(i, entry);
+ i++;
+ } while (entry.shader != SHADER_NONE);
+
+ return;
+ }
+ }
+ }
+ else {
+ /* Enter volume object: add to stack. */
+ int i;
+ for (i = 0;; i++) {
+ VolumeStack entry = stack_read(i);
+ if (entry.shader == SHADER_NONE) {
+ break;
+ }
+
+ /* Already in the stack? then we have nothing to do. */
+ if (entry.object == sd->object) {
+ return;
+ }
+ }
+
+ /* If we exceed the stack limit, ignore. */
+ if (i >= VOLUME_STACK_SIZE - 1) {
+ return;
+ }
+
+ /* Add to the end of the stack. */
+ const VolumeStack new_entry = {sd->object, sd->shader};
+ const VolumeStack empty_entry = {OBJECT_NONE, SHADER_NONE};
+ stack_write(i, new_entry);
+ stack_write(i + 1, empty_entry);
+ }
+}
+
+ccl_device void volume_stack_enter_exit(INTEGRATOR_STATE_ARGS, const ShaderData *sd)
+{
+ volume_stack_enter_exit(
+ INTEGRATOR_STATE_PASS,
+ sd,
+ [=](const int i) { return integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i); },
+ [=](const int i, const VolumeStack entry) {
+ integrator_state_write_volume_stack(INTEGRATOR_STATE_PASS, i, entry);
+ });
+}
+
+ccl_device void shadow_volume_stack_enter_exit(INTEGRATOR_STATE_ARGS, const ShaderData *sd)
+{
+ volume_stack_enter_exit(
+ INTEGRATOR_STATE_PASS,
+ sd,
+ [=](const int i) {
+ return integrator_state_read_shadow_volume_stack(INTEGRATOR_STATE_PASS, i);
+ },
+ [=](const int i, const VolumeStack entry) {
+ integrator_state_write_shadow_volume_stack(INTEGRATOR_STATE_PASS, i, entry);
+ });
+}
+
+/* Clean stack after the last bounce.
+ *
+ * It is expected that all volumes are closed manifolds, so at the time when the ray
+ * hits nothing (for example, on the last bounce which goes to the environment) the
+ * only expected volume in the stack is the world's one. All the other volume
+ * entries should have been exited already.
+ *
+ * This isn't always true because of ray intersection precision issues, which
+ * could leave a non-world volume in the stack indefinitely, causing render
+ * artifacts.
+ *
+ * Use this function after the last bounce to get rid of all volumes apart from
+ * the world's one, avoiding such artifacts.
+ */
+ccl_device_inline void volume_stack_clean(INTEGRATOR_STATE_ARGS)
+{
+ if (kernel_data.background.volume_shader != SHADER_NONE) {
+ /* Keep the world's volume in stack. */
+ INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, 1, shader) = SHADER_NONE;
+ }
+ else {
+ INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, 0, shader) = SHADER_NONE;
+ }
+}
+
+template<typename StackReadOp>
+ccl_device float volume_stack_step_size(INTEGRATOR_STATE_ARGS, StackReadOp stack_read)
+{
+ float step_size = FLT_MAX;
+
+ for (int i = 0;; i++) {
+ VolumeStack entry = stack_read(i);
+ if (entry.shader == SHADER_NONE) {
+ break;
+ }
+
+ int shader_flag = kernel_tex_fetch(__shaders, (entry.shader & SHADER_MASK)).flags;
+
+ bool heterogeneous = false;
+
+ if (shader_flag & SD_HETEROGENEOUS_VOLUME) {
+ heterogeneous = true;
+ }
+ else if (shader_flag & SD_NEED_VOLUME_ATTRIBUTES) {
+ /* We want to render world or objects without any volume grids
+ * as homogeneous, but can only verify this at run-time since other
+ * heterogeneous volume objects may be using the same shader. */
+ int object = entry.object;
+ if (object != OBJECT_NONE) {
+ int object_flag = kernel_tex_fetch(__object_flag, object);
+ if (object_flag & SD_OBJECT_HAS_VOLUME_ATTRIBUTES) {
+ heterogeneous = true;
+ }
+ }
+ }
+
+ if (heterogeneous) {
+ float object_step_size = object_volume_step_size(kg, entry.object);
+ object_step_size *= kernel_data.integrator.volume_step_rate;
+ step_size = fminf(object_step_size, step_size);
+ }
+ }
+
+ return step_size;
+}
+
+typedef enum VolumeSampleMethod {
+ VOLUME_SAMPLE_NONE = 0,
+ VOLUME_SAMPLE_DISTANCE = (1 << 0),
+ VOLUME_SAMPLE_EQUIANGULAR = (1 << 1),
+ VOLUME_SAMPLE_MIS = (VOLUME_SAMPLE_DISTANCE | VOLUME_SAMPLE_EQUIANGULAR),
+} VolumeSampleMethod;
+
+ccl_device VolumeSampleMethod volume_stack_sample_method(INTEGRATOR_STATE_ARGS)
+{
+ VolumeSampleMethod method = VOLUME_SAMPLE_NONE;
+
+ for (int i = 0;; i++) {
+ VolumeStack entry = integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i);
+ if (entry.shader == SHADER_NONE) {
+ break;
+ }
+
+ int shader_flag = kernel_tex_fetch(__shaders, (entry.shader & SHADER_MASK)).flags;
+
+ if (shader_flag & SD_VOLUME_MIS) {
+ /* Multiple importance sampling. */
+ return VOLUME_SAMPLE_MIS;
+ }
+ else if (shader_flag & SD_VOLUME_EQUIANGULAR) {
+ /* Distance + equiangular sampling -> multiple importance sampling. */
+ if (method == VOLUME_SAMPLE_DISTANCE) {
+ return VOLUME_SAMPLE_MIS;
+ }
+
+ /* Only equiangular sampling. */
+ method = VOLUME_SAMPLE_EQUIANGULAR;
+ }
+ else {
+ /* Distance + equiangular sampling -> multiple importance sampling. */
+ if (method == VOLUME_SAMPLE_EQUIANGULAR) {
+ return VOLUME_SAMPLE_MIS;
+ }
+
+ /* Distance sampling only. */
+ method = VOLUME_SAMPLE_DISTANCE;
+ }
+ }
+
+ return method;
+}
+
+CCL_NAMESPACE_END
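
A usage sketch (not part of the patch): the stack is terminated by a SHADER_NONE sentinel, so callers iterate until they reach it, in the same way volume_stack_step_size() and volume_stack_sample_method() do above:

  for (int i = 0;; i++) {
    const VolumeStack entry = integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i);
    if (entry.shader == SHADER_NONE) {
      break; /* End of stack. */
    }
    /* Hypothetical per-entry work goes here. */
  }
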
diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h
index 61653d328f1..9e12d24dcf4 100644
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -14,751 +14,501 @@
* limitations under the License.
*/
+#pragma once
+
+#include "kernel_adaptive_sampling.h"
+#include "kernel_random.h"
+#include "kernel_shadow_catcher.h"
+#include "kernel_write_passes.h"
+
CCL_NAMESPACE_BEGIN
-/* BSDF Eval
+/* --------------------------------------------------------------------
+ * BSDF Evaluation
*
- * BSDF evaluation result, split per BSDF type. This is used to accumulate
- * render passes separately. */
-
-ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, const ShaderData *sd);
+ * BSDF evaluation result, split between diffuse and glossy. This is used to
+ * accumulate render passes separately. Note that reflection, transmission
+ * and volume scattering are written to different render passes, but we assume
+ * that only one of those can happen at a bounce, and so do not need to accumulate
+ * them separately. */
-ccl_device_inline void bsdf_eval_init(BsdfEval *eval,
- ClosureType type,
- float3 value,
- int use_light_pass)
+ccl_device_inline void bsdf_eval_init(BsdfEval *eval, const bool is_diffuse, float3 value)
{
-#ifdef __PASSES__
- eval->use_light_pass = use_light_pass;
-
- if (eval->use_light_pass) {
- eval->diffuse = zero_float3();
- eval->glossy = zero_float3();
- eval->transmission = zero_float3();
- eval->transparent = zero_float3();
- eval->volume = zero_float3();
-
- if (type == CLOSURE_BSDF_TRANSPARENT_ID)
- eval->transparent = value;
- else if (CLOSURE_IS_BSDF_DIFFUSE(type) || CLOSURE_IS_BSDF_BSSRDF(type))
- eval->diffuse = value;
- else if (CLOSURE_IS_BSDF_GLOSSY(type))
- eval->glossy = value;
- else if (CLOSURE_IS_BSDF_TRANSMISSION(type))
- eval->transmission = value;
- else if (CLOSURE_IS_PHASE(type))
- eval->volume = value;
- }
- else
-#endif
- {
+ eval->diffuse = zero_float3();
+ eval->glossy = zero_float3();
+
+ if (is_diffuse) {
eval->diffuse = value;
}
-#ifdef __SHADOW_TRICKS__
- eval->sum_no_mis = zero_float3();
-#endif
+ else {
+ eval->glossy = value;
+ }
}
ccl_device_inline void bsdf_eval_accum(BsdfEval *eval,
- ClosureType type,
+ const bool is_diffuse,
float3 value,
float mis_weight)
{
-#ifdef __SHADOW_TRICKS__
- eval->sum_no_mis += value;
-#endif
value *= mis_weight;
-#ifdef __PASSES__
- if (eval->use_light_pass) {
- if (CLOSURE_IS_BSDF_DIFFUSE(type) || CLOSURE_IS_BSDF_BSSRDF(type))
- eval->diffuse += value;
- else if (CLOSURE_IS_BSDF_GLOSSY(type))
- eval->glossy += value;
- else if (CLOSURE_IS_BSDF_TRANSMISSION(type))
- eval->transmission += value;
- else if (CLOSURE_IS_PHASE(type))
- eval->volume += value;
-
- /* skipping transparent, this function is used by for eval(), will be zero then */
- }
- else
-#endif
- {
- eval->diffuse += value;
- }
-}
-ccl_device_inline bool bsdf_eval_is_zero(BsdfEval *eval)
-{
-#ifdef __PASSES__
- if (eval->use_light_pass) {
- return is_zero(eval->diffuse) && is_zero(eval->glossy) && is_zero(eval->transmission) &&
- is_zero(eval->transparent) && is_zero(eval->volume);
+ if (is_diffuse) {
+ eval->diffuse += value;
}
- else
-#endif
- {
- return is_zero(eval->diffuse);
+ else {
+ eval->glossy += value;
}
}
-ccl_device_inline void bsdf_eval_mis(BsdfEval *eval, float value)
+ccl_device_inline bool bsdf_eval_is_zero(BsdfEval *eval)
{
-#ifdef __PASSES__
- if (eval->use_light_pass) {
- eval->diffuse *= value;
- eval->glossy *= value;
- eval->transmission *= value;
- eval->volume *= value;
-
- /* skipping transparent, this function is used by for eval(), will be zero then */
- }
- else
-#endif
- {
- eval->diffuse *= value;
- }
+ return is_zero(eval->diffuse) && is_zero(eval->glossy);
}
ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float value)
{
-#ifdef __SHADOW_TRICKS__
- eval->sum_no_mis *= value;
-#endif
- bsdf_eval_mis(eval, value);
+ eval->diffuse *= value;
+ eval->glossy *= value;
}
ccl_device_inline void bsdf_eval_mul3(BsdfEval *eval, float3 value)
{
-#ifdef __SHADOW_TRICKS__
- eval->sum_no_mis *= value;
-#endif
-#ifdef __PASSES__
- if (eval->use_light_pass) {
- eval->diffuse *= value;
- eval->glossy *= value;
- eval->transmission *= value;
- eval->volume *= value;
-
- /* skipping transparent, this function is used by for eval(), will be zero then */
- }
- else
- eval->diffuse *= value;
-#else
eval->diffuse *= value;
-#endif
+ eval->glossy *= value;
}
ccl_device_inline float3 bsdf_eval_sum(const BsdfEval *eval)
{
-#ifdef __PASSES__
- if (eval->use_light_pass) {
- return eval->diffuse + eval->glossy + eval->transmission + eval->volume;
- }
- else
-#endif
- return eval->diffuse;
+ return eval->diffuse + eval->glossy;
}
-/* Path Radiance
- *
- * We accumulate different render passes separately. After summing at the end
- * to get the combined result, it should be identical. We definite directly
- * visible as the first non-transparent hit, while indirectly visible are the
- * bounces after that. */
-
-ccl_device_inline void path_radiance_init(KernelGlobals *kg, PathRadiance *L)
+ccl_device_inline float3 bsdf_eval_diffuse_glossy_ratio(const BsdfEval *eval)
{
- /* clear all */
-#ifdef __PASSES__
- L->use_light_pass = kernel_data.film.use_light_pass;
-
- if (kernel_data.film.use_light_pass) {
- L->indirect = zero_float3();
- L->direct_emission = zero_float3();
-
- L->color_diffuse = zero_float3();
- L->color_glossy = zero_float3();
- L->color_transmission = zero_float3();
-
- L->direct_diffuse = zero_float3();
- L->direct_glossy = zero_float3();
- L->direct_transmission = zero_float3();
- L->direct_volume = zero_float3();
-
- L->indirect_diffuse = zero_float3();
- L->indirect_glossy = zero_float3();
- L->indirect_transmission = zero_float3();
- L->indirect_volume = zero_float3();
-
- L->transparent = 0.0f;
- L->emission = zero_float3();
- L->background = zero_float3();
- L->ao = zero_float3();
- L->shadow = zero_float3();
- L->mist = 0.0f;
-
- L->state.diffuse = zero_float3();
- L->state.glossy = zero_float3();
- L->state.transmission = zero_float3();
- L->state.volume = zero_float3();
- L->state.direct = zero_float3();
- }
- else
-#endif
- {
- L->transparent = 0.0f;
- L->emission = zero_float3();
- }
-
-#ifdef __SHADOW_TRICKS__
- L->path_total = zero_float3();
- L->path_total_shaded = zero_float3();
- L->shadow_background_color = zero_float3();
- L->shadow_throughput = 0.0f;
- L->shadow_transparency = 1.0f;
- L->has_shadow_catcher = 0;
-#endif
-
-#ifdef __DENOISING_FEATURES__
- L->denoising_normal = zero_float3();
- L->denoising_albedo = zero_float3();
- L->denoising_depth = 0.0f;
-#endif
+ /* Ratio of diffuse and glossy to recover proportions for writing to render pass.
+ * We assume reflection, transmission and volume scatter to be exclusive. */
+ return safe_divide_float3_float3(eval->diffuse, eval->diffuse + eval->glossy);
}
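bsdf_eval_diffuse_glossy_ratio() exists because only one combined throughput is carried along the path; the ratio recorded at the first bounce lets the accumulation code below split a later contribution back into its diffuse and glossy pass subsets. A scalar sketch of that round trip with made-up numbers (illustration only, not patch code):

#include <cassert>
#include <cmath>

int main()
{
  /* Hypothetical single-channel BSDF evaluation at the first bounce. */
  const float diffuse = 0.3f, glossy = 0.1f;
  const float ratio = diffuse / (diffuse + glossy); /* as in bsdf_eval_diffuse_glossy_ratio() */

  /* Only the summed throughput travels along the path afterwards. */
  const float throughput = diffuse + glossy;
  const float light = 2.0f; /* some later light contribution scaling the throughput */
  const float contribution = throughput * light;

  /* Reconstructed per-pass subsets, as done when writing the render passes. */
  const float diffuse_pass = contribution * ratio;
  const float glossy_pass = contribution * (1.0f - ratio);

  assert(std::fabs(diffuse_pass - diffuse * light) < 1e-5f);
  assert(std::fabs(glossy_pass - glossy * light) < 1e-5f);
  return 0;
}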
-ccl_device_inline void path_radiance_bsdf_bounce(KernelGlobals *kg,
- PathRadianceState *L_state,
- ccl_addr_space float3 *throughput,
- BsdfEval *bsdf_eval,
- float bsdf_pdf,
- int bounce,
- int bsdf_label)
-{
- float inverse_pdf = 1.0f / bsdf_pdf;
-
-#ifdef __PASSES__
- if (kernel_data.film.use_light_pass) {
- if (bounce == 0 && !(bsdf_label & LABEL_TRANSPARENT)) {
- /* first on directly visible surface */
- float3 value = *throughput * inverse_pdf;
-
- L_state->diffuse = bsdf_eval->diffuse * value;
- L_state->glossy = bsdf_eval->glossy * value;
- L_state->transmission = bsdf_eval->transmission * value;
- L_state->volume = bsdf_eval->volume * value;
-
- *throughput = L_state->diffuse + L_state->glossy + L_state->transmission + L_state->volume;
+/* --------------------------------------------------------------------
+ * Clamping
+ *
+ * Clamping is done on a per-contribution basis so that we can write directly
+ * to render buffers instead of using per-thread memory, and to avoid the
+ * impact of clamping on other contributions. */
- L_state->direct = *throughput;
- }
- else {
- /* transparent bounce before first hit, or indirectly visible through BSDF */
- float3 sum = (bsdf_eval_sum(bsdf_eval) + bsdf_eval->transparent) * inverse_pdf;
- *throughput *= sum;
- }
+ccl_device_forceinline void kernel_accum_clamp(const KernelGlobals *kg, float3 *L, int bounce)
+{
+#ifdef __KERNEL_DEBUG_NAN__
+ if (!isfinite3_safe(*L)) {
+ kernel_assert(!"Cycles sample with non-finite value detected");
}
- else
#endif
- {
- *throughput *= bsdf_eval->diffuse * inverse_pdf;
- }
-}
+ /* Make sure all components are finite, so that the contribution is usable by the adaptive
+ * sampling convergence check and the render result never causes issues in
+ * post-processing. */
+ *L = ensure_finite3(*L);
#ifdef __CLAMP_SAMPLE__
-ccl_device_forceinline void path_radiance_clamp(KernelGlobals *kg, float3 *L, int bounce)
-{
float limit = (bounce > 0) ? kernel_data.integrator.sample_clamp_indirect :
kernel_data.integrator.sample_clamp_direct;
float sum = reduce_add(fabs(*L));
if (sum > limit) {
*L *= limit / sum;
}
+#endif
}
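kernel_accum_clamp() rescales the whole contribution by limit / sum instead of clamping channels independently, so a clamped firefly keeps its hue. A small standalone sketch of that rescale with an invented limit (the real limits come from sample_clamp_direct and sample_clamp_indirect):

#include <cmath>
#include <cstdio>

struct Color {
  float x, y, z;
};

/* Same limit / sum rescale as above, on a plain struct for illustration. */
static Color clamp_contribution(Color c, const float limit)
{
  const float sum = std::fabs(c.x) + std::fabs(c.y) + std::fabs(c.z);
  if (sum > limit) {
    const float scale = limit / sum;
    c.x *= scale;
    c.y *= scale;
    c.z *= scale;
  }
  return c;
}

int main()
{
  const Color firefly = {8.0f, 2.0f, 0.0f}; /* sum = 10 */
  const Color clamped = clamp_contribution(firefly, 5.0f);
  std::printf("%.2f %.2f %.2f\n", clamped.x, clamped.y, clamped.z); /* prints 4.00 1.00 0.00 */
  return 0;
}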
-ccl_device_forceinline void path_radiance_clamp_throughput(KernelGlobals *kg,
- float3 *L,
- float3 *throughput,
- int bounce)
-{
- float limit = (bounce > 0) ? kernel_data.integrator.sample_clamp_indirect :
- kernel_data.integrator.sample_clamp_direct;
+/* --------------------------------------------------------------------
+ * Pass accumulation utilities.
+ */
- float sum = reduce_add(fabs(*L));
- if (sum > limit) {
- float clamp_factor = limit / sum;
- *L *= clamp_factor;
- *throughput *= clamp_factor;
- }
+/* Get pointer to pixel in render buffer. */
+ccl_device_forceinline ccl_global float *kernel_accum_pixel_render_buffer(
+ INTEGRATOR_STATE_CONST_ARGS, ccl_global float *ccl_restrict render_buffer)
+{
+ const uint32_t render_pixel_index = INTEGRATOR_STATE(path, render_pixel_index);
+ const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
+ kernel_data.film.pass_stride;
+ return render_buffer + render_buffer_offset;
}
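kernel_accum_pixel_render_buffer() relies on the render buffer being one flat float array with pass_stride floats per pixel; every pass is then just an offset into that per-pixel slab. A minimal indexing sketch with an invented stride and pass offsets (not the real film layout):

#include <cstdint>
#include <vector>

int main()
{
  const int width = 4, height = 2;
  const int pass_stride = 8;       /* floats per pixel, hypothetical */
  const int pass_combined = 0;     /* RGBA, hypothetical offset */
  const int pass_sample_count = 4; /* hypothetical offset */

  std::vector<float> render_buffer(width * height * pass_stride, 0.0f);

  const uint32_t render_pixel_index = 1 * width + 2; /* pixel (x = 2, y = 1) */
  float *pixel = render_buffer.data() + (uint64_t)render_pixel_index * pass_stride;

  pixel[pass_combined + 0] += 0.5f; /* accumulate red channel of the combined pass */
  pixel[pass_sample_count] += 1.0f; /* bump the per-pixel sample counter */
  return 0;
}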
-#endif
+/* --------------------------------------------------------------------
+ * Adaptive sampling.
+ */
-ccl_device_inline void path_radiance_accum_emission(KernelGlobals *kg,
- PathRadiance *L,
- ccl_addr_space PathState *state,
- float3 throughput,
- float3 value)
+ccl_device_inline int kernel_accum_sample(INTEGRATOR_STATE_CONST_ARGS,
+ ccl_global float *ccl_restrict render_buffer,
+ int sample)
{
-#ifdef __SHADOW_TRICKS__
- if (state->flag & PATH_RAY_SHADOW_CATCHER) {
- return;
+ if (kernel_data.film.pass_sample_count == PASS_UNUSED) {
+ return sample;
}
-#endif
- float3 contribution = throughput * value;
-#ifdef __CLAMP_SAMPLE__
- path_radiance_clamp(kg, &contribution, state->bounce - 1);
-#endif
+ ccl_global float *buffer = kernel_accum_pixel_render_buffer(INTEGRATOR_STATE_PASS,
+ render_buffer);
-#ifdef __PASSES__
- if (L->use_light_pass) {
- if (state->bounce == 0)
- L->emission += contribution;
- else if (state->bounce == 1)
- L->direct_emission += contribution;
- else
- L->indirect += contribution;
- }
- else
-#endif
- {
- L->emission += contribution;
- }
+ return atomic_fetch_and_add_uint32((uint *)(buffer) + kernel_data.film.pass_sample_count, 1);
}
-ccl_device_inline void path_radiance_accum_ao(KernelGlobals *kg,
- PathRadiance *L,
- ccl_addr_space PathState *state,
- float3 throughput,
- float3 alpha,
- float3 bsdf,
- float3 ao)
+ccl_device void kernel_accum_adaptive_buffer(INTEGRATOR_STATE_CONST_ARGS,
+ const float3 contribution,
+ ccl_global float *ccl_restrict buffer)
{
-#ifdef __PASSES__
- /* Store AO pass. */
- if (L->use_light_pass && state->bounce == 0) {
- L->ao += alpha * throughput * ao;
- }
-#endif
-
-#ifdef __SHADOW_TRICKS__
- /* For shadow catcher, accumulate ratio. */
- if (state->flag & PATH_RAY_STORE_SHADOW_INFO) {
- float3 light = throughput * bsdf;
- L->path_total += light;
- L->path_total_shaded += ao * light;
+ /* Adaptive Sampling. Fill the additional buffer with the odd samples and calculate our stopping
+ * criteria. This is the heuristic from "A hierarchical automatic stopping condition for Monte
+ * Carlo global illumination" except that here it is applied per pixel and not in hierarchical
+ * tiles. */
- if (state->flag & PATH_RAY_SHADOW_CATCHER) {
- return;
- }
+ if (kernel_data.film.pass_adaptive_aux_buffer == PASS_UNUSED) {
+ return;
}
-#endif
-
- float3 contribution = throughput * bsdf * ao;
-#ifdef __PASSES__
- if (L->use_light_pass) {
- if (state->bounce == 0) {
- /* Directly visible lighting. */
- L->direct_diffuse += contribution;
- }
- else {
- /* Indirectly visible lighting after BSDF bounce. */
- L->indirect += contribution;
- }
- }
- else
-#endif
- {
- L->emission += contribution;
+ const int sample = INTEGRATOR_STATE(path, sample);
+ if (sample_is_even(kernel_data.integrator.sampling_pattern, sample)) {
+ kernel_write_pass_float4(
+ buffer + kernel_data.film.pass_adaptive_aux_buffer,
+ make_float4(contribution.x * 2.0f, contribution.y * 2.0f, contribution.z * 2.0f, 0.0f));
}
}
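kernel_accum_adaptive_buffer() writes every other sample, doubled, into an auxiliary pass; comparing that half-buffer estimate with the full combined pass gives a per-pixel noise estimate without storing individual samples. A standalone sketch of the idea with invented sample values:

#include <cstdio>

int main()
{
  /* Invented per-sample pixel values; in the kernel these are the contributions
   * accumulated into the combined pass. */
  const float samples[] = {0.9f, 1.1f, 1.3f, 0.7f, 1.0f, 1.0f};
  const int num_samples = 6;

  float combined = 0.0f; /* all samples, as in the combined pass */
  float aux = 0.0f;      /* every other sample, doubled, as in the auxiliary pass */
  for (int i = 0; i < num_samples; i++) {
    combined += samples[i];
    if ((i & 1) == 0) {
      aux += 2.0f * samples[i];
    }
  }

  /* Both estimate the same pixel mean; their difference drives the convergence
   * check in kernel_adaptive_sampling_convergence_check(). */
  std::printf("full mean %.3f, half-buffer mean %.3f\n",
              combined / num_samples, aux / num_samples);
  return 0;
}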
-ccl_device_inline void path_radiance_accum_total_ao(PathRadiance *L,
- ccl_addr_space PathState *state,
- float3 throughput,
- float3 bsdf)
-{
-#ifdef __SHADOW_TRICKS__
- if (state->flag & PATH_RAY_STORE_SHADOW_INFO) {
- L->path_total += throughput * bsdf;
- }
-#else
- (void)L;
- (void)state;
- (void)throughput;
- (void)bsdf;
-#endif
-}
+/* --------------------------------------------------------------------
+ * Shadow catcher.
+ */
+
+#ifdef __SHADOW_CATCHER__
-ccl_device_inline void path_radiance_accum_light(KernelGlobals *kg,
- PathRadiance *L,
- ccl_addr_space PathState *state,
- float3 throughput,
- BsdfEval *bsdf_eval,
- float3 shadow,
- float shadow_fac,
- bool is_lamp)
+/* Accumulate contribution to the Shadow Catcher pass.
+ *
+ * Returns true if the contribution is fully handled here and is not to be added to the other
+ * passes (like combined, adaptive sampling). */
+
+ccl_device bool kernel_accum_shadow_catcher(INTEGRATOR_STATE_CONST_ARGS,
+ const float3 contribution,
+ ccl_global float *ccl_restrict buffer)
{
-#ifdef __SHADOW_TRICKS__
- if (state->flag & PATH_RAY_STORE_SHADOW_INFO) {
- float3 light = throughput * bsdf_eval->sum_no_mis;
- L->path_total += light;
- L->path_total_shaded += shadow * light;
-
- if (state->flag & PATH_RAY_SHADOW_CATCHER) {
- return;
- }
+ if (!kernel_data.integrator.has_shadow_catcher) {
+ return false;
}
-#endif
- float3 shaded_throughput = throughput * shadow;
+ kernel_assert(kernel_data.film.pass_shadow_catcher != PASS_UNUSED);
+ kernel_assert(kernel_data.film.pass_shadow_catcher_matte != PASS_UNUSED);
-#ifdef __PASSES__
- if (L->use_light_pass) {
- /* Compute the clamping based on the total contribution.
- * The resulting scale is then be applied to all individual components. */
- float3 full_contribution = shaded_throughput * bsdf_eval_sum(bsdf_eval);
-# ifdef __CLAMP_SAMPLE__
- path_radiance_clamp_throughput(kg, &full_contribution, &shaded_throughput, state->bounce);
-# endif
-
- if (state->bounce == 0) {
- /* directly visible lighting */
- L->direct_diffuse += shaded_throughput * bsdf_eval->diffuse;
- L->direct_glossy += shaded_throughput * bsdf_eval->glossy;
- L->direct_transmission += shaded_throughput * bsdf_eval->transmission;
- L->direct_volume += shaded_throughput * bsdf_eval->volume;
-
- if (is_lamp) {
- L->shadow += shadow * shadow_fac;
- }
- }
- else {
- /* indirectly visible lighting after BSDF bounce */
- L->indirect += full_contribution;
- }
+ /* Matte pass. */
+ if (kernel_shadow_catcher_is_matte_path(INTEGRATOR_STATE_PASS)) {
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_shadow_catcher_matte, contribution);
+ /* NOTE: Accumulate to the combined pass and to the sample count pass as well, so that
+ * adaptive sampling is based on how noisy the combined pass is, as if there were no catchers
+ * in the scene. */
}
- else
-#endif
- {
- float3 contribution = shaded_throughput * bsdf_eval->diffuse;
- path_radiance_clamp(kg, &contribution, state->bounce);
- L->emission += contribution;
+
+ /* Shadow catcher pass. */
+ if (kernel_shadow_catcher_is_object_pass(INTEGRATOR_STATE_PASS)) {
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_shadow_catcher, contribution);
+ return true;
}
-}
-ccl_device_inline void path_radiance_accum_total_light(PathRadiance *L,
- ccl_addr_space PathState *state,
- float3 throughput,
- const BsdfEval *bsdf_eval)
-{
-#ifdef __SHADOW_TRICKS__
- if (state->flag & PATH_RAY_STORE_SHADOW_INFO) {
- L->path_total += throughput * bsdf_eval->sum_no_mis;
- }
-#else
- (void)L;
- (void)state;
- (void)throughput;
- (void)bsdf_eval;
-#endif
+ return false;
}
-ccl_device_inline void path_radiance_accum_background(KernelGlobals *kg,
- PathRadiance *L,
- ccl_addr_space PathState *state,
- float3 throughput,
- float3 value)
+ccl_device bool kernel_accum_shadow_catcher_transparent(INTEGRATOR_STATE_CONST_ARGS,
+ const float3 contribution,
+ const float transparent,
+ ccl_global float *ccl_restrict buffer)
{
+ if (!kernel_data.integrator.has_shadow_catcher) {
+ return false;
+ }
-#ifdef __SHADOW_TRICKS__
- if (state->flag & PATH_RAY_STORE_SHADOW_INFO) {
- L->path_total += throughput * value;
- L->path_total_shaded += throughput * value * L->shadow_transparency;
+ kernel_assert(kernel_data.film.pass_shadow_catcher != PASS_UNUSED);
+ kernel_assert(kernel_data.film.pass_shadow_catcher_matte != PASS_UNUSED);
- if (state->flag & PATH_RAY_SHADOW_CATCHER) {
- return;
- }
+ if (INTEGRATOR_STATE(path, flag) & PATH_RAY_SHADOW_CATCHER_BACKGROUND) {
+ return true;
}
-#endif
- float3 contribution = throughput * value;
-#ifdef __CLAMP_SAMPLE__
- path_radiance_clamp(kg, &contribution, state->bounce - 1);
-#endif
+ /* Matte pass. */
+ if (kernel_shadow_catcher_is_matte_path(INTEGRATOR_STATE_PASS)) {
+ kernel_write_pass_float4(
+ buffer + kernel_data.film.pass_shadow_catcher_matte,
+ make_float4(contribution.x, contribution.y, contribution.z, transparent));
+ /* NOTE: Accumulate to the combined pass and to the sample count pass as well, so that
+ * adaptive sampling is based on how noisy the combined pass is, as if there were no catchers
+ * in the scene. */
+ }
-#ifdef __PASSES__
- if (L->use_light_pass) {
- if (state->flag & PATH_RAY_TRANSPARENT_BACKGROUND)
- L->background += contribution;
- else if (state->bounce == 1)
- L->direct_emission += contribution;
- else
- L->indirect += contribution;
- }
- else
-#endif
- {
- L->emission += contribution;
+ /* Shadow catcher pass. */
+ if (kernel_shadow_catcher_is_object_pass(INTEGRATOR_STATE_PASS)) {
+ /* NOTE: The transparency of the shadow catcher pass is ignored. It is not needed for the
+ * calculation and the alpha channel of the pass contains numbers of samples contributed to a
+ * pixel of the pass. */
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_shadow_catcher, contribution);
+ return true;
}
-#ifdef __DENOISING_FEATURES__
- L->denoising_albedo += state->denoising_feature_weight * state->denoising_feature_throughput *
- value;
-#endif /* __DENOISING_FEATURES__ */
+ return false;
}
-ccl_device_inline void path_radiance_accum_transparent(PathRadiance *L,
- ccl_addr_space PathState *state,
- float3 throughput)
+ccl_device void kernel_accum_shadow_catcher_transparent_only(INTEGRATOR_STATE_CONST_ARGS,
+ const float transparent,
+ ccl_global float *ccl_restrict buffer)
{
- L->transparent += average(throughput);
-}
+ if (!kernel_data.integrator.has_shadow_catcher) {
+ return;
+ }
-#ifdef __SHADOW_TRICKS__
-ccl_device_inline void path_radiance_accum_shadowcatcher(PathRadiance *L,
- float3 throughput,
- float3 background)
-{
- L->shadow_throughput += average(throughput);
- L->shadow_background_color += throughput * background;
- L->has_shadow_catcher = 1;
-}
-#endif
+ kernel_assert(kernel_data.film.pass_shadow_catcher_matte != PASS_UNUSED);
-ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L)
-{
-#ifdef __PASSES__
- /* this division is a bit ugly, but means we only have to keep track of
- * only a single throughput further along the path, here we recover just
- * the indirect path that is not influenced by any particular BSDF type */
- if (L->use_light_pass) {
- L->direct_emission = safe_divide_color(L->direct_emission, L->state.direct);
- L->direct_diffuse += L->state.diffuse * L->direct_emission;
- L->direct_glossy += L->state.glossy * L->direct_emission;
- L->direct_transmission += L->state.transmission * L->direct_emission;
- L->direct_volume += L->state.volume * L->direct_emission;
-
- L->indirect = safe_divide_color(L->indirect, L->state.direct);
- L->indirect_diffuse += L->state.diffuse * L->indirect;
- L->indirect_glossy += L->state.glossy * L->indirect;
- L->indirect_transmission += L->state.transmission * L->indirect;
- L->indirect_volume += L->state.volume * L->indirect;
+ /* Matte pass. */
+ if (kernel_shadow_catcher_is_matte_path(INTEGRATOR_STATE_PASS)) {
+ kernel_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_matte + 3, transparent);
}
-#endif
}
-ccl_device_inline void path_radiance_reset_indirect(PathRadiance *L)
-{
-#ifdef __PASSES__
- if (L->use_light_pass) {
- L->state.diffuse = zero_float3();
- L->state.glossy = zero_float3();
- L->state.transmission = zero_float3();
- L->state.volume = zero_float3();
+#endif /* __SHADOW_CATCHER__ */
+
+/* --------------------------------------------------------------------
+ * Render passes.
+ */
- L->direct_emission = zero_float3();
- L->indirect = zero_float3();
+/* Write combined pass. */
+ccl_device_inline void kernel_accum_combined_pass(INTEGRATOR_STATE_CONST_ARGS,
+ const float3 contribution,
+ ccl_global float *ccl_restrict buffer)
+{
+#ifdef __SHADOW_CATCHER__
+ if (kernel_accum_shadow_catcher(INTEGRATOR_STATE_PASS, contribution, buffer)) {
+ return;
}
#endif
+
+ if (kernel_data.film.light_pass_flag & PASSMASK(COMBINED)) {
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_combined, contribution);
+ }
+
+ kernel_accum_adaptive_buffer(INTEGRATOR_STATE_PASS, contribution, buffer);
}
-ccl_device_inline void path_radiance_copy_indirect(PathRadiance *L, const PathRadiance *L_src)
+/* Write combined pass with transparency. */
+ccl_device_inline void kernel_accum_combined_transparent_pass(INTEGRATOR_STATE_CONST_ARGS,
+ const float3 contribution,
+ const float transparent,
+ ccl_global float *ccl_restrict
+ buffer)
{
-#ifdef __PASSES__
- if (L->use_light_pass) {
- L->state = L_src->state;
-
- L->direct_emission = L_src->direct_emission;
- L->indirect = L_src->indirect;
+#ifdef __SHADOW_CATCHER__
+ if (kernel_accum_shadow_catcher_transparent(
+ INTEGRATOR_STATE_PASS, contribution, transparent, buffer)) {
+ return;
}
#endif
+
+ if (kernel_data.film.light_pass_flag & PASSMASK(COMBINED)) {
+ kernel_write_pass_float4(
+ buffer + kernel_data.film.pass_combined,
+ make_float4(contribution.x, contribution.y, contribution.z, transparent));
+ }
+
+ kernel_accum_adaptive_buffer(INTEGRATOR_STATE_PASS, contribution, buffer);
}
-#ifdef __SHADOW_TRICKS__
-ccl_device_inline void path_radiance_sum_shadowcatcher(KernelGlobals *kg,
- PathRadiance *L,
- float3 *L_sum,
- float *alpha)
+/* Write background or emission to appropriate pass. */
+ccl_device_inline void kernel_accum_emission_or_background_pass(INTEGRATOR_STATE_CONST_ARGS,
+ float3 contribution,
+ ccl_global float *ccl_restrict
+ buffer,
+ const int pass)
{
- /* Calculate current shadow of the path. */
- float path_total = average(L->path_total);
- float shadow;
+ if (!(kernel_data.film.light_pass_flag & PASS_ANY)) {
+ return;
+ }
- if (UNLIKELY(!isfinite_safe(path_total))) {
-# ifdef __KERNEL_DEBUG_NAN__
- kernel_assert(!"Non-finite total radiance along the path");
-# endif
- shadow = 0.0f;
+#ifdef __PASSES__
+ const int path_flag = INTEGRATOR_STATE(path, flag);
+ int pass_offset = PASS_UNUSED;
+
+ /* Denoising albedo. */
+# ifdef __DENOISING_FEATURES__
+ if (path_flag & PATH_RAY_DENOISING_FEATURES) {
+ if (kernel_data.film.pass_denoising_albedo != PASS_UNUSED) {
+ const float3 denoising_feature_throughput = INTEGRATOR_STATE(path,
+ denoising_feature_throughput);
+ const float3 denoising_albedo = denoising_feature_throughput * contribution;
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_albedo, denoising_albedo);
+ }
}
- else if (path_total == 0.0f) {
- shadow = L->shadow_transparency;
+# endif /* __DENOISING_FEATURES__ */
+
+ if (!(path_flag & PATH_RAY_ANY_PASS)) {
+ /* Directly visible, write to emission or background pass. */
+ pass_offset = pass;
+ }
+ else if (path_flag & (PATH_RAY_REFLECT_PASS | PATH_RAY_TRANSMISSION_PASS)) {
+ /* Indirectly visible through reflection or transmission. */
+ const int glossy_pass_offset = (path_flag & PATH_RAY_REFLECT_PASS) ?
+ ((INTEGRATOR_STATE(path, bounce) == 1) ?
+ kernel_data.film.pass_glossy_direct :
+ kernel_data.film.pass_glossy_indirect) :
+ ((INTEGRATOR_STATE(path, bounce) == 1) ?
+ kernel_data.film.pass_transmission_direct :
+ kernel_data.film.pass_transmission_indirect);
+
+ if (glossy_pass_offset != PASS_UNUSED) {
+ /* Glossy is a subset of the throughput, reconstruct it here using the
+ * diffuse-glossy ratio. */
+ const float3 ratio = INTEGRATOR_STATE(path, diffuse_glossy_ratio);
+ const float3 glossy_contribution = (one_float3() - ratio) * contribution;
+ kernel_write_pass_float3(buffer + glossy_pass_offset, glossy_contribution);
+ }
+
+ /* Reconstruct diffuse subset of throughput. */
+ pass_offset = (INTEGRATOR_STATE(path, bounce) == 1) ? kernel_data.film.pass_diffuse_direct :
+ kernel_data.film.pass_diffuse_indirect;
+ if (pass_offset != PASS_UNUSED) {
+ contribution *= INTEGRATOR_STATE(path, diffuse_glossy_ratio);
+ }
}
- else {
- float path_total_shaded = average(L->path_total_shaded);
- shadow = path_total_shaded / path_total;
+ else if (path_flag & PATH_RAY_VOLUME_PASS) {
+ /* Indirectly visible through volume. */
+ pass_offset = (INTEGRATOR_STATE(path, bounce) == 1) ? kernel_data.film.pass_volume_direct :
+ kernel_data.film.pass_volume_indirect;
}
- /* Calculate final light sum and transparency for shadow catcher object. */
- if (kernel_data.background.transparent) {
- *alpha -= L->shadow_throughput * shadow;
- }
- else {
- L->shadow_background_color *= shadow;
- *L_sum += L->shadow_background_color;
+ /* Single write call for GPU coherence. */
+ if (pass_offset != PASS_UNUSED) {
+ kernel_write_pass_float3(buffer + pass_offset, contribution);
}
+#endif /* __PASSES__ */
}
-#endif
-ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg,
- PathRadiance *L,
- float *alpha)
+/* Write light contribution to render buffer. */
+ccl_device_inline void kernel_accum_light(INTEGRATOR_STATE_CONST_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
{
- float3 L_sum;
- /* Light Passes are used */
+ /* The throughput for shadow paths already contains the light shader evaluation. */
+ float3 contribution = INTEGRATOR_STATE(shadow_path, throughput);
+ kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(shadow_path, bounce) - 1);
+
+ ccl_global float *buffer = kernel_accum_pixel_render_buffer(INTEGRATOR_STATE_PASS,
+ render_buffer);
+
+ kernel_accum_combined_pass(INTEGRATOR_STATE_PASS, contribution, buffer);
+
#ifdef __PASSES__
- float3 L_direct, L_indirect;
- if (L->use_light_pass) {
- path_radiance_sum_indirect(L);
-
- L_direct = L->direct_diffuse + L->direct_glossy + L->direct_transmission + L->direct_volume +
- L->emission;
- L_indirect = L->indirect_diffuse + L->indirect_glossy + L->indirect_transmission +
- L->indirect_volume;
-
- if (!kernel_data.background.transparent)
- L_direct += L->background;
-
- L_sum = L_direct + L_indirect;
- float sum = fabsf((L_sum).x) + fabsf((L_sum).y) + fabsf((L_sum).z);
-
- /* Reject invalid value */
- if (!isfinite_safe(sum)) {
-# ifdef __KERNEL_DEBUG_NAN__
- kernel_assert(!"Non-finite sum in path_radiance_clamp_and_sum!");
-# endif
- L_sum = zero_float3();
-
- L->direct_diffuse = zero_float3();
- L->direct_glossy = zero_float3();
- L->direct_transmission = zero_float3();
- L->direct_volume = zero_float3();
-
- L->indirect_diffuse = zero_float3();
- L->indirect_glossy = zero_float3();
- L->indirect_transmission = zero_float3();
- L->indirect_volume = zero_float3();
-
- L->emission = zero_float3();
+ if (kernel_data.film.light_pass_flag & PASS_ANY) {
+ const int path_flag = INTEGRATOR_STATE(shadow_path, flag);
+ int pass_offset = PASS_UNUSED;
+
+ if (path_flag & (PATH_RAY_REFLECT_PASS | PATH_RAY_TRANSMISSION_PASS)) {
+ /* Indirectly visible through reflection or transmission. */
+ const int glossy_pass_offset = (path_flag & PATH_RAY_REFLECT_PASS) ?
+ ((INTEGRATOR_STATE(shadow_path, bounce) == 0) ?
+ kernel_data.film.pass_glossy_direct :
+ kernel_data.film.pass_glossy_indirect) :
+ ((INTEGRATOR_STATE(shadow_path, bounce) == 0) ?
+ kernel_data.film.pass_transmission_direct :
+ kernel_data.film.pass_transmission_indirect);
+
+ if (glossy_pass_offset != PASS_UNUSED) {
+ /* Glossy is a subset of the throughput, reconstruct it here using the
+ * diffuse-glossy ratio. */
+ const float3 ratio = INTEGRATOR_STATE(shadow_path, diffuse_glossy_ratio);
+ const float3 glossy_contribution = (one_float3() - ratio) * contribution;
+ kernel_write_pass_float3(buffer + glossy_pass_offset, glossy_contribution);
+ }
+
+ /* Reconstruct diffuse subset of throughput. */
+ pass_offset = (INTEGRATOR_STATE(shadow_path, bounce) == 0) ?
+ kernel_data.film.pass_diffuse_direct :
+ kernel_data.film.pass_diffuse_indirect;
+ if (pass_offset != PASS_UNUSED) {
+ contribution *= INTEGRATOR_STATE(shadow_path, diffuse_glossy_ratio);
+ }
+ }
+ else if (path_flag & PATH_RAY_VOLUME_PASS) {
+ /* Indirectly visible through volume. */
+ pass_offset = (INTEGRATOR_STATE(shadow_path, bounce) == 0) ?
+ kernel_data.film.pass_volume_direct :
+ kernel_data.film.pass_volume_indirect;
}
- }
- /* No Light Passes */
- else
-#endif
- {
- L_sum = L->emission;
+ /* Single write call for GPU coherence. */
+ if (pass_offset != PASS_UNUSED) {
+ kernel_write_pass_float3(buffer + pass_offset, contribution);
+ }
- /* Reject invalid value */
- float sum = fabsf((L_sum).x) + fabsf((L_sum).y) + fabsf((L_sum).z);
- if (!isfinite_safe(sum)) {
-#ifdef __KERNEL_DEBUG_NAN__
- kernel_assert(!"Non-finite final sum in path_radiance_clamp_and_sum!");
-#endif
- L_sum = zero_float3();
+ /* Write shadow pass. */
+ if (kernel_data.film.pass_shadow != PASS_UNUSED && (path_flag & PATH_RAY_SHADOW_FOR_LIGHT) &&
+ (path_flag & PATH_RAY_CAMERA)) {
+ const float3 unshadowed_throughput = INTEGRATOR_STATE(shadow_path, unshadowed_throughput);
+ const float3 shadowed_throughput = INTEGRATOR_STATE(shadow_path, throughput);
+ const float3 shadow = safe_divide_float3_float3(shadowed_throughput, unshadowed_throughput) *
+ kernel_data.film.pass_shadow_scale;
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_shadow, shadow);
}
}
+#endif
+}
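The shadow pass write at the end of kernel_accum_light() stores the ratio of shadowed to unshadowed throughput scaled by pass_shadow_scale, not an absolute amount of light. A minimal sketch of that ratio with invented throughput values:

#include <cstdio>

int main()
{
  /* Invented values: unshadowed is the light-sample throughput assuming no
   * occlusion, shadowed is the same after tracing the shadow ray. */
  const float unshadowed[3] = {0.8f, 0.6f, 0.4f};
  const float shadowed[3] = {0.2f, 0.15f, 0.1f};
  const float pass_shadow_scale = 1.0f; /* stand-in for kernel_data.film.pass_shadow_scale */

  float shadow[3];
  for (int i = 0; i < 3; i++) {
    /* Guard the division, like safe_divide_float3_float3 does. */
    shadow[i] = (unshadowed[i] != 0.0f) ? (shadowed[i] / unshadowed[i]) * pass_shadow_scale :
                                          0.0f;
  }
  std::printf("shadow pass value: %.2f %.2f %.2f\n", shadow[0], shadow[1], shadow[2]);
  return 0;
}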
- /* Compute alpha. */
- *alpha = 1.0f - L->transparent;
+/* Write transparency to render buffer.
+ *
+ * Note that we accumulate transparency = 1 - alpha in the render buffer.
+ * Otherwise we'd have to write alpha on path termination, which happens
+ * in many places. */
+ccl_device_inline void kernel_accum_transparent(INTEGRATOR_STATE_CONST_ARGS,
+ const float transparent,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ ccl_global float *buffer = kernel_accum_pixel_render_buffer(INTEGRATOR_STATE_PASS,
+ render_buffer);
- /* Add shadow catcher contributions. */
-#ifdef __SHADOW_TRICKS__
- if (L->has_shadow_catcher) {
- path_radiance_sum_shadowcatcher(kg, L, &L_sum, alpha);
+ if (kernel_data.film.light_pass_flag & PASSMASK(COMBINED)) {
+ kernel_write_pass_float(buffer + kernel_data.film.pass_combined + 3, transparent);
}
-#endif /* __SHADOW_TRICKS__ */
- return L_sum;
+ kernel_accum_shadow_catcher_transparent_only(INTEGRATOR_STATE_PASS, transparent, buffer);
}
-ccl_device_inline void path_radiance_split_denoising(KernelGlobals *kg,
- PathRadiance *L,
- float3 *noisy,
- float3 *clean)
+/* Write background contribution to render buffer.
+ *
+ * Includes transparency, matching kernel_accum_transparent. */
+ccl_device_inline void kernel_accum_background(INTEGRATOR_STATE_CONST_ARGS,
+ const float3 L,
+ const float transparent,
+ const bool is_transparent_background_ray,
+ ccl_global float *ccl_restrict render_buffer)
{
-#ifdef __PASSES__
- kernel_assert(L->use_light_pass);
-
- *clean = L->emission + L->background;
- *noisy = L->direct_volume + L->indirect_volume;
-
-# define ADD_COMPONENT(flag, component) \
- if (kernel_data.film.denoising_flags & flag) \
- *clean += component; \
- else \
- *noisy += component;
-
- ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_DIR, L->direct_diffuse);
- ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_IND, L->indirect_diffuse);
- ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_DIR, L->direct_glossy);
- ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_IND, L->indirect_glossy);
- ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_DIR, L->direct_transmission);
- ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_IND, L->indirect_transmission);
-# undef ADD_COMPONENT
-#else
- *noisy = L->emission;
- *clean = zero_float3();
-#endif
+ float3 contribution = INTEGRATOR_STATE(path, throughput) * L;
+ kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(path, bounce) - 1);
-#ifdef __SHADOW_TRICKS__
- if (L->has_shadow_catcher) {
- *noisy += L->shadow_background_color;
- }
-#endif
+ ccl_global float *buffer = kernel_accum_pixel_render_buffer(INTEGRATOR_STATE_PASS,
+ render_buffer);
- *noisy = ensure_finite3(*noisy);
- *clean = ensure_finite3(*clean);
+ if (is_transparent_background_ray) {
+ kernel_accum_transparent(INTEGRATOR_STATE_PASS, transparent, render_buffer);
+ }
+ else {
+ kernel_accum_combined_transparent_pass(
+ INTEGRATOR_STATE_PASS, contribution, transparent, buffer);
+ }
+ kernel_accum_emission_or_background_pass(
+ INTEGRATOR_STATE_PASS, contribution, buffer, kernel_data.film.pass_background);
}
-ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance *L_sample)
+/* Write emission to render buffer. */
+ccl_device_inline void kernel_accum_emission(INTEGRATOR_STATE_CONST_ARGS,
+ const float3 throughput,
+ const float3 L,
+ ccl_global float *ccl_restrict render_buffer)
{
-#ifdef __SPLIT_KERNEL__
-# define safe_float3_add(f, v) \
- do { \
- ccl_global float *p = (ccl_global float *)(&(f)); \
- atomic_add_and_fetch_float(p + 0, (v).x); \
- atomic_add_and_fetch_float(p + 1, (v).y); \
- atomic_add_and_fetch_float(p + 2, (v).z); \
- } while (0)
-# define safe_float_add(f, v) atomic_add_and_fetch_float(&(f), (v))
-#else
-# define safe_float3_add(f, v) (f) += (v)
-# define safe_float_add(f, v) (f) += (v)
-#endif /* __SPLIT_KERNEL__ */
+ float3 contribution = throughput * L;
+ kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(path, bounce) - 1);
-#ifdef __PASSES__
- safe_float3_add(L->direct_diffuse, L_sample->direct_diffuse);
- safe_float3_add(L->direct_glossy, L_sample->direct_glossy);
- safe_float3_add(L->direct_transmission, L_sample->direct_transmission);
- safe_float3_add(L->direct_volume, L_sample->direct_volume);
-
- safe_float3_add(L->indirect_diffuse, L_sample->indirect_diffuse);
- safe_float3_add(L->indirect_glossy, L_sample->indirect_glossy);
- safe_float3_add(L->indirect_transmission, L_sample->indirect_transmission);
- safe_float3_add(L->indirect_volume, L_sample->indirect_volume);
-
- safe_float3_add(L->background, L_sample->background);
- safe_float3_add(L->ao, L_sample->ao);
- safe_float3_add(L->shadow, L_sample->shadow);
- safe_float_add(L->mist, L_sample->mist);
-#endif /* __PASSES__ */
- safe_float3_add(L->emission, L_sample->emission);
+ ccl_global float *buffer = kernel_accum_pixel_render_buffer(INTEGRATOR_STATE_PASS,
+ render_buffer);
-#undef safe_float_add
-#undef safe_float3_add
+ kernel_accum_combined_pass(INTEGRATOR_STATE_PASS, contribution, buffer);
+ kernel_accum_emission_or_background_pass(
+ INTEGRATOR_STATE_PASS, contribution, buffer, kernel_data.film.pass_emission);
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_adaptive_sampling.h b/intern/cycles/kernel/kernel_adaptive_sampling.h
index 98b7bf7e7dc..7d71907effe 100644
--- a/intern/cycles/kernel/kernel_adaptive_sampling.h
+++ b/intern/cycles/kernel/kernel_adaptive_sampling.h
@@ -14,226 +14,146 @@
* limitations under the License.
*/
-#ifndef __KERNEL_ADAPTIVE_SAMPLING_H__
-#define __KERNEL_ADAPTIVE_SAMPLING_H__
+#pragma once
+
+#include "kernel/kernel_write_passes.h"
CCL_NAMESPACE_BEGIN
-/* Determines whether to continue sampling a given pixel or if it has sufficiently converged. */
+/* Check whether the pixel has converged and should not be sampled anymore. */
-ccl_device void kernel_do_adaptive_stopping(KernelGlobals *kg,
- ccl_global float *buffer,
- int sample)
+ccl_device_forceinline bool kernel_need_sample_pixel(INTEGRATOR_STATE_CONST_ARGS,
+ ccl_global float *render_buffer)
{
- /* TODO Stefan: Is this better in linear, sRGB or something else? */
- float4 I = *((ccl_global float4 *)buffer);
- float4 A = *(ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer);
- /* The per pixel error as seen in section 2.1 of
- * "A hierarchical automatic stopping condition for Monte Carlo global illumination"
- * A small epsilon is added to the divisor to prevent division by zero. */
- float error = (fabsf(I.x - A.x) + fabsf(I.y - A.y) + fabsf(I.z - A.z)) /
- (sample * 0.0001f + sqrtf(I.x + I.y + I.z));
- if (error < kernel_data.integrator.adaptive_threshold * (float)sample) {
- /* Set the fourth component to non-zero value to indicate that this pixel has converged. */
- buffer[kernel_data.film.pass_adaptive_aux_buffer + 3] += 1.0f;
+ if (kernel_data.film.pass_adaptive_aux_buffer == PASS_UNUSED) {
+ return true;
}
-}
-
-/* Adjust the values of an adaptively sampled pixel. */
-
-ccl_device void kernel_adaptive_post_adjust(KernelGlobals *kg,
- ccl_global float *buffer,
- float sample_multiplier)
-{
- *(ccl_global float4 *)(buffer) *= sample_multiplier;
- /* Scale the aux pass too, this is necessary for progressive rendering to work properly. */
- kernel_assert(kernel_data.film.pass_adaptive_aux_buffer);
- *(ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer) *= sample_multiplier;
+ const uint32_t render_pixel_index = INTEGRATOR_STATE(path, render_pixel_index);
+ const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
+ kernel_data.film.pass_stride;
+ ccl_global float *buffer = render_buffer + render_buffer_offset;
-#ifdef __PASSES__
- int flag = kernel_data.film.pass_flag;
-
- if (flag & PASSMASK(NORMAL))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_normal) *= sample_multiplier;
+ const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3;
+ return buffer[aux_w_offset] == 0.0f;
+}
- if (flag & PASSMASK(UV))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_uv) *= sample_multiplier;
+/* Determines whether to continue sampling a given pixel or if it has sufficiently converged. */
- if (flag & PASSMASK(MOTION)) {
- *(ccl_global float4 *)(buffer + kernel_data.film.pass_motion) *= sample_multiplier;
- *(ccl_global float *)(buffer + kernel_data.film.pass_motion_weight) *= sample_multiplier;
+ccl_device bool kernel_adaptive_sampling_convergence_check(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int y,
+ float threshold,
+ bool reset,
+ int offset,
+ int stride)
+{
+ kernel_assert(kernel_data.film.pass_adaptive_aux_buffer != PASS_UNUSED);
+ kernel_assert(kernel_data.film.pass_sample_count != PASS_UNUSED);
+
+ const int render_pixel_index = offset + x + y * stride;
+ ccl_global float *buffer = render_buffer +
+ (uint64_t)render_pixel_index * kernel_data.film.pass_stride;
+
+ /* TODO(Stefan): Is this better in linear, sRGB or something else? */
+
+ const float4 A = kernel_read_pass_float4(buffer + kernel_data.film.pass_adaptive_aux_buffer);
+ if (!reset && A.w != 0.0f) {
+ /* If the pixel was considered converged, its state will not change in this kernel. Early
+ * out before doing any math.
+ *
+ * TODO(sergey): On a GPU it might be better to keep thread alive for better coherency? */
+ return true;
}
- if (kernel_data.film.use_light_pass) {
- int light_flag = kernel_data.film.light_pass_flag;
-
- if (light_flag & PASSMASK(MIST))
- *(ccl_global float *)(buffer + kernel_data.film.pass_mist) *= sample_multiplier;
-
- /* Shadow pass omitted on purpose. It has its own scale parameter. */
-
- if (light_flag & PASSMASK(DIFFUSE_INDIRECT))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_indirect) *= sample_multiplier;
- if (light_flag & PASSMASK(GLOSSY_INDIRECT))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_indirect) *= sample_multiplier;
- if (light_flag & PASSMASK(TRANSMISSION_INDIRECT))
- *(ccl_global float3 *)(buffer +
- kernel_data.film.pass_transmission_indirect) *= sample_multiplier;
- if (light_flag & PASSMASK(VOLUME_INDIRECT))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_volume_indirect) *= sample_multiplier;
- if (light_flag & PASSMASK(DIFFUSE_DIRECT))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_direct) *= sample_multiplier;
- if (light_flag & PASSMASK(GLOSSY_DIRECT))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_direct) *= sample_multiplier;
- if (light_flag & PASSMASK(TRANSMISSION_DIRECT))
- *(ccl_global float3 *)(buffer +
- kernel_data.film.pass_transmission_direct) *= sample_multiplier;
- if (light_flag & PASSMASK(VOLUME_DIRECT))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_volume_direct) *= sample_multiplier;
-
- if (light_flag & PASSMASK(EMISSION))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_emission) *= sample_multiplier;
- if (light_flag & PASSMASK(BACKGROUND))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_background) *= sample_multiplier;
- if (light_flag & PASSMASK(AO))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_ao) *= sample_multiplier;
-
- if (light_flag & PASSMASK(DIFFUSE_COLOR))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_color) *= sample_multiplier;
- if (light_flag & PASSMASK(GLOSSY_COLOR))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_color) *= sample_multiplier;
- if (light_flag & PASSMASK(TRANSMISSION_COLOR))
- *(ccl_global float3 *)(buffer +
- kernel_data.film.pass_transmission_color) *= sample_multiplier;
- }
-#endif
-
-#ifdef __DENOISING_FEATURES__
-
-# define scale_float3_variance(buffer, offset, scale) \
- *(buffer + offset) *= scale; \
- *(buffer + offset + 1) *= scale; \
- *(buffer + offset + 2) *= scale; \
- *(buffer + offset + 3) *= scale * scale; \
- *(buffer + offset + 4) *= scale * scale; \
- *(buffer + offset + 5) *= scale * scale;
-
-# define scale_shadow_variance(buffer, offset, scale) \
- *(buffer + offset) *= scale; \
- *(buffer + offset + 1) *= scale; \
- *(buffer + offset + 2) *= scale * scale;
-
- if (kernel_data.film.pass_denoising_data) {
- scale_shadow_variance(
- buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_SHADOW_A, sample_multiplier);
- scale_shadow_variance(
- buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_SHADOW_B, sample_multiplier);
- if (kernel_data.film.pass_denoising_clean) {
- scale_float3_variance(
- buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, sample_multiplier);
- *(buffer + kernel_data.film.pass_denoising_clean) *= sample_multiplier;
- *(buffer + kernel_data.film.pass_denoising_clean + 1) *= sample_multiplier;
- *(buffer + kernel_data.film.pass_denoising_clean + 2) *= sample_multiplier;
- }
- else {
- scale_float3_variance(
- buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, sample_multiplier);
- }
- scale_float3_variance(
- buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL, sample_multiplier);
- scale_float3_variance(
- buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO, sample_multiplier);
- *(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH) *= sample_multiplier;
- *(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH +
- 1) *= sample_multiplier * sample_multiplier;
- }
-#endif /* __DENOISING_FEATURES__ */
-
- /* Cryptomatte. */
- if (kernel_data.film.cryptomatte_passes) {
- int num_slots = 0;
- num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) ? 1 : 0;
- num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) ? 1 : 0;
- num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_ASSET) ? 1 : 0;
- num_slots = num_slots * 2 * kernel_data.film.cryptomatte_depth;
- ccl_global float2 *id_buffer = (ccl_global float2 *)(buffer +
- kernel_data.film.pass_cryptomatte);
- for (int slot = 0; slot < num_slots; slot++) {
- id_buffer[slot].y *= sample_multiplier;
- }
- }
+ const float4 I = kernel_read_pass_float4(buffer + kernel_data.film.pass_combined);
- /* AOVs. */
- for (int i = 0; i < kernel_data.film.pass_aov_value_num; i++) {
- *(buffer + kernel_data.film.pass_aov_value + i) *= sample_multiplier;
- }
- for (int i = 0; i < kernel_data.film.pass_aov_color_num; i++) {
- *((ccl_global float4 *)(buffer + kernel_data.film.pass_aov_color) + i) *= sample_multiplier;
- }
+ const float sample = __float_as_uint(buffer[kernel_data.film.pass_sample_count]);
+ const float inv_sample = 1.0f / sample;
+
+ /* The per pixel error as seen in section 2.1 of
+ * "A hierarchical automatic stopping condition for Monte Carlo global illumination" */
+ const float error_difference = (fabsf(I.x - A.x) + fabsf(I.y - A.y) + fabsf(I.z - A.z)) *
+ inv_sample;
+ const float error_normalize = sqrtf((I.x + I.y + I.z) * inv_sample);
+ /* A small epsilon is added to the divisor to prevent division by zero. */
+ const float error = error_difference / (0.0001f + error_normalize);
+ const bool did_converge = (error < threshold);
+
+ const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3;
+ buffer[aux_w_offset] = did_converge;
+
+ return did_converge;
}
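The convergence check above is the per-pixel error from the cited stopping-condition paper: the absolute difference between the combined pass I and the doubled half-buffer A, normalized by the square root of the pixel brightness. A standalone sketch of just that arithmetic, with invented pass values:

#include <cmath>
#include <cstdio>

/* I* are accumulated combined-pass values, A* the auxiliary half-buffer values,
 * 'sample' the per-pixel sample count; all inputs below are invented. */
static bool pixel_converged(const float Ix, const float Iy, const float Iz,
                            const float Ax, const float Ay, const float Az,
                            const int sample, const float threshold)
{
  const float inv_sample = 1.0f / sample;
  const float error_difference = (std::fabs(Ix - Ax) + std::fabs(Iy - Ay) + std::fabs(Iz - Az)) *
                                 inv_sample;
  const float error_normalize = std::sqrt((Ix + Iy + Iz) * inv_sample);
  const float error = error_difference / (0.0001f + error_normalize);
  return error < threshold;
}

int main()
{
  /* 64 samples of a mid-grey pixel; the half-buffer disagrees only slightly. */
  std::printf("converged: %d\n",
              pixel_converged(32.0f, 32.0f, 32.0f, 32.5f, 31.6f, 32.2f, 64, 0.02f));
  return 0;
}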
/* This is a simple box filter in two passes.
* When a pixel demands more adaptive samples, let its neighboring pixels draw more samples too. */
-ccl_device bool kernel_do_adaptive_filter_x(KernelGlobals *kg, int y, ccl_global WorkTile *tile)
+ccl_device void kernel_adaptive_sampling_filter_x(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int y,
+ int start_x,
+ int width,
+ int offset,
+ int stride)
{
- bool any = false;
+ kernel_assert(kernel_data.film.pass_adaptive_aux_buffer != PASS_UNUSED);
+
bool prev = false;
- for (int x = tile->x; x < tile->x + tile->w; ++x) {
- int index = tile->offset + x + y * tile->stride;
- ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
- ccl_global float4 *aux = (ccl_global float4 *)(buffer +
- kernel_data.film.pass_adaptive_aux_buffer);
- if ((*aux).w == 0.0f) {
- any = true;
- if (x > tile->x && !prev) {
+ for (int x = start_x; x < start_x + width; ++x) {
+ int index = offset + x + y * stride;
+ ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride;
+ const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3;
+
+ if (buffer[aux_w_offset] == 0.0f) {
+ if (x > start_x && !prev) {
index = index - 1;
- buffer = tile->buffer + index * kernel_data.film.pass_stride;
- aux = (ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer);
- (*aux).w = 0.0f;
+ buffer = render_buffer + index * kernel_data.film.pass_stride;
+ buffer[aux_w_offset] = 0.0f;
}
prev = true;
}
else {
if (prev) {
- (*aux).w = 0.0f;
+ buffer[aux_w_offset] = 0.0f;
}
prev = false;
}
}
- return any;
}
-ccl_device bool kernel_do_adaptive_filter_y(KernelGlobals *kg, int x, ccl_global WorkTile *tile)
+ccl_device void kernel_adaptive_sampling_filter_y(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int start_y,
+ int height,
+ int offset,
+ int stride)
{
+ kernel_assert(kernel_data.film.pass_adaptive_aux_buffer != PASS_UNUSED);
+
bool prev = false;
- bool any = false;
- for (int y = tile->y; y < tile->y + tile->h; ++y) {
- int index = tile->offset + x + y * tile->stride;
- ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
- ccl_global float4 *aux = (ccl_global float4 *)(buffer +
- kernel_data.film.pass_adaptive_aux_buffer);
- if ((*aux).w == 0.0f) {
- any = true;
- if (y > tile->y && !prev) {
- index = index - tile->stride;
- buffer = tile->buffer + index * kernel_data.film.pass_stride;
- aux = (ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer);
- (*aux).w = 0.0f;
+ for (int y = start_y; y < start_y + height; ++y) {
+ int index = offset + x + y * stride;
+ ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride;
+ const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3;
+
+ if (buffer[aux_w_offset] == 0.0f) {
+ if (y > start_y && !prev) {
+ index = index - stride;
+ buffer = render_buffer + index * kernel_data.film.pass_stride;
+ buffer[aux_w_offset] = 0.0f;
}
prev = true;
}
else {
if (prev) {
- (*aux).w = 0.0f;
+ buffer[aux_w_offset] = 0.0f;
}
prev = false;
}
}
- return any;
}
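Taken together, the two filter kernels are a small dilation: whenever an unconverged pixel borders a converged one, the converged neighbor is flagged for more samples too, first along x and then along y. A 1D standalone sketch of the same scan (the flag array is invented; 1 means the aux .w was non-zero, i.e. converged):

#include <cstdio>

int main()
{
  float converged[8] = {1, 1, 1, 0, 1, 1, 1, 1};

  bool prev = false;
  for (int x = 0; x < 8; x++) {
    if (converged[x] == 0.0f) {
      if (x > 0 && !prev) {
        converged[x - 1] = 0.0f; /* un-converge the neighbor to the left */
      }
      prev = true;
    }
    else {
      if (prev) {
        converged[x] = 0.0f; /* un-converge the neighbor to the right */
      }
      prev = false;
    }
  }

  for (int x = 0; x < 8; x++) {
    std::printf("%d ", (int)converged[x]);
  }
  std::printf("\n"); /* prints: 1 1 0 0 0 1 1 1 */
  return 0;
}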
CCL_NAMESPACE_END
-
-#endif /* __KERNEL_ADAPTIVE_SAMPLING_H__ */
diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h
index 7da890b908d..e025bcd6674 100644
--- a/intern/cycles/kernel/kernel_bake.h
+++ b/intern/cycles/kernel/kernel_bake.h
@@ -14,502 +14,62 @@
* limitations under the License.
*/
-CCL_NAMESPACE_BEGIN
-
-#ifdef __BAKING__
-
-ccl_device_noinline void compute_light_pass(
- KernelGlobals *kg, ShaderData *sd, PathRadiance *L, uint rng_hash, int pass_filter, int sample)
-{
- kernel_assert(kernel_data.film.use_light_pass);
-
- float3 throughput = one_float3();
-
- /* Emission and indirect shader data memory used by various functions. */
- ShaderDataTinyStorage emission_sd_storage;
- ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
- ShaderData indirect_sd;
-
- /* Init radiance. */
- path_radiance_init(kg, L);
-
- /* Init path state. */
- PathState state;
- path_state_init(kg, emission_sd, &state, rng_hash, sample, NULL);
-
- /* Evaluate surface shader. */
- shader_eval_surface(kg, sd, &state, NULL, state.flag);
-
- /* TODO: disable more closures we don't need besides transparent. */
- shader_bsdf_disable_transparency(kg, sd);
-
- /* Init ray. */
- Ray ray;
- ray.P = sd->P + sd->Ng;
- ray.D = -sd->Ng;
- ray.t = FLT_MAX;
-# ifdef __CAMERA_MOTION__
- ray.time = 0.5f;
-# endif
-
-# ifdef __BRANCHED_PATH__
- if (!kernel_data.integrator.branched) {
- /* regular path tracer */
-# endif
-
- /* sample ambient occlusion */
- if (pass_filter & BAKE_FILTER_AO) {
- kernel_path_ao(kg, sd, emission_sd, L, &state, throughput, shader_bsdf_alpha(kg, sd));
- }
-
- /* sample emission */
- if ((pass_filter & BAKE_FILTER_EMISSION) && (sd->flag & SD_EMISSION)) {
- float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf);
- path_radiance_accum_emission(kg, L, &state, throughput, emission);
- }
-
- bool is_sss_sample = false;
-
-# ifdef __SUBSURFACE__
- /* sample subsurface scattering */
- if ((pass_filter & BAKE_FILTER_DIFFUSE) && (sd->flag & SD_BSSRDF)) {
- /* When mixing BSSRDF and BSDF closures we should skip BSDF lighting
- * if scattering was successful. */
- SubsurfaceIndirectRays ss_indirect;
- kernel_path_subsurface_init_indirect(&ss_indirect);
- if (kernel_path_subsurface_scatter(
- kg, sd, emission_sd, L, &state, &ray, &throughput, &ss_indirect)) {
- while (ss_indirect.num_rays) {
- kernel_path_subsurface_setup_indirect(kg, &ss_indirect, &state, &ray, L, &throughput);
- kernel_path_indirect(
- kg, &indirect_sd, emission_sd, &ray, throughput, &state, L, sd->object);
- }
- is_sss_sample = true;
- }
- }
-# endif
-
- /* sample light and BSDF */
- if (!is_sss_sample && (pass_filter & (BAKE_FILTER_DIRECT | BAKE_FILTER_INDIRECT))) {
- kernel_path_surface_connect_light(kg, sd, emission_sd, throughput, &state, L);
-
- if (kernel_path_surface_bounce(kg, sd, &throughput, &state, &L->state, &ray)) {
-# ifdef __LAMP_MIS__
- state.ray_t = 0.0f;
-# endif
- /* compute indirect light */
- kernel_path_indirect(
- kg, &indirect_sd, emission_sd, &ray, throughput, &state, L, sd->object);
-
- /* sum and reset indirect light pass variables for the next samples */
- path_radiance_sum_indirect(L);
- path_radiance_reset_indirect(L);
- }
- }
-# ifdef __BRANCHED_PATH__
- }
- else {
- /* branched path tracer */
-
- /* sample ambient occlusion */
- if (pass_filter & BAKE_FILTER_AO) {
- kernel_branched_path_ao(kg, sd, emission_sd, L, &state, throughput);
- }
-
- /* sample emission */
- if ((pass_filter & BAKE_FILTER_EMISSION) && (sd->flag & SD_EMISSION)) {
- float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf);
- path_radiance_accum_emission(kg, L, &state, throughput, emission);
- }
-
-# ifdef __SUBSURFACE__
- /* sample subsurface scattering */
- if ((pass_filter & BAKE_FILTER_DIFFUSE) && (sd->flag & SD_BSSRDF)) {
- /* When mixing BSSRDF and BSDF closures we should skip BSDF lighting
- * if scattering was successful. */
- kernel_branched_path_subsurface_scatter(
- kg, sd, &indirect_sd, emission_sd, L, &state, &ray, throughput);
- }
-# endif
-
- /* sample light and BSDF */
- if (pass_filter & (BAKE_FILTER_DIRECT | BAKE_FILTER_INDIRECT)) {
-# if defined(__EMISSION__)
- /* direct light */
- if (kernel_data.integrator.use_direct_light) {
- int all = kernel_data.integrator.sample_all_lights_direct;
- kernel_branched_path_surface_connect_light(
- kg, sd, emission_sd, &state, throughput, 1.0f, L, all);
- }
-# endif
-
- /* indirect light */
- kernel_branched_path_surface_indirect_light(
- kg, sd, &indirect_sd, emission_sd, throughput, 1.0f, &state, L);
- }
- }
-# endif
-}
-
-/* this helps with AA but it's not the real solution as it does not AA the geometry
- * but it's better than nothing, thus committed */
-ccl_device_inline float bake_clamp_mirror_repeat(float u, float max)
-{
- /* use mirror repeat (like opengl texture) so that if the barycentric
- * coordinate goes past the end of the triangle it is not always clamped
- * to the same value, gives ugly patterns */
- u /= max;
- float fu = floorf(u);
- u = u - fu;
-
- return ((((int)fu) & 1) ? 1.0f - u : u) * max;
-}
-
-ccl_device_inline float3 kernel_bake_shader_bsdf(KernelGlobals *kg,
- ShaderData *sd,
- const ShaderEvalType type)
-{
- switch (type) {
- case SHADER_EVAL_DIFFUSE:
- return shader_bsdf_diffuse(kg, sd);
- case SHADER_EVAL_GLOSSY:
- return shader_bsdf_glossy(kg, sd);
- case SHADER_EVAL_TRANSMISSION:
- return shader_bsdf_transmission(kg, sd);
- default:
- kernel_assert(!"Unknown bake type passed to BSDF evaluate");
- return zero_float3();
- }
-}
-
-ccl_device float3 kernel_bake_evaluate_direct_indirect(KernelGlobals *kg,
- ShaderData *sd,
- PathState *state,
- float3 direct,
- float3 indirect,
- const ShaderEvalType type,
- const int pass_filter)
-{
- float3 color;
- const bool is_color = (pass_filter & BAKE_FILTER_COLOR) != 0;
- const bool is_direct = (pass_filter & BAKE_FILTER_DIRECT) != 0;
- const bool is_indirect = (pass_filter & BAKE_FILTER_INDIRECT) != 0;
- float3 out = zero_float3();
-
- if (is_color) {
- if (is_direct || is_indirect) {
- /* Leave direct and diffuse channel colored. */
- color = one_float3();
- }
- else {
- /* surface color of the pass only */
- shader_eval_surface(kg, sd, state, NULL, 0);
- return kernel_bake_shader_bsdf(kg, sd, type);
- }
- }
- else {
- shader_eval_surface(kg, sd, state, NULL, 0);
- color = kernel_bake_shader_bsdf(kg, sd, type);
- }
-
- if (is_direct) {
- out += safe_divide_even_color(direct, color);
- }
-
- if (is_indirect) {
- out += safe_divide_even_color(indirect, color);
- }
-
- return out;
-}
-
-ccl_device void kernel_bake_evaluate(
- KernelGlobals *kg, ccl_global float *buffer, int sample, int x, int y, int offset, int stride)
-{
- /* Setup render buffers. */
- const int index = offset + x + y * stride;
- const int pass_stride = kernel_data.film.pass_stride;
- buffer += index * pass_stride;
-
- ccl_global float *primitive = buffer + kernel_data.film.pass_bake_primitive;
- ccl_global float *differential = buffer + kernel_data.film.pass_bake_differential;
- ccl_global float *output = buffer + kernel_data.film.pass_combined;
-
- int seed = __float_as_uint(primitive[0]);
- int prim = __float_as_uint(primitive[1]);
- if (prim == -1)
- return;
-
- prim += kernel_data.bake.tri_offset;
-
- /* Random number generator. */
- uint rng_hash = hash_uint(seed) ^ kernel_data.integrator.seed;
- int num_samples = kernel_data.integrator.aa_samples;
-
- float filter_x, filter_y;
- if (sample == 0) {
- filter_x = filter_y = 0.5f;
- }
- else {
- path_rng_2D(kg, rng_hash, sample, num_samples, PRNG_FILTER_U, &filter_x, &filter_y);
- }
-
- /* Barycentric UV with sub-pixel offset. */
- float u = primitive[2];
- float v = primitive[3];
-
- float dudx = differential[0];
- float dudy = differential[1];
- float dvdx = differential[2];
- float dvdy = differential[3];
-
- if (sample > 0) {
- u = bake_clamp_mirror_repeat(u + dudx * (filter_x - 0.5f) + dudy * (filter_y - 0.5f), 1.0f);
- v = bake_clamp_mirror_repeat(v + dvdx * (filter_x - 0.5f) + dvdy * (filter_y - 0.5f),
- 1.0f - u);
- }
-
- /* Shader data setup. */
- int object = kernel_data.bake.object_index;
- int shader;
- float3 P, Ng;
-
- triangle_point_normal(kg, object, prim, u, v, &P, &Ng, &shader);
-
- ShaderData sd;
- shader_setup_from_sample(
- kg,
- &sd,
- P,
- Ng,
- Ng,
- shader,
- object,
- prim,
- u,
- v,
- 1.0f,
- 0.5f,
- !(kernel_tex_fetch(__object_flag, object) & SD_OBJECT_TRANSFORM_APPLIED),
- LAMP_NONE);
- sd.I = sd.N;
-
- /* Setup differentials. */
- sd.dP.dx = sd.dPdu * dudx + sd.dPdv * dvdx;
- sd.dP.dy = sd.dPdu * dudy + sd.dPdv * dvdy;
- sd.du.dx = dudx;
- sd.du.dy = dudy;
- sd.dv.dx = dvdx;
- sd.dv.dy = dvdy;
-
- /* Set RNG state for shaders that use sampling. */
- PathState state = {0};
- state.rng_hash = rng_hash;
- state.rng_offset = 0;
- state.sample = sample;
- state.num_samples = num_samples;
- state.min_ray_pdf = FLT_MAX;
-
- /* Light passes if we need more than color. */
- PathRadiance L;
- int pass_filter = kernel_data.bake.pass_filter;
-
- if (kernel_data.bake.pass_filter & ~BAKE_FILTER_COLOR)
- compute_light_pass(kg, &sd, &L, rng_hash, pass_filter, sample);
-
- float3 out = zero_float3();
-
- ShaderEvalType type = (ShaderEvalType)kernel_data.bake.type;
- switch (type) {
- /* data passes */
- case SHADER_EVAL_NORMAL:
- case SHADER_EVAL_ROUGHNESS:
- case SHADER_EVAL_EMISSION: {
- if (type != SHADER_EVAL_NORMAL || (sd.flag & SD_HAS_BUMP)) {
- int path_flag = (type == SHADER_EVAL_EMISSION) ? PATH_RAY_EMISSION : 0;
- shader_eval_surface(kg, &sd, &state, NULL, path_flag);
- }
-
- if (type == SHADER_EVAL_NORMAL) {
- float3 N = sd.N;
- if (sd.flag & SD_HAS_BUMP) {
- N = shader_bsdf_average_normal(kg, &sd);
- }
+#pragma once
- /* encoding: normal = (2 * color) - 1 */
- out = N * 0.5f + make_float3(0.5f, 0.5f, 0.5f);
- }
- else if (type == SHADER_EVAL_ROUGHNESS) {
- float roughness = shader_bsdf_average_roughness(&sd);
- out = make_float3(roughness, roughness, roughness);
- }
- else {
- out = shader_emissive_eval(&sd);
- }
- break;
- }
- case SHADER_EVAL_UV: {
- out = primitive_uv(kg, &sd);
- break;
- }
-# ifdef __PASSES__
- /* light passes */
- case SHADER_EVAL_AO: {
- out = L.ao;
- break;
- }
- case SHADER_EVAL_COMBINED: {
- if ((pass_filter & BAKE_FILTER_COMBINED) == BAKE_FILTER_COMBINED) {
- float alpha;
- out = path_radiance_clamp_and_sum(kg, &L, &alpha);
- break;
- }
+#include "kernel/kernel_differential.h"
+#include "kernel/kernel_projection.h"
+#include "kernel/kernel_shader.h"
- if ((pass_filter & BAKE_FILTER_DIFFUSE_DIRECT) == BAKE_FILTER_DIFFUSE_DIRECT)
- out += L.direct_diffuse;
- if ((pass_filter & BAKE_FILTER_DIFFUSE_INDIRECT) == BAKE_FILTER_DIFFUSE_INDIRECT)
- out += L.indirect_diffuse;
+#include "kernel/geom/geom.h"
- if ((pass_filter & BAKE_FILTER_GLOSSY_DIRECT) == BAKE_FILTER_GLOSSY_DIRECT)
- out += L.direct_glossy;
- if ((pass_filter & BAKE_FILTER_GLOSSY_INDIRECT) == BAKE_FILTER_GLOSSY_INDIRECT)
- out += L.indirect_glossy;
-
- if ((pass_filter & BAKE_FILTER_TRANSMISSION_DIRECT) == BAKE_FILTER_TRANSMISSION_DIRECT)
- out += L.direct_transmission;
- if ((pass_filter & BAKE_FILTER_TRANSMISSION_INDIRECT) == BAKE_FILTER_TRANSMISSION_INDIRECT)
- out += L.indirect_transmission;
-
- if ((pass_filter & BAKE_FILTER_EMISSION) != 0)
- out += L.emission;
-
- break;
- }
- case SHADER_EVAL_SHADOW: {
- out = L.shadow;
- break;
- }
- case SHADER_EVAL_DIFFUSE: {
- out = kernel_bake_evaluate_direct_indirect(
- kg, &sd, &state, L.direct_diffuse, L.indirect_diffuse, type, pass_filter);
- break;
- }
- case SHADER_EVAL_GLOSSY: {
- out = kernel_bake_evaluate_direct_indirect(
- kg, &sd, &state, L.direct_glossy, L.indirect_glossy, type, pass_filter);
- break;
- }
- case SHADER_EVAL_TRANSMISSION: {
- out = kernel_bake_evaluate_direct_indirect(
- kg, &sd, &state, L.direct_transmission, L.indirect_transmission, type, pass_filter);
- break;
- }
-# endif
-
- /* extra */
- case SHADER_EVAL_ENVIRONMENT: {
- /* setup ray */
- Ray ray;
-
- ray.P = zero_float3();
- ray.D = normalize(P);
- ray.t = 0.0f;
-# ifdef __CAMERA_MOTION__
- ray.time = 0.5f;
-# endif
-
-# ifdef __RAY_DIFFERENTIALS__
- ray.dD = differential3_zero();
- ray.dP = differential3_zero();
-# endif
-
- /* setup shader data */
- shader_setup_from_background(kg, &sd, &ray);
-
- /* evaluate */
- int path_flag = 0; /* we can't know which type of BSDF this is for */
- shader_eval_surface(kg, &sd, &state, NULL, path_flag | PATH_RAY_EMISSION);
- out = shader_background_eval(&sd);
- break;
- }
- default: {
- /* no real shader, returning the position of the verts for debugging */
- out = normalize(P);
- break;
- }
- }
-
- /* write output */
- const float4 result = make_float4(out.x, out.y, out.z, 1.0f);
- kernel_write_pass_float4(output, result);
-}
-
-#endif /* __BAKING__ */
+CCL_NAMESPACE_BEGIN
-ccl_device void kernel_displace_evaluate(KernelGlobals *kg,
- ccl_global uint4 *input,
+ccl_device void kernel_displace_evaluate(const KernelGlobals *kg,
+ ccl_global const KernelShaderEvalInput *input,
ccl_global float4 *output,
- int i)
+ const int offset)
{
- ShaderData sd;
- PathState state = {0};
- uint4 in = input[i];
+ /* Setup shader data. */
+ const KernelShaderEvalInput in = input[offset];
- /* setup shader data */
- int object = in.x;
- int prim = in.y;
- float u = __uint_as_float(in.z);
- float v = __uint_as_float(in.w);
-
- shader_setup_from_displace(kg, &sd, object, prim, u, v);
+ ShaderData sd;
+ shader_setup_from_displace(kg, &sd, in.object, in.prim, in.u, in.v);
- /* evaluate */
- float3 P = sd.P;
- shader_eval_displacement(kg, &sd, &state);
+ /* Evaluate displacement shader. */
+ const float3 P = sd.P;
+ shader_eval_displacement(INTEGRATOR_STATE_PASS_NULL, &sd);
float3 D = sd.P - P;
object_inverse_dir_transform(kg, &sd, &D);
- /* write output */
- output[i] += make_float4(D.x, D.y, D.z, 0.0f);
+ /* Write output. */
+ output[offset] += make_float4(D.x, D.y, D.z, 0.0f);
}
-ccl_device void kernel_background_evaluate(KernelGlobals *kg,
- ccl_global uint4 *input,
+ccl_device void kernel_background_evaluate(const KernelGlobals *kg,
+ ccl_global const KernelShaderEvalInput *input,
ccl_global float4 *output,
- int i)
+ const int offset)
{
- ShaderData sd;
- PathState state = {0};
- uint4 in = input[i];
-
- /* setup ray */
- Ray ray;
- float u = __uint_as_float(in.x);
- float v = __uint_as_float(in.y);
-
- ray.P = zero_float3();
- ray.D = equirectangular_to_direction(u, v);
- ray.t = 0.0f;
-#ifdef __CAMERA_MOTION__
- ray.time = 0.5f;
-#endif
+ /* Setup ray */
+ const KernelShaderEvalInput in = input[offset];
+ const float3 ray_P = zero_float3();
+ const float3 ray_D = equirectangular_to_direction(in.u, in.v);
+ const float ray_time = 0.5f;
-#ifdef __RAY_DIFFERENTIALS__
- ray.dD = differential3_zero();
- ray.dP = differential3_zero();
-#endif
-
- /* setup shader data */
- shader_setup_from_background(kg, &sd, &ray);
+ /* Setup shader data. */
+ ShaderData sd;
+ shader_setup_from_background(kg, &sd, ray_P, ray_D, ray_time);
- /* evaluate */
- int path_flag = 0; /* we can't know which type of BSDF this is for */
- shader_eval_surface(kg, &sd, &state, NULL, path_flag | PATH_RAY_EMISSION);
- float3 color = shader_background_eval(&sd);
+ /* Evaluate shader.
+ * This is being evaluated for all BSDFs, so path flag does not contain a specific type. */
+ const int path_flag = PATH_RAY_EMISSION;
+ shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_LIGHT>(
+ INTEGRATOR_STATE_PASS_NULL, &sd, NULL, path_flag);
+ const float3 color = shader_background_eval(&sd);
- /* write output */
- output[i] += make_float4(color.x, color.y, color.z, 0.0f);
+ /* Write output. */
+ output[offset] += make_float4(color.x, color.y, color.z, 0.0f);
}
CCL_NAMESPACE_END
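For context, the evaluation kernels above now read a structured KernelShaderEvalInput instead of a packed uint4. Below is a minimal sketch of that input layout and of a hypothetical host-side loop driving the displacement kernel on the CPU, inferred only from the field accesses in this hunk (in.object, in.prim, in.u, in.v); the real definition and driver live elsewhere and may differ.

/* Sketch only: layout inferred from the accesses above, not the canonical definition. */
struct KernelShaderEvalInput {
  int object; /* Object index the evaluation point belongs to. */
  int prim;   /* Primitive (triangle) index within that object. */
  float u, v; /* Barycentric coordinates of the evaluation point. */
};

/* Hypothetical CPU driver: evaluate displacement for a batch of points. */
void evaluate_displacement_batch(const KernelGlobals *kg,
                                 const KernelShaderEvalInput *input,
                                 float4 *output,
                                 const int num_points)
{
  for (int i = 0; i < num_points; i++) {
    kernel_displace_evaluate(kg, input, output, i);
  }
}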
diff --git a/intern/cycles/kernel/kernel_camera.h b/intern/cycles/kernel/kernel_camera.h
index 1bfac37158d..7be5da8fe6d 100644
--- a/intern/cycles/kernel/kernel_camera.h
+++ b/intern/cycles/kernel/kernel_camera.h
@@ -14,6 +14,13 @@
* limitations under the License.
*/
+#pragma once
+
+#include "kernel_differential.h"
+#include "kernel_lookup_table.h"
+#include "kernel_montecarlo.h"
+#include "kernel_projection.h"
+
CCL_NAMESPACE_BEGIN
/* Perspective Camera */
@@ -39,7 +46,7 @@ ccl_device float2 camera_sample_aperture(ccl_constant KernelCamera *cam, float u
return bokeh;
}
-ccl_device void camera_sample_perspective(KernelGlobals *kg,
+ccl_device void camera_sample_perspective(const KernelGlobals *ccl_restrict kg,
float raster_x,
float raster_y,
float lens_u,
@@ -113,10 +120,14 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg,
#ifdef __RAY_DIFFERENTIALS__
float3 Dcenter = transform_direction(&cameratoworld, Pcamera);
-
- ray->dP = differential3_zero();
- ray->dD.dx = normalize(Dcenter + float4_to_float3(kernel_data.cam.dx)) - normalize(Dcenter);
- ray->dD.dy = normalize(Dcenter + float4_to_float3(kernel_data.cam.dy)) - normalize(Dcenter);
+ float3 Dcenter_normalized = normalize(Dcenter);
+
+ /* TODO: can this be optimized to give compact differentials directly? */
+ ray->dP = differential_zero_compact();
+ differential3 dD;
+ dD.dx = normalize(Dcenter + float4_to_float3(kernel_data.cam.dx)) - Dcenter_normalized;
+ dD.dy = normalize(Dcenter + float4_to_float3(kernel_data.cam.dy)) - Dcenter_normalized;
+ ray->dD = differential_make_compact(dD);
#endif
}
else {
@@ -143,8 +154,10 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg,
Dx = normalize(transform_direction(&cameratoworld, Dx));
spherical_stereo_transform(&kernel_data.cam, &Px, &Dx);
- ray->dP.dx = Px - Pcenter;
- ray->dD.dx = Dx - Dcenter;
+ differential3 dP, dD;
+
+ dP.dx = Px - Pcenter;
+ dD.dx = Dx - Dcenter;
float3 Py = Pnostereo;
float3 Dy = transform_perspective(&rastertocamera,
@@ -152,8 +165,10 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg,
Dy = normalize(transform_direction(&cameratoworld, Dy));
spherical_stereo_transform(&kernel_data.cam, &Py, &Dy);
- ray->dP.dy = Py - Pcenter;
- ray->dD.dy = Dy - Dcenter;
+ dP.dy = Py - Pcenter;
+ dD.dy = Dy - Dcenter;
+ ray->dD = differential_make_compact(dD);
+ ray->dP = differential_make_compact(dP);
#endif
}
@@ -162,8 +177,7 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg,
float z_inv = 1.0f / normalize(Pcamera).z;
float nearclip = kernel_data.cam.nearclip * z_inv;
ray->P += nearclip * ray->D;
- ray->dP.dx += nearclip * ray->dD.dx;
- ray->dP.dy += nearclip * ray->dD.dy;
+ ray->dP += nearclip * ray->dD;
ray->t = kernel_data.cam.cliplength * z_inv;
#else
ray->t = FLT_MAX;
@@ -171,7 +185,7 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg,
}
/* Orthographic Camera */
-ccl_device void camera_sample_orthographic(KernelGlobals *kg,
+ccl_device void camera_sample_orthographic(const KernelGlobals *ccl_restrict kg,
float raster_x,
float raster_y,
float lens_u,
@@ -220,10 +234,12 @@ ccl_device void camera_sample_orthographic(KernelGlobals *kg,
#ifdef __RAY_DIFFERENTIALS__
/* ray differential */
- ray->dP.dx = float4_to_float3(kernel_data.cam.dx);
- ray->dP.dy = float4_to_float3(kernel_data.cam.dy);
+ differential3 dP;
+ dP.dx = float4_to_float3(kernel_data.cam.dx);
+  dP.dy = float4_to_float3(kernel_data.cam.dy);
- ray->dD = differential3_zero();
+ ray->dP = differential_make_compact(dP);
+ ray->dD = differential_zero_compact();
#endif
#ifdef __CAMERA_CLIPPING__
@@ -323,8 +339,9 @@ ccl_device_inline void camera_sample_panorama(ccl_constant KernelCamera *cam,
spherical_stereo_transform(cam, &Px, &Dx);
}
- ray->dP.dx = Px - Pcenter;
- ray->dD.dx = Dx - Dcenter;
+ differential3 dP, dD;
+ dP.dx = Px - Pcenter;
+ dD.dx = Dx - Dcenter;
float3 Py = transform_perspective(&rastertocamera, make_float3(raster_x, raster_y + 1.0f, 0.0f));
float3 Dy = panorama_to_direction(cam, Py.x, Py.y);
@@ -334,16 +351,17 @@ ccl_device_inline void camera_sample_panorama(ccl_constant KernelCamera *cam,
spherical_stereo_transform(cam, &Py, &Dy);
}
- ray->dP.dy = Py - Pcenter;
- ray->dD.dy = Dy - Dcenter;
+ dP.dy = Py - Pcenter;
+ dD.dy = Dy - Dcenter;
+ ray->dD = differential_make_compact(dD);
+ ray->dP = differential_make_compact(dP);
#endif
#ifdef __CAMERA_CLIPPING__
/* clipping */
float nearclip = cam->nearclip;
ray->P += nearclip * ray->D;
- ray->dP.dx += nearclip * ray->dD.dx;
- ray->dP.dy += nearclip * ray->dD.dy;
+ ray->dP += nearclip * ray->dD;
ray->t = cam->cliplength;
#else
ray->t = FLT_MAX;
@@ -352,7 +370,7 @@ ccl_device_inline void camera_sample_panorama(ccl_constant KernelCamera *cam,
/* Common */
-ccl_device_inline void camera_sample(KernelGlobals *kg,
+ccl_device_inline void camera_sample(const KernelGlobals *ccl_restrict kg,
int x,
int y,
float filter_u,
@@ -426,13 +444,13 @@ ccl_device_inline void camera_sample(KernelGlobals *kg,
/* Utilities */
-ccl_device_inline float3 camera_position(KernelGlobals *kg)
+ccl_device_inline float3 camera_position(const KernelGlobals *kg)
{
Transform cameratoworld = kernel_data.cam.cameratoworld;
return make_float3(cameratoworld.x.w, cameratoworld.y.w, cameratoworld.z.w);
}
-ccl_device_inline float camera_distance(KernelGlobals *kg, float3 P)
+ccl_device_inline float camera_distance(const KernelGlobals *kg, float3 P)
{
Transform cameratoworld = kernel_data.cam.cameratoworld;
float3 camP = make_float3(cameratoworld.x.w, cameratoworld.y.w, cameratoworld.z.w);
@@ -446,7 +464,7 @@ ccl_device_inline float camera_distance(KernelGlobals *kg, float3 P)
}
}
-ccl_device_inline float camera_z_depth(KernelGlobals *kg, float3 P)
+ccl_device_inline float camera_z_depth(const KernelGlobals *kg, float3 P)
{
if (kernel_data.cam.type != CAMERA_PANORAMA) {
Transform worldtocamera = kernel_data.cam.worldtocamera;
@@ -459,7 +477,7 @@ ccl_device_inline float camera_z_depth(KernelGlobals *kg, float3 P)
}
}
-ccl_device_inline float3 camera_direction_from_point(KernelGlobals *kg, float3 P)
+ccl_device_inline float3 camera_direction_from_point(const KernelGlobals *kg, float3 P)
{
Transform cameratoworld = kernel_data.cam.cameratoworld;
@@ -473,7 +491,7 @@ ccl_device_inline float3 camera_direction_from_point(KernelGlobals *kg, float3 P
}
}
-ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd, float3 P)
+ccl_device_inline float3 camera_world_to_ndc(const KernelGlobals *kg, ShaderData *sd, float3 P)
{
if (kernel_data.cam.type != CAMERA_PANORAMA) {
/* perspective / ortho */
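A worked example (with invented numbers) of what the compact differential change above buys for the near-clip offset: the two per-axis float3 updates collapse into a single scalar multiply-add. differential_make_compact() and differential_zero_compact() are the helpers added in kernel_differential.h further below.

/* Illustration only; the values are made up. */
differential3 dD;
dD.dx = make_float3(0.002f, 0.0f, 0.0f);
dD.dy = make_float3(0.0f, 0.004f, 0.0f);

/* The compact form keeps the average axis length: 0.5f * (0.002f + 0.004f) = 0.003f. */
const float dD_compact = differential_make_compact(dD);
float dP_compact = differential_zero_compact();

/* Near clipping is now one scalar FMA (0.1f * 0.003f = 0.0003f) instead of two float3 updates. */
const float nearclip = 0.1f;
dP_compact += nearclip * dD_compact;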
diff --git a/intern/cycles/kernel/kernel_color.h b/intern/cycles/kernel/kernel_color.h
index 5eb1bdad02e..960774e0741 100644
--- a/intern/cycles/kernel/kernel_color.h
+++ b/intern/cycles/kernel/kernel_color.h
@@ -14,25 +14,22 @@
* limitations under the License.
*/
-#ifndef __KERNEL_COLOR_H__
-#define __KERNEL_COLOR_H__
+#pragma once
#include "util/util_color.h"
CCL_NAMESPACE_BEGIN
-ccl_device float3 xyz_to_rgb(KernelGlobals *kg, float3 xyz)
+ccl_device float3 xyz_to_rgb(const KernelGlobals *kg, float3 xyz)
{
return make_float3(dot(float4_to_float3(kernel_data.film.xyz_to_r), xyz),
dot(float4_to_float3(kernel_data.film.xyz_to_g), xyz),
dot(float4_to_float3(kernel_data.film.xyz_to_b), xyz));
}
-ccl_device float linear_rgb_to_gray(KernelGlobals *kg, float3 c)
+ccl_device float linear_rgb_to_gray(const KernelGlobals *kg, float3 c)
{
return dot(c, float4_to_float3(kernel_data.film.rgb_to_y));
}
CCL_NAMESPACE_END
-
-#endif /* __KERNEL_COLOR_H__ */
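xyz_to_rgb() above is a 3x3 matrix multiply whose rows are supplied by the host in kernel_data.film. For reference, here is a self-contained sketch with a hard-coded CIE XYZ to linear Rec.709 matrix; the actual rows are scene and color-management dependent, so this is not what the kernel computes in general.

/* Sketch only: fixed Rec.709 primaries instead of kernel_data.film.xyz_to_r/g/b. */
ccl_device float3 xyz_to_rec709(const float3 xyz)
{
  return make_float3(dot(make_float3(3.2406f, -1.5372f, -0.4986f), xyz),
                     dot(make_float3(-0.9689f, 1.8758f, 0.0415f), xyz),
                     dot(make_float3(0.0557f, -0.2040f, 1.0570f), xyz));
}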
diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h
deleted file mode 100644
index 4a9304a134c..00000000000
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ /dev/null
@@ -1,177 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __KERNEL_COMPAT_OPENCL_H__
-#define __KERNEL_COMPAT_OPENCL_H__
-
-#define __KERNEL_GPU__
-#define __KERNEL_OPENCL__
-
-/* no namespaces in opencl */
-#define CCL_NAMESPACE_BEGIN
-#define CCL_NAMESPACE_END
-
-#ifdef __CL_NOINLINE__
-# define ccl_noinline __attribute__((noinline))
-#else
-# define ccl_noinline
-#endif
-
-/* in opencl all functions are device functions, so leave this empty */
-#define ccl_device
-#define ccl_device_inline ccl_device
-#define ccl_device_forceinline ccl_device
-#define ccl_device_noinline ccl_device ccl_noinline
-#define ccl_device_noinline_cpu ccl_device
-#define ccl_may_alias
-#define ccl_static_constant static __constant
-#define ccl_constant __constant
-#define ccl_global __global
-#define ccl_local __local
-#define ccl_local_param __local
-#define ccl_private __private
-#define ccl_restrict restrict
-#define ccl_ref
-#define ccl_align(n) __attribute__((aligned(n)))
-#define ccl_optional_struct_init
-
-#if __OPENCL_VERSION__ >= 200 && !defined(__NV_CL_C_VERSION)
-# define ccl_loop_no_unroll __attribute__((opencl_unroll_hint(1)))
-#else
-# define ccl_loop_no_unroll
-#endif
-
-#ifdef __SPLIT_KERNEL__
-# define ccl_addr_space __global
-#else
-# define ccl_addr_space
-#endif
-
-#define ATTR_FALLTHROUGH
-
-#define ccl_local_id(d) get_local_id(d)
-#define ccl_global_id(d) get_global_id(d)
-
-#define ccl_local_size(d) get_local_size(d)
-#define ccl_global_size(d) get_global_size(d)
-
-#define ccl_group_id(d) get_group_id(d)
-#define ccl_num_groups(d) get_num_groups(d)
-
-/* Selective nodes compilation. */
-#ifndef __NODES_MAX_GROUP__
-# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
-#endif
-#ifndef __NODES_FEATURES__
-# define __NODES_FEATURES__ NODE_FEATURE_ALL
-#endif
-
-/* no assert in opencl */
-#define kernel_assert(cond)
-
-/* make_type definitions with opencl style element initializers */
-#ifdef make_float2
-# undef make_float2
-#endif
-#ifdef make_float3
-# undef make_float3
-#endif
-#ifdef make_float4
-# undef make_float4
-#endif
-#ifdef make_int2
-# undef make_int2
-#endif
-#ifdef make_int3
-# undef make_int3
-#endif
-#ifdef make_int4
-# undef make_int4
-#endif
-#ifdef make_uchar4
-# undef make_uchar4
-#endif
-
-#define make_float2(x, y) ((float2)(x, y))
-#define make_float3(x, y, z) ((float3)(x, y, z))
-#define make_float4(x, y, z, w) ((float4)(x, y, z, w))
-#define make_int2(x, y) ((int2)(x, y))
-#define make_int3(x, y, z) ((int3)(x, y, z))
-#define make_int4(x, y, z, w) ((int4)(x, y, z, w))
-#define make_uchar4(x, y, z, w) ((uchar4)(x, y, z, w))
-
-/* math functions */
-#define __uint_as_float(x) as_float(x)
-#define __float_as_uint(x) as_uint(x)
-#define __int_as_float(x) as_float(x)
-#define __float_as_int(x) as_int(x)
-#define powf(x, y) pow(((float)(x)), ((float)(y)))
-#define fabsf(x) fabs(((float)(x)))
-#define copysignf(x, y) copysign(((float)(x)), ((float)(y)))
-#define asinf(x) asin(((float)(x)))
-#define acosf(x) acos(((float)(x)))
-#define atanf(x) atan(((float)(x)))
-#define floorf(x) floor(((float)(x)))
-#define ceilf(x) ceil(((float)(x)))
-#define hypotf(x, y) hypot(((float)(x)), ((float)(y)))
-#define atan2f(x, y) atan2(((float)(x)), ((float)(y)))
-#define fmaxf(x, y) fmax(((float)(x)), ((float)(y)))
-#define fminf(x, y) fmin(((float)(x)), ((float)(y)))
-#define fmodf(x, y) fmod((float)(x), (float)(y))
-#define sinhf(x) sinh(((float)(x)))
-#define coshf(x) cosh(((float)(x)))
-#define tanhf(x) tanh(((float)(x)))
-
-/* Use native functions with possibly lower precision for performance,
- * no issues found so far. */
-#if 1
-# define sinf(x) native_sin(((float)(x)))
-# define cosf(x) native_cos(((float)(x)))
-# define tanf(x) native_tan(((float)(x)))
-# define expf(x) native_exp(((float)(x)))
-# define sqrtf(x) native_sqrt(((float)(x)))
-# define logf(x) native_log(((float)(x)))
-# define rcp(x) native_recip(x)
-#else
-# define sinf(x) sin(((float)(x)))
-# define cosf(x) cos(((float)(x)))
-# define tanf(x) tan(((float)(x)))
-# define expf(x) exp(((float)(x)))
-# define sqrtf(x) sqrt(((float)(x)))
-# define logf(x) log(((float)(x)))
-# define rcp(x) recip(x)
-#endif
-
-/* data lookup defines */
-#define kernel_data (*kg->data)
-#define kernel_tex_array(tex) \
- ((const ccl_global tex##_t *)(kg->buffers[kg->tex.cl_buffer] + kg->tex.data))
-#define kernel_tex_fetch(tex, index) kernel_tex_array(tex)[(index)]
-
-/* define NULL */
-#ifndef NULL
-# define NULL ((void *)0)
-#endif
-
-/* enable extensions */
-#ifdef __KERNEL_CL_KHR_FP16__
-# pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#endif
-
-#include "util/util_half.h"
-#include "util/util_types.h"
-
-#endif /* __KERNEL_COMPAT_OPENCL_H__ */
diff --git a/intern/cycles/kernel/kernel_differential.h b/intern/cycles/kernel/kernel_differential.h
index 3ec0cdbaccc..db4e110bd10 100644
--- a/intern/cycles/kernel/kernel_differential.h
+++ b/intern/cycles/kernel/kernel_differential.h
@@ -14,26 +14,28 @@
* limitations under the License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* See "Tracing Ray Differentials", Homan Igehy, 1999. */
-ccl_device void differential_transfer(ccl_addr_space differential3 *dP_,
- const differential3 dP,
- float3 D,
- const differential3 dD,
- float3 Ng,
- float t)
+ccl_device void differential_transfer(ccl_addr_space differential3 *surface_dP,
+ const differential3 ray_dP,
+ float3 ray_D,
+ const differential3 ray_dD,
+ float3 surface_Ng,
+ float ray_t)
{
/* ray differential transfer through homogeneous medium, to
* compute dPdx/dy at a shading point from the incoming ray */
- float3 tmp = D / dot(D, Ng);
- float3 tmpx = dP.dx + t * dD.dx;
- float3 tmpy = dP.dy + t * dD.dy;
+ float3 tmp = ray_D / dot(ray_D, surface_Ng);
+ float3 tmpx = ray_dP.dx + ray_t * ray_dD.dx;
+ float3 tmpy = ray_dP.dy + ray_t * ray_dD.dy;
- dP_->dx = tmpx - dot(tmpx, Ng) * tmp;
- dP_->dy = tmpy - dot(tmpy, Ng) * tmp;
+ surface_dP->dx = tmpx - dot(tmpx, surface_Ng) * tmp;
+ surface_dP->dy = tmpy - dot(tmpy, surface_Ng) * tmp;
}
ccl_device void differential_incoming(ccl_addr_space differential3 *dI, const differential3 dD)
@@ -112,4 +114,53 @@ ccl_device differential3 differential3_zero()
return d;
}
+/* Compact ray differentials that are just a scale, to reduce memory usage and
+ * access cost on the GPU.
+ *
+ * See above for more accurate reference implementations.
+ *
+ * TODO: also store the more compact version in ShaderData and recompute where
+ * needed? */
+
+ccl_device_forceinline float differential_zero_compact()
+{
+ return 0.0f;
+}
+
+ccl_device_forceinline float differential_make_compact(const differential3 D)
+{
+ return 0.5f * (len(D.dx) + len(D.dy));
+}
+
+ccl_device_forceinline void differential_transfer_compact(ccl_addr_space differential3 *surface_dP,
+ const float ray_dP,
+ const float3 /* ray_D */,
+ const float ray_dD,
+ const float3 surface_Ng,
+ const float ray_t)
+{
+ /* ray differential transfer through homogeneous medium, to
+ * compute dPdx/dy at a shading point from the incoming ray */
+ float scale = ray_dP + ray_t * ray_dD;
+
+ float3 dx, dy;
+ make_orthonormals(surface_Ng, &dx, &dy);
+ surface_dP->dx = dx * scale;
+ surface_dP->dy = dy * scale;
+}
+
+ccl_device_forceinline void differential_incoming_compact(ccl_addr_space differential3 *dI,
+ const float3 D,
+ const float dD)
+{
+ /* compute dIdx/dy at a shading point, we just need to negate the
+ * differential of the ray direction */
+
+ float3 dx, dy;
+ make_orthonormals(D, &dx, &dy);
+
+ dI->dx = dD * dx;
+ dI->dy = dD * dy;
+}
+
CCL_NAMESPACE_END
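A minimal sketch of the intended round trip of the compact helpers above, assuming the ray stores only the scalar dP/dD and the full differential3 frame is rebuilt at the hit point; the parameter names are invented for the example.

/* Sketch only: compress camera differentials to scalars, then expand at the hit point. */
ccl_device void example_compact_differential_roundtrip(const differential3 dD_camera,
                                                       const float3 ray_D,
                                                       const float3 surface_Ng,
                                                       const float isect_t,
                                                       ccl_addr_space differential3 *surface_dP)
{
  /* Camera/bounce code stores two scalars per ray instead of four float3 values. */
  const float ray_dP = differential_zero_compact(); /* Pinhole camera: no positional spread. */
  const float ray_dD = differential_make_compact(dD_camera);

  /* At the shading point the scalar is expanded back into an isotropic frame
   * around the surface normal. */
  differential_transfer_compact(surface_dP, ray_dP, ray_D, ray_dD, surface_Ng, isect_t);
}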
diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h
index aebf2ec8e28..d62285d173d 100644
--- a/intern/cycles/kernel/kernel_emission.h
+++ b/intern/cycles/kernel/kernel_emission.h
@@ -14,40 +14,36 @@
* limitations under the License.
*/
+#pragma once
+
+#include "kernel/kernel_light.h"
+#include "kernel/kernel_montecarlo.h"
+#include "kernel/kernel_path_state.h"
+#include "kernel/kernel_shader.h"
+
CCL_NAMESPACE_BEGIN
-/* Direction Emission */
-ccl_device_noinline_cpu float3 direct_emissive_eval(KernelGlobals *kg,
- ShaderData *emission_sd,
- LightSample *ls,
- ccl_addr_space PathState *state,
- float3 I,
- differential3 dI,
- float t,
- float time)
+/* Evaluate shader on light. */
+ccl_device_noinline_cpu float3 light_sample_shader_eval(INTEGRATOR_STATE_ARGS,
+ ShaderData *ccl_restrict emission_sd,
+ LightSample *ccl_restrict ls,
+ float time)
{
/* setup shading at emitter */
float3 eval = zero_float3();
if (shader_constant_emission_eval(kg, ls->shader, &eval)) {
- if ((ls->prim != PRIM_NONE) && dot(ls->Ng, I) < 0.0f) {
+ if ((ls->prim != PRIM_NONE) && dot(ls->Ng, ls->D) > 0.0f) {
ls->Ng = -ls->Ng;
}
}
else {
/* Setup shader data and call shader_eval_surface once, better
* for GPU coherence and compile times. */
+ PROFILING_INIT_FOR_SHADER(kg, PROFILING_SHADE_LIGHT_SETUP);
#ifdef __BACKGROUND_MIS__
if (ls->type == LIGHT_BACKGROUND) {
- Ray ray;
- ray.D = ls->D;
- ray.P = ls->P;
- ray.t = 1.0f;
- ray.time = time;
- ray.dP = differential3_zero();
- ray.dD = dI;
-
- shader_setup_from_background(kg, emission_sd, &ray);
+ shader_setup_from_background(kg, emission_sd, ls->P, ls->D, time);
}
else
#endif
@@ -56,13 +52,13 @@ ccl_device_noinline_cpu float3 direct_emissive_eval(KernelGlobals *kg,
emission_sd,
ls->P,
ls->Ng,
- I,
+ -ls->D,
ls->shader,
ls->object,
ls->prim,
ls->u,
ls->v,
- t,
+ ls->t,
time,
false,
ls->lamp);
@@ -70,11 +66,13 @@ ccl_device_noinline_cpu float3 direct_emissive_eval(KernelGlobals *kg,
ls->Ng = emission_sd->Ng;
}
+ PROFILING_SHADER(emission_sd->object, emission_sd->shader);
+ PROFILING_EVENT(PROFILING_SHADE_LIGHT_EVAL);
+
/* No proper path flag, we're evaluating this for all closures. that's
* weak but we'd have to do multiple evaluations otherwise. */
- path_state_modify_bounce(state, true);
- shader_eval_surface(kg, emission_sd, state, NULL, PATH_RAY_EMISSION);
- path_state_modify_bounce(state, false);
+ shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_LIGHT>(
+ INTEGRATOR_STATE_PASS, emission_sd, NULL, PATH_RAY_EMISSION);
/* Evaluate closures. */
#ifdef __BACKGROUND_MIS__
@@ -98,85 +96,129 @@ ccl_device_noinline_cpu float3 direct_emissive_eval(KernelGlobals *kg,
return eval;
}
-ccl_device_noinline_cpu bool direct_emission(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *emission_sd,
- LightSample *ls,
- ccl_addr_space PathState *state,
- Ray *ray,
- BsdfEval *eval,
- bool *is_lamp,
- float rand_terminate)
+/* Test if light sample is from a light or emission from geometry. */
+ccl_device_inline bool light_sample_is_light(const LightSample *ccl_restrict ls)
{
- if (ls->pdf == 0.0f)
- return false;
-
- /* todo: implement */
- differential3 dD = differential3_zero();
+ /* return if it's a lamp for shadow pass */
+ return (ls->prim == PRIM_NONE && ls->type != LIGHT_BACKGROUND);
+}
- /* evaluate closure */
+/* Early path termination of shadow rays. */
+ccl_device_inline bool light_sample_terminate(const KernelGlobals *ccl_restrict kg,
+ const LightSample *ccl_restrict ls,
+ BsdfEval *ccl_restrict eval,
+ const float rand_terminate)
+{
+ if (bsdf_eval_is_zero(eval)) {
+ return true;
+ }
- float3 light_eval = direct_emissive_eval(
- kg, emission_sd, ls, state, -ls->D, dD, ls->t, sd->time);
+ if (kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
+ float probability = max3(fabs(bsdf_eval_sum(eval))) *
+ kernel_data.integrator.light_inv_rr_threshold;
+ if (probability < 1.0f) {
+ if (rand_terminate >= probability) {
+ return true;
+ }
+ bsdf_eval_mul(eval, 1.0f / probability);
+ }
+ }
- if (is_zero(light_eval))
- return false;
+ return false;
+}
- /* evaluate BSDF at shading point */
+/* This function should be used to compute a modified ray start position for
+ * rays leaving from a surface. The algorithm slightly distorts the flat
+ * surface of a triangle: the surface is lifted by an amount h along the
+ * normal n at the incident point. */
-#ifdef __VOLUME__
- if (sd->prim != PRIM_NONE)
- shader_bsdf_eval(kg, sd, ls->D, eval, ls->pdf, ls->shader & SHADER_USE_MIS);
+ccl_device_inline float3 shadow_ray_smooth_surface_offset(const KernelGlobals *ccl_restrict kg,
+ const ShaderData *ccl_restrict sd,
+ float3 Ng)
+{
+ float3 V[3], N[3];
+ triangle_vertices_and_normals(kg, sd->prim, V, N);
+
+ const float u = sd->u, v = sd->v;
+ const float w = 1 - u - v;
+ float3 P = V[0] * u + V[1] * v + V[2] * w; /* Local space */
+ float3 n = N[0] * u + N[1] * v + N[2] * w; /* We get away without normalization */
+
+ object_normal_transform(kg, sd, &n); /* Normal x scale, world space */
+
+ /* Parabolic approximation */
+ float a = dot(N[2] - N[0], V[0] - V[2]);
+ float b = dot(N[2] - N[1], V[1] - V[2]);
+ float c = dot(N[1] - N[0], V[1] - V[0]);
+ float h = a * u * (u - 1) + (a + b + c) * u * v + b * v * (v - 1);
+
+ /* Check flipped normals */
+ if (dot(n, Ng) > 0) {
+ /* Local linear envelope */
+ float h0 = max(max(dot(V[1] - V[0], N[0]), dot(V[2] - V[0], N[0])), 0.0f);
+ float h1 = max(max(dot(V[0] - V[1], N[1]), dot(V[2] - V[1], N[1])), 0.0f);
+ float h2 = max(max(dot(V[0] - V[2], N[2]), dot(V[1] - V[2], N[2])), 0.0f);
+ h0 = max(dot(V[0] - P, N[0]) + h0, 0.0f);
+ h1 = max(dot(V[1] - P, N[1]) + h1, 0.0f);
+ h2 = max(dot(V[2] - P, N[2]) + h2, 0.0f);
+ h = max(min(min(h0, h1), h2), h * 0.5f);
+ }
else {
- float bsdf_pdf;
- shader_volume_phase_eval(kg, sd, ls->D, eval, &bsdf_pdf);
- if (ls->shader & SHADER_USE_MIS) {
- /* Multiple importance sampling. */
- float mis_weight = power_heuristic(ls->pdf, bsdf_pdf);
- light_eval *= mis_weight;
- }
+ float h0 = max(max(dot(V[0] - V[1], N[0]), dot(V[0] - V[2], N[0])), 0.0f);
+ float h1 = max(max(dot(V[1] - V[0], N[1]), dot(V[1] - V[2], N[1])), 0.0f);
+ float h2 = max(max(dot(V[2] - V[0], N[2]), dot(V[2] - V[1], N[2])), 0.0f);
+ h0 = max(dot(P - V[0], N[0]) + h0, 0.0f);
+ h1 = max(dot(P - V[1], N[1]) + h1, 0.0f);
+ h2 = max(dot(P - V[2], N[2]) + h2, 0.0f);
+ h = min(-min(min(h0, h1), h2), h * 0.5f);
}
-#else
- shader_bsdf_eval(kg, sd, ls->D, eval, ls->pdf, ls->shader & SHADER_USE_MIS);
-#endif
- bsdf_eval_mul3(eval, light_eval / ls->pdf);
-
-#ifdef __PASSES__
- /* use visibility flag to skip lights */
- if (ls->shader & SHADER_EXCLUDE_ANY) {
- if (ls->shader & SHADER_EXCLUDE_DIFFUSE)
- eval->diffuse = zero_float3();
- if (ls->shader & SHADER_EXCLUDE_GLOSSY)
- eval->glossy = zero_float3();
- if (ls->shader & SHADER_EXCLUDE_TRANSMIT)
- eval->transmission = zero_float3();
- if (ls->shader & SHADER_EXCLUDE_SCATTER)
- eval->volume = zero_float3();
- }
-#endif
+ return n * h;
+}
- if (bsdf_eval_is_zero(eval))
- return false;
+/* Ray offset to avoid shadow terminator artifact. */
- if (kernel_data.integrator.light_inv_rr_threshold > 0.0f
-#ifdef __SHADOW_TRICKS__
- && (state->flag & PATH_RAY_SHADOW_CATCHER) == 0
-#endif
- ) {
- float probability = max3(fabs(bsdf_eval_sum(eval))) *
- kernel_data.integrator.light_inv_rr_threshold;
- if (probability < 1.0f) {
- if (rand_terminate >= probability) {
- return false;
+ccl_device_inline float3 shadow_ray_offset(const KernelGlobals *ccl_restrict kg,
+ const ShaderData *ccl_restrict sd,
+ float3 L)
+{
+ float NL = dot(sd->N, L);
+ bool transmit = (NL < 0.0f);
+ float3 Ng = (transmit ? -sd->Ng : sd->Ng);
+ float3 P = ray_offset(sd->P, Ng);
+
+ if ((sd->type & PRIMITIVE_ALL_TRIANGLE) && (sd->shader & SHADER_SMOOTH_NORMAL)) {
+ const float offset_cutoff =
+ kernel_tex_fetch(__objects, sd->object).shadow_terminator_geometry_offset;
+    /* Do the ray offset (heavy stuff) only for triangles that are close to
+     * being terminated: offset_cutoff = 0.1f means that 10-20% of rays will
+     * be affected. Also make a smooth transition near the threshold. */
+ if (offset_cutoff > 0.0f) {
+ float NgL = dot(Ng, L);
+ float offset_amount = 0.0f;
+ if (NL < offset_cutoff) {
+ offset_amount = clamp(2.0f - (NgL + NL) / offset_cutoff, 0.0f, 1.0f);
+ }
+ else {
+ offset_amount = clamp(1.0f - NgL / offset_cutoff, 0.0f, 1.0f);
+ }
+ if (offset_amount > 0.0f) {
+ P += shadow_ray_smooth_surface_offset(kg, sd, Ng) * offset_amount;
}
- bsdf_eval_mul(eval, 1.0f / probability);
}
}
+ return P;
+}
+
+ccl_device_inline void shadow_ray_setup(const ShaderData *ccl_restrict sd,
+ const LightSample *ccl_restrict ls,
+ const float3 P,
+ Ray *ray)
+{
if (ls->shader & SHADER_CAST_SHADOW) {
/* setup ray */
- ray->P = ray_offset_shadow(kg, sd, ls->D);
+ ray->P = P;
if (ls->t == FLT_MAX) {
/* distant light */
@@ -185,160 +227,40 @@ ccl_device_noinline_cpu bool direct_emission(KernelGlobals *kg,
}
else {
/* other lights, avoid self-intersection */
- ray->D = ray_offset(ls->P, ls->Ng) - ray->P;
+ ray->D = ray_offset(ls->P, ls->Ng) - P;
ray->D = normalize_len(ray->D, &ray->t);
}
-
- ray->dP = sd->dP;
- ray->dD = differential3_zero();
}
else {
/* signal to not cast shadow ray */
+ ray->P = zero_float3();
+ ray->D = zero_float3();
ray->t = 0.0f;
}
- /* return if it's a lamp for shadow pass */
- *is_lamp = (ls->prim == PRIM_NONE && ls->type != LIGHT_BACKGROUND);
-
- return true;
+ ray->dP = differential_make_compact(sd->dP);
+ ray->dD = differential_zero_compact();
+ ray->time = sd->time;
}
-/* Indirect Primitive Emission */
-
-ccl_device_noinline_cpu float3 indirect_primitive_emission(
- KernelGlobals *kg, ShaderData *sd, float t, int path_flag, float bsdf_pdf)
+/* Create shadow ray towards light sample. */
+ccl_device_inline void light_sample_to_surface_shadow_ray(const KernelGlobals *ccl_restrict kg,
+ const ShaderData *ccl_restrict sd,
+ const LightSample *ccl_restrict ls,
+ Ray *ray)
{
- /* evaluate emissive closure */
- float3 L = shader_emissive_eval(sd);
-
-#ifdef __HAIR__
- if (!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS) &&
- (sd->type & PRIMITIVE_ALL_TRIANGLE))
-#else
- if (!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS))
-#endif
- {
- /* multiple importance sampling, get triangle light pdf,
- * and compute weight with respect to BSDF pdf */
- float pdf = triangle_light_pdf(kg, sd, t);
- float mis_weight = power_heuristic(bsdf_pdf, pdf);
-
- return L * mis_weight;
- }
-
- return L;
+ const float3 P = shadow_ray_offset(kg, sd, ls->D);
+ shadow_ray_setup(sd, ls, P, ray);
}
-/* Indirect Lamp Emission */
-
-ccl_device_noinline_cpu void indirect_lamp_emission(KernelGlobals *kg,
- ShaderData *emission_sd,
- ccl_addr_space PathState *state,
- PathRadiance *L,
- Ray *ray,
- float3 throughput)
+/* Create shadow ray towards light sample. */
+ccl_device_inline void light_sample_to_volume_shadow_ray(const KernelGlobals *ccl_restrict kg,
+ const ShaderData *ccl_restrict sd,
+ const LightSample *ccl_restrict ls,
+ const float3 P,
+ Ray *ray)
{
- for (int lamp = 0; lamp < kernel_data.integrator.num_all_lights; lamp++) {
- LightSample ls ccl_optional_struct_init;
-
- if (!lamp_light_eval(kg, lamp, ray->P, ray->D, ray->t, &ls))
- continue;
-
-#ifdef __PASSES__
- /* use visibility flag to skip lights */
- if (ls.shader & SHADER_EXCLUDE_ANY) {
- if (((ls.shader & SHADER_EXCLUDE_DIFFUSE) && (state->flag & PATH_RAY_DIFFUSE)) ||
- ((ls.shader & SHADER_EXCLUDE_GLOSSY) &&
- ((state->flag & (PATH_RAY_GLOSSY | PATH_RAY_REFLECT)) ==
- (PATH_RAY_GLOSSY | PATH_RAY_REFLECT))) ||
- ((ls.shader & SHADER_EXCLUDE_TRANSMIT) && (state->flag & PATH_RAY_TRANSMIT)) ||
- ((ls.shader & SHADER_EXCLUDE_SCATTER) && (state->flag & PATH_RAY_VOLUME_SCATTER)))
- continue;
- }
-#endif
-
- float3 lamp_L = direct_emissive_eval(
- kg, emission_sd, &ls, state, -ray->D, ray->dD, ls.t, ray->time);
-
-#ifdef __VOLUME__
- if (state->volume_stack[0].shader != SHADER_NONE) {
- /* shadow attenuation */
- Ray volume_ray = *ray;
- volume_ray.t = ls.t;
- float3 volume_tp = one_float3();
- kernel_volume_shadow(kg, emission_sd, state, &volume_ray, &volume_tp);
- lamp_L *= volume_tp;
- }
-#endif
-
- if (!(state->flag & PATH_RAY_MIS_SKIP)) {
- /* multiple importance sampling, get regular light pdf,
- * and compute weight with respect to BSDF pdf */
- float mis_weight = power_heuristic(state->ray_pdf, ls.pdf);
- lamp_L *= mis_weight;
- }
-
- path_radiance_accum_emission(kg, L, state, throughput, lamp_L);
- }
-}
-
-/* Indirect Background */
-
-ccl_device_noinline_cpu float3 indirect_background(KernelGlobals *kg,
- ShaderData *emission_sd,
- ccl_addr_space PathState *state,
- ccl_global float *buffer,
- ccl_addr_space Ray *ray)
-{
-#ifdef __BACKGROUND__
- int shader = kernel_data.background.surface_shader;
-
- /* Use visibility flag to skip lights. */
- if (shader & SHADER_EXCLUDE_ANY) {
- if (((shader & SHADER_EXCLUDE_DIFFUSE) && (state->flag & PATH_RAY_DIFFUSE)) ||
- ((shader & SHADER_EXCLUDE_GLOSSY) &&
- ((state->flag & (PATH_RAY_GLOSSY | PATH_RAY_REFLECT)) ==
- (PATH_RAY_GLOSSY | PATH_RAY_REFLECT))) ||
- ((shader & SHADER_EXCLUDE_TRANSMIT) && (state->flag & PATH_RAY_TRANSMIT)) ||
- ((shader & SHADER_EXCLUDE_CAMERA) && (state->flag & PATH_RAY_CAMERA)) ||
- ((shader & SHADER_EXCLUDE_SCATTER) && (state->flag & PATH_RAY_VOLUME_SCATTER)))
- return zero_float3();
- }
-
- /* Evaluate background shader. */
- float3 L = zero_float3();
- if (!shader_constant_emission_eval(kg, shader, &L)) {
-# ifdef __SPLIT_KERNEL__
- Ray priv_ray = *ray;
- shader_setup_from_background(kg, emission_sd, &priv_ray);
-# else
- shader_setup_from_background(kg, emission_sd, ray);
-# endif
-
- path_state_modify_bounce(state, true);
- shader_eval_surface(kg, emission_sd, state, buffer, state->flag | PATH_RAY_EMISSION);
- path_state_modify_bounce(state, false);
-
- L = shader_background_eval(emission_sd);
- }
-
- /* Background MIS weights. */
-# ifdef __BACKGROUND_MIS__
- /* Check if background light exists or if we should skip pdf. */
- if (!(state->flag & PATH_RAY_MIS_SKIP) && kernel_data.background.use_mis) {
- /* multiple importance sampling, get background light pdf for ray
- * direction, and compute weight with respect to BSDF pdf */
- float pdf = background_light_pdf(kg, ray->P, ray->D);
- float mis_weight = power_heuristic(state->ray_pdf, pdf);
-
- return L * mis_weight;
- }
-# endif
-
- return L;
-#else
- return make_float3(0.8f, 0.8f, 0.8f);
-#endif
+ shadow_ray_setup(sd, ls, P, ray);
}
CCL_NAMESPACE_END
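A hedged sketch of how a caller might chain the refactored helpers above for one direct-light sample; the surrounding integrator code that fills in the LightSample and BsdfEval is assumed and not shown.

/* Sketch only: returns true if a shadow ray should be traced for this sample. */
ccl_device bool example_direct_light_shadow_ray(const KernelGlobals *kg,
                                                const ShaderData *sd,
                                                const LightSample *ls,
                                                BsdfEval *eval,
                                                const float rand_terminate,
                                                Ray *shadow_ray)
{
  /* Russian-roulette termination of weak contributions, as implemented above. */
  if (light_sample_terminate(kg, ls, eval, rand_terminate)) {
    return false;
  }

  /* Offset the start position against the shadow terminator and fill in the ray.
   * A resulting shadow_ray->t of 0.0f means SHADER_CAST_SHADOW was disabled. */
  light_sample_to_surface_shadow_ray(kg, sd, ls, shadow_ray);
  return true;
}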
diff --git a/intern/cycles/kernel/kernel_film.h b/intern/cycles/kernel/kernel_film.h
index a6fd4f1dc7e..715d764fb31 100644
--- a/intern/cycles/kernel/kernel_film.h
+++ b/intern/cycles/kernel/kernel_film.h
@@ -14,119 +14,516 @@
* limitations under the License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
-ccl_device float4 film_get_pass_result(KernelGlobals *kg,
- ccl_global float *buffer,
- float sample_scale,
- int index,
- bool use_display_sample_scale)
-{
- float4 pass_result;
-
- int display_pass_stride = kernel_data.film.display_pass_stride;
- int display_pass_components = kernel_data.film.display_pass_components;
-
- if (display_pass_components == 4) {
- float4 in = *(ccl_global float4 *)(buffer + display_pass_stride +
- index * kernel_data.film.pass_stride);
- float alpha = use_display_sample_scale ?
- (kernel_data.film.use_display_pass_alpha ? in.w : 1.0f / sample_scale) :
- 1.0f;
-
- pass_result = make_float4(in.x, in.y, in.z, alpha);
-
- int display_divide_pass_stride = kernel_data.film.display_divide_pass_stride;
- if (display_divide_pass_stride != -1) {
- ccl_global float4 *divide_in = (ccl_global float4 *)(buffer + display_divide_pass_stride +
- index * kernel_data.film.pass_stride);
- float3 divided = safe_divide_even_color(float4_to_float3(pass_result),
- float4_to_float3(*divide_in));
- pass_result = make_float4(divided.x, divided.y, divided.z, pass_result.w);
- }
+/* --------------------------------------------------------------------
+ * Common utilities.
+ */
- if (kernel_data.film.use_display_exposure) {
- float exposure = kernel_data.film.exposure;
- pass_result *= make_float4(exposure, exposure, exposure, 1.0f);
- }
+/* The input buffer contains transparency = 1 - alpha; this converts it to
+ * alpha. Also clamp, since alpha might end up outside of 0..1 due to Russian
+ * roulette. */
+ccl_device_forceinline float film_transparency_to_alpha(float transparency)
+{
+ return saturate(1.0f - transparency);
+}
+
+ccl_device_inline float film_get_scale(const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer)
+{
+ if (kfilm_convert->pass_sample_count == PASS_UNUSED) {
+ return kfilm_convert->scale;
+ }
+
+ if (kfilm_convert->pass_use_filter) {
+ const uint sample_count = *((const uint *)(buffer + kfilm_convert->pass_sample_count));
+ return 1.0f / sample_count;
+ }
+
+ return 1.0f;
+}
+
+ccl_device_inline float film_get_scale_exposure(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer)
+{
+ if (kfilm_convert->pass_sample_count == PASS_UNUSED) {
+ return kfilm_convert->scale_exposure;
+ }
+
+ const float scale = film_get_scale(kfilm_convert, buffer);
+
+ if (kfilm_convert->pass_use_exposure) {
+ return scale * kfilm_convert->exposure;
+ }
+
+ return scale;
+}
+
+ccl_device_inline bool film_get_scale_and_scale_exposure(
+ const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict scale,
+ float *ccl_restrict scale_exposure)
+{
+ if (kfilm_convert->pass_sample_count == PASS_UNUSED) {
+ *scale = kfilm_convert->scale;
+ *scale_exposure = kfilm_convert->scale_exposure;
+ return true;
+ }
+
+ const uint sample_count = *((const uint *)(buffer + kfilm_convert->pass_sample_count));
+ if (!sample_count) {
+ *scale = 0.0f;
+ *scale_exposure = 0.0f;
+ return false;
+ }
+
+ if (kfilm_convert->pass_use_filter) {
+ *scale = 1.0f / sample_count;
}
- else if (display_pass_components == 1) {
- ccl_global float *in = (ccl_global float *)(buffer + display_pass_stride +
- index * kernel_data.film.pass_stride);
- pass_result = make_float4(*in, *in, *in, 1.0f / sample_scale);
+ else {
+ *scale = 1.0f;
+ }
+
+ if (kfilm_convert->pass_use_exposure) {
+ *scale_exposure = *scale * kfilm_convert->exposure;
+ }
+ else {
+ *scale_exposure = *scale;
+ }
+
+ return true;
+}
+
+/* --------------------------------------------------------------------
+ * Float (scalar) passes.
+ */
+
+ccl_device_inline void film_get_pass_pixel_depth(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components >= 1);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ const float scale_exposure = film_get_scale_exposure(kfilm_convert, buffer);
+
+ const float *in = buffer + kfilm_convert->pass_offset;
+ const float f = *in;
+
+ pixel[0] = (f == 0.0f) ? 1e10f : f * scale_exposure;
+}
+
+ccl_device_inline void film_get_pass_pixel_mist(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components >= 1);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ const float scale_exposure = film_get_scale_exposure(kfilm_convert, buffer);
+
+ const float *in = buffer + kfilm_convert->pass_offset;
+ const float f = *in;
+
+ /* Note that we accumulate 1 - mist in the kernel to avoid having to
+ * track the mist values in the integrator state. */
+ pixel[0] = saturate(1.0f - f * scale_exposure);
+}
+
+ccl_device_inline void film_get_pass_pixel_sample_count(
+ const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+  /* TODO(sergey): Consider normalizing into the [0..1] range, so that it is possible to see
+   * a meaningful value when the adaptive sampler stopped rendering the image way before the
+   * maximum number of samples was reached (for example, when the number of samples is set to
+   * 0 in the viewport). */
+
+ kernel_assert(kfilm_convert->num_components >= 1);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ const float *in = buffer + kfilm_convert->pass_offset;
+ const float f = *in;
+
+ pixel[0] = __float_as_uint(f) * kfilm_convert->scale;
+}
+
+ccl_device_inline void film_get_pass_pixel_float(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components >= 1);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ const float scale_exposure = film_get_scale_exposure(kfilm_convert, buffer);
+
+ const float *in = buffer + kfilm_convert->pass_offset;
+ const float f = *in;
+
+ pixel[0] = f * scale_exposure;
+}
+
+/* --------------------------------------------------------------------
+ * Float 3 passes.
+ */
+
+ccl_device_inline void film_get_pass_pixel_light_path(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components >= 3);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ /* Read light pass. */
+ const float *in = buffer + kfilm_convert->pass_offset;
+ float3 f = make_float3(in[0], in[1], in[2]);
+
+ /* Optionally add indirect light pass. */
+ if (kfilm_convert->pass_indirect != PASS_UNUSED) {
+ const float *in_indirect = buffer + kfilm_convert->pass_indirect;
+ const float3 f_indirect = make_float3(in_indirect[0], in_indirect[1], in_indirect[2]);
+ f += f_indirect;
+ }
+
+ /* Optionally divide out color. */
+ if (kfilm_convert->pass_divide != PASS_UNUSED) {
+ const float *in_divide = buffer + kfilm_convert->pass_divide;
+ const float3 f_divide = make_float3(in_divide[0], in_divide[1], in_divide[2]);
+ f = safe_divide_even_color(f, f_divide);
+
+ /* Exposure only, sample scale cancels out. */
+ f *= kfilm_convert->exposure;
+ }
+ else {
+ /* Sample scale and exposure. */
+ f *= film_get_scale_exposure(kfilm_convert, buffer);
+ }
+
+ pixel[0] = f.x;
+ pixel[1] = f.y;
+ pixel[2] = f.z;
+}
+
+ccl_device_inline void film_get_pass_pixel_float3(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components >= 3);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ const float scale_exposure = film_get_scale_exposure(kfilm_convert, buffer);
+
+ const float *in = buffer + kfilm_convert->pass_offset;
+
+ const float3 f = make_float3(in[0], in[1], in[2]) * scale_exposure;
+
+ pixel[0] = f.x;
+ pixel[1] = f.y;
+ pixel[2] = f.z;
+}
+
+/* --------------------------------------------------------------------
+ * Float4 passes.
+ */
+
+ccl_device_inline void film_get_pass_pixel_motion(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components == 4);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+ kernel_assert(kfilm_convert->pass_motion_weight != PASS_UNUSED);
+
+ const float *in = buffer + kfilm_convert->pass_offset;
+ const float *in_weight = buffer + kfilm_convert->pass_motion_weight;
+
+ const float weight = in_weight[0];
+ const float weight_inv = (weight > 0.0f) ? 1.0f / weight : 0.0f;
+
+ const float4 motion = make_float4(in[0], in[1], in[2], in[3]) * weight_inv;
+
+ pixel[0] = motion.x;
+ pixel[1] = motion.y;
+ pixel[2] = motion.z;
+ pixel[3] = motion.w;
+}
+
+ccl_device_inline void film_get_pass_pixel_cryptomatte(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components == 4);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ const float scale = film_get_scale(kfilm_convert, buffer);
+
+ const float *in = buffer + kfilm_convert->pass_offset;
+
+ const float4 f = make_float4(in[0], in[1], in[2], in[3]);
+
+ /* x and z contain integer IDs, don't rescale them.
+ * y and w contain matte weights, they get scaled. */
+ pixel[0] = f.x;
+ pixel[1] = f.y * scale;
+ pixel[2] = f.z;
+ pixel[3] = f.w * scale;
+}
+
+ccl_device_inline void film_get_pass_pixel_float4(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components == 4);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ float scale, scale_exposure;
+ film_get_scale_and_scale_exposure(kfilm_convert, buffer, &scale, &scale_exposure);
+
+ const float *in = buffer + kfilm_convert->pass_offset;
+
+ const float3 color = make_float3(in[0], in[1], in[2]) * scale_exposure;
+ const float alpha = in[3] * scale;
+
+ pixel[0] = color.x;
+ pixel[1] = color.y;
+ pixel[2] = color.z;
+ pixel[3] = alpha;
+}
+
+ccl_device_inline void film_get_pass_pixel_combined(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components == 4);
+
+ /* 3rd channel contains transparency = 1 - alpha for the combined pass. */
+
+ kernel_assert(kfilm_convert->num_components == 4);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ float scale, scale_exposure;
+ if (!film_get_scale_and_scale_exposure(kfilm_convert, buffer, &scale, &scale_exposure)) {
+ pixel[0] = 0.0f;
+ pixel[1] = 0.0f;
+ pixel[2] = 0.0f;
+ pixel[3] = 0.0f;
+ return;
}
- return pass_result;
+ const float *in = buffer + kfilm_convert->pass_offset;
+
+ const float3 color = make_float3(in[0], in[1], in[2]) * scale_exposure;
+ const float alpha = in[3] * scale;
+
+ pixel[0] = color.x;
+ pixel[1] = color.y;
+ pixel[2] = color.z;
+ pixel[3] = film_transparency_to_alpha(alpha);
}
-ccl_device float4 film_map(KernelGlobals *kg, float4 rgba_in, float scale)
+/* --------------------------------------------------------------------
+ * Shadow catcher.
+ */
+
+ccl_device_inline float3
+film_calculate_shadow_catcher_denoised(const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer)
{
- float4 result;
+ kernel_assert(kfilm_convert->pass_shadow_catcher != PASS_UNUSED);
- /* Conversion to SRGB. */
- result.x = color_linear_to_srgb(rgba_in.x * scale);
- result.y = color_linear_to_srgb(rgba_in.y * scale);
- result.z = color_linear_to_srgb(rgba_in.z * scale);
+ float scale, scale_exposure;
+ film_get_scale_and_scale_exposure(kfilm_convert, buffer, &scale, &scale_exposure);
- /* Clamp since alpha might be > 1.0 due to Russian roulette. */
- result.w = saturate(rgba_in.w * scale);
+ ccl_global const float *in_catcher = buffer + kfilm_convert->pass_shadow_catcher;
- return result;
+ const float3 pixel = make_float3(in_catcher[0], in_catcher[1], in_catcher[2]) * scale_exposure;
+
+ return pixel;
}
-ccl_device uchar4 film_float_to_byte(float4 color)
+ccl_device_inline float3 safe_divide_shadow_catcher(float3 a, float3 b)
{
- uchar4 result;
+ float x, y, z;
- /* simple float to byte conversion */
- result.x = (uchar)(saturate(color.x) * 255.0f);
- result.y = (uchar)(saturate(color.y) * 255.0f);
- result.z = (uchar)(saturate(color.z) * 255.0f);
- result.w = (uchar)(saturate(color.w) * 255.0f);
+ x = (b.x != 0.0f) ? a.x / b.x : 1.0f;
+ y = (b.y != 0.0f) ? a.y / b.y : 1.0f;
+ z = (b.z != 0.0f) ? a.z / b.z : 1.0f;
- return result;
+ return make_float3(x, y, z);
}
-ccl_device void kernel_film_convert_to_byte(KernelGlobals *kg,
- ccl_global uchar4 *rgba,
- ccl_global float *buffer,
- float sample_scale,
- int x,
- int y,
- int offset,
- int stride)
+ccl_device_inline float3
+film_calculate_shadow_catcher(const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer)
{
- /* buffer offset */
- int index = offset + x + y * stride;
+  /* For the shadow catcher pass we divide the combined pass by the shadow catcher.
+   * Note that the denoised shadow catcher pass contains a value which only needs to be scaled
+   * (not calculated as a division). */
- bool use_display_sample_scale = (kernel_data.film.display_divide_pass_stride == -1);
- float4 rgba_in = film_get_pass_result(kg, buffer, sample_scale, index, use_display_sample_scale);
+ if (kfilm_convert->is_denoised) {
+ return film_calculate_shadow_catcher_denoised(kfilm_convert, buffer);
+ }
- /* map colors */
- float4 float_result = film_map(kg, rgba_in, use_display_sample_scale ? sample_scale : 1.0f);
- uchar4 uchar_result = film_float_to_byte(float_result);
+ kernel_assert(kfilm_convert->pass_shadow_catcher_sample_count != PASS_UNUSED);
- rgba += index;
- *rgba = uchar_result;
+  /* If there is no shadow catcher object in this pixel, no modification of the light is
+   * needed, so return one. */
+ ccl_global const float *in_catcher_sample_count =
+ buffer + kfilm_convert->pass_shadow_catcher_sample_count;
+ const float num_samples = in_catcher_sample_count[0];
+ if (num_samples == 0.0f) {
+ return one_float3();
+ }
+
+ kernel_assert(kfilm_convert->pass_shadow_catcher != PASS_UNUSED);
+ ccl_global const float *in_catcher = buffer + kfilm_convert->pass_shadow_catcher;
+
+  /* NOTE: It is possible that the Shadow Catcher pass is requested as an output without actual
+   * shadow catcher objects in the scene. In this case there will be no auxiliary passes required
+   * for the division (to save memory). So delay the asserts to this point, so that the number of
+   * samples check handles such a configuration. */
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+ kernel_assert(kfilm_convert->pass_combined != PASS_UNUSED);
+ kernel_assert(kfilm_convert->pass_shadow_catcher_matte != PASS_UNUSED);
+
+ ccl_global const float *in_combined = buffer + kfilm_convert->pass_combined;
+ ccl_global const float *in_matte = buffer + kfilm_convert->pass_shadow_catcher_matte;
+
+  /* No scaling needed. The integration works in such a way that the number of samples in the
+   * combined and shadow catcher passes is the same, and exposure is canceled during the division. */
+ const float3 color_catcher = make_float3(in_catcher[0], in_catcher[1], in_catcher[2]);
+ const float3 color_combined = make_float3(in_combined[0], in_combined[1], in_combined[2]);
+ const float3 color_matte = make_float3(in_matte[0], in_matte[1], in_matte[2]);
+
+  /* Need to ignore the contribution of the matte object when doing the division (otherwise there
+   * will be artifacts caused by anti-aliasing). Since the combined pass is used for adaptive
+   * sampling and needs to contain matte objects, we subtract the matte objects' contribution here.
+   * This is the same as if the matte objects were not accumulated to the combined pass. */
+ const float3 combined_no_matte = color_combined - color_matte;
+
+ const float3 shadow_catcher = safe_divide_shadow_catcher(combined_no_matte, color_catcher);
+
+ const float scale = film_get_scale(kfilm_convert, buffer);
+ const float transparency = in_combined[3] * scale;
+ const float alpha = film_transparency_to_alpha(transparency);
+
+  /* Alpha-over on white using the transparency of the combined pass. This eliminates
+   * artifacts which happen on the edge of a shadow catcher when using a transparent film.
+   * Note that we treat the shadow catcher as straight alpha here because the alpha got
+   * canceled out during the division. */
+ const float3 pixel = (1.0f - alpha) * one_float3() + alpha * shadow_catcher;
+
+ return pixel;
}
-ccl_device void kernel_film_convert_to_half_float(KernelGlobals *kg,
- ccl_global uchar4 *rgba,
- ccl_global float *buffer,
- float sample_scale,
- int x,
- int y,
- int offset,
- int stride)
+ccl_device_inline float4 film_calculate_shadow_catcher_matte_with_shadow(
+ const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer)
{
- /* buffer offset */
- int index = offset + x + y * stride;
+  /* The approximation of the shadow is 1 - average(shadow_catcher_pass). A better approximation
+   * is possible.
+   *
+   * The matte is alpha-overed onto the shadow (which is effectively alpha-overing the shadow onto
+   * the footage, and then alpha-overing the synthetic objects on top). */
- bool use_display_sample_scale = (kernel_data.film.display_divide_pass_stride == -1);
- float4 rgba_in = film_get_pass_result(kg, buffer, sample_scale, index, use_display_sample_scale);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+ kernel_assert(kfilm_convert->pass_shadow_catcher != PASS_UNUSED);
+ kernel_assert(kfilm_convert->pass_shadow_catcher_matte != PASS_UNUSED);
+
+ float scale, scale_exposure;
+ if (!film_get_scale_and_scale_exposure(kfilm_convert, buffer, &scale, &scale_exposure)) {
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+
+ ccl_global const float *in_matte = buffer + kfilm_convert->pass_shadow_catcher_matte;
+
+ const float3 shadow_catcher = film_calculate_shadow_catcher(kfilm_convert, buffer);
+ const float3 color_matte = make_float3(in_matte[0], in_matte[1], in_matte[2]) * scale_exposure;
+
+ const float transparency = in_matte[3] * scale;
+ const float alpha = saturate(1.0f - transparency);
+
+ const float alpha_matte = (1.0f - alpha) * (1.0f - average(shadow_catcher)) + alpha;
+
+ if (kfilm_convert->use_approximate_shadow_catcher_background) {
+ kernel_assert(kfilm_convert->pass_background != PASS_UNUSED);
+
+ ccl_global const float *in_background = buffer + kfilm_convert->pass_background;
+ const float3 color_background = make_float3(
+ in_background[0], in_background[1], in_background[2]) *
+ scale_exposure;
+ const float3 alpha_over = color_matte + color_background * (1.0f - alpha_matte);
+ return make_float4(alpha_over.x, alpha_over.y, alpha_over.z, 1.0f);
+ }
- ccl_global half *out = (ccl_global half *)rgba + index * 4;
- float4_store_half(out, rgba_in, use_display_sample_scale ? sample_scale : 1.0f);
+ return make_float4(color_matte.x, color_matte.y, color_matte.z, alpha_matte);
+}
+
+ccl_device_inline void film_get_pass_pixel_shadow_catcher(
+ const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components >= 3);
+
+ const float3 pixel_value = film_calculate_shadow_catcher(kfilm_convert, buffer);
+
+ pixel[0] = pixel_value.x;
+ pixel[1] = pixel_value.y;
+ pixel[2] = pixel_value.z;
+}
+
+ccl_device_inline void film_get_pass_pixel_shadow_catcher_matte_with_shadow(
+ const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components == 3 || kfilm_convert->num_components == 4);
+
+ const float4 pixel_value = film_calculate_shadow_catcher_matte_with_shadow(kfilm_convert,
+ buffer);
+
+ pixel[0] = pixel_value.x;
+ pixel[1] = pixel_value.y;
+ pixel[2] = pixel_value.z;
+ if (kfilm_convert->num_components == 4) {
+ pixel[3] = pixel_value.w;
+ }
+}
+
+/* --------------------------------------------------------------------
+ * Compositing and overlays.
+ */
+
+ccl_device_inline void film_apply_pass_pixel_overlays_rgba(
+ const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ if (kfilm_convert->show_active_pixels &&
+ kfilm_convert->pass_adaptive_aux_buffer != PASS_UNUSED) {
+ if (buffer[kfilm_convert->pass_adaptive_aux_buffer + 3] == 0.0f) {
+ const float3 active_rgb = make_float3(1.0f, 0.0f, 0.0f);
+ const float3 mix_rgb = interp(make_float3(pixel[0], pixel[1], pixel[2]), active_rgb, 0.5f);
+ pixel[0] = mix_rgb.x;
+ pixel[1] = mix_rgb.y;
+ pixel[2] = mix_rgb.z;
+ }
+ }
}
CCL_NAMESPACE_END
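
For reference, the active-pixel overlay above is just a 50% mix toward red for pixels whose adaptive-sampling aux alpha is still zero; an illustrative one-liner equivalent (hypothetical helper, not kernel code):

/* Illustrative only: 50% mix toward red, as film_apply_pass_pixel_overlays_rgba does. */
static void highlight_active_pixel(float rgb[3])
{
  const float active_rgb[3] = {1.0f, 0.0f, 0.0f};
  for (int i = 0; i < 3; i++) {
    rgb[i] = 0.5f * rgb[i] + 0.5f * active_rgb[i];
  }
}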
diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h
deleted file mode 100644
index 70aed6d54ed..00000000000
--- a/intern/cycles/kernel/kernel_globals.h
+++ /dev/null
@@ -1,248 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Constant Globals */
-
-#ifndef __KERNEL_GLOBALS_H__
-#define __KERNEL_GLOBALS_H__
-
-#include "kernel/kernel_profiling.h"
-
-#ifdef __KERNEL_CPU__
-# include "util/util_map.h"
-# include "util/util_vector.h"
-#endif
-
-#ifdef __KERNEL_OPENCL__
-# include "util/util_atomic.h"
-#endif
-
-CCL_NAMESPACE_BEGIN
-
-/* On the CPU, we pass along the struct KernelGlobals to nearly everywhere in
- * the kernel, to access constant data. These are all stored as "textures", but
- * these are really just standard arrays. We can't use actually globals because
- * multiple renders may be running inside the same process. */
-
-#ifdef __KERNEL_CPU__
-
-# ifdef __OSL__
-struct OSLGlobals;
-struct OSLThreadData;
-struct OSLShadingSystem;
-# endif
-
-typedef unordered_map<float, float> CoverageMap;
-
-struct Intersection;
-struct VolumeStep;
-
-typedef struct KernelGlobals {
-# define KERNEL_TEX(type, name) texture<type> name;
-# include "kernel/kernel_textures.h"
-
- KernelData __data;
-
-# ifdef __OSL__
- /* On the CPU, we also have the OSL globals here. Most data structures are shared
- * with SVM, the difference is in the shaders and object/mesh attributes. */
- OSLGlobals *osl;
- OSLShadingSystem *osl_ss;
- OSLThreadData *osl_tdata;
-# endif
-
- /* **** Run-time data **** */
-
- /* Heap-allocated storage for transparent shadows intersections. */
- Intersection *transparent_shadow_intersections;
-
- /* Storage for decoupled volume steps. */
- VolumeStep *decoupled_volume_steps[2];
- int decoupled_volume_steps_index;
-
- /* A buffer for storing per-pixel coverage for Cryptomatte. */
- CoverageMap *coverage_object;
- CoverageMap *coverage_material;
- CoverageMap *coverage_asset;
-
- /* split kernel */
- SplitData split_data;
- SplitParams split_param_data;
-
- int2 global_size;
- int2 global_id;
-
- ProfilingState profiler;
-} KernelGlobals;
-
-#endif /* __KERNEL_CPU__ */
-
-#ifdef __KERNEL_OPTIX__
-
-typedef struct ShaderParams {
- uint4 *input;
- float4 *output;
- int type;
- int filter;
- int sx;
- int offset;
- int sample;
-} ShaderParams;
-
-typedef struct KernelParams {
- WorkTile tile;
- KernelData data;
- ShaderParams shader;
-# define KERNEL_TEX(type, name) const type *name;
-# include "kernel/kernel_textures.h"
-} KernelParams;
-
-typedef struct KernelGlobals {
-# ifdef __VOLUME__
- VolumeState volume_state;
-# endif
- Intersection hits_stack[64];
-} KernelGlobals;
-
-extern "C" __constant__ KernelParams __params;
-
-#else /* __KERNEL_OPTIX__ */
-
-/* For CUDA, constant memory textures must be globals, so we can't put them
- * into a struct. As a result we don't actually use this struct and use actual
- * globals and simply pass along a NULL pointer everywhere, which we hope gets
- * optimized out. */
-
-# ifdef __KERNEL_CUDA__
-
-__constant__ KernelData __data;
-typedef struct KernelGlobals {
- /* NOTE: Keep the size in sync with SHADOW_STACK_MAX_HITS. */
- Intersection hits_stack[64];
-} KernelGlobals;
-
-# define KERNEL_TEX(type, name) const __constant__ __device__ type *name;
-# include "kernel/kernel_textures.h"
-
-# endif /* __KERNEL_CUDA__ */
-
-#endif /* __KERNEL_OPTIX__ */
-
-/* OpenCL */
-
-#ifdef __KERNEL_OPENCL__
-
-# define KERNEL_TEX(type, name) typedef type name##_t;
-# include "kernel/kernel_textures.h"
-
-typedef ccl_addr_space struct KernelGlobals {
- ccl_constant KernelData *data;
- ccl_global char *buffers[8];
-
-# define KERNEL_TEX(type, name) TextureInfo name;
-# include "kernel/kernel_textures.h"
-
-# ifdef __SPLIT_KERNEL__
- SplitData split_data;
- SplitParams split_param_data;
-# endif
-} KernelGlobals;
-
-# define KERNEL_BUFFER_PARAMS \
- ccl_global char *buffer0, ccl_global char *buffer1, ccl_global char *buffer2, \
- ccl_global char *buffer3, ccl_global char *buffer4, ccl_global char *buffer5, \
- ccl_global char *buffer6, ccl_global char *buffer7
-
-# define KERNEL_BUFFER_ARGS buffer0, buffer1, buffer2, buffer3, buffer4, buffer5, buffer6, buffer7
-
-ccl_device_inline void kernel_set_buffer_pointers(KernelGlobals *kg, KERNEL_BUFFER_PARAMS)
-{
-# ifdef __SPLIT_KERNEL__
- if (ccl_local_id(0) + ccl_local_id(1) == 0)
-# endif
- {
- kg->buffers[0] = buffer0;
- kg->buffers[1] = buffer1;
- kg->buffers[2] = buffer2;
- kg->buffers[3] = buffer3;
- kg->buffers[4] = buffer4;
- kg->buffers[5] = buffer5;
- kg->buffers[6] = buffer6;
- kg->buffers[7] = buffer7;
- }
-
-# ifdef __SPLIT_KERNEL__
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-# endif
-}
-
-ccl_device_inline void kernel_set_buffer_info(KernelGlobals *kg)
-{
-# ifdef __SPLIT_KERNEL__
- if (ccl_local_id(0) + ccl_local_id(1) == 0)
-# endif
- {
- ccl_global TextureInfo *info = (ccl_global TextureInfo *)kg->buffers[0];
-
-# define KERNEL_TEX(type, name) kg->name = *(info++);
-# include "kernel/kernel_textures.h"
- }
-
-# ifdef __SPLIT_KERNEL__
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-# endif
-}
-
-#endif /* __KERNEL_OPENCL__ */
-
-/* Interpolated lookup table access */
-
-ccl_device float lookup_table_read(KernelGlobals *kg, float x, int offset, int size)
-{
- x = saturate(x) * (size - 1);
-
- int index = min(float_to_int(x), size - 1);
- int nindex = min(index + 1, size - 1);
- float t = x - index;
-
- float data0 = kernel_tex_fetch(__lookup_table, index + offset);
- if (t == 0.0f)
- return data0;
-
- float data1 = kernel_tex_fetch(__lookup_table, nindex + offset);
- return (1.0f - t) * data0 + t * data1;
-}
-
-ccl_device float lookup_table_read_2D(
- KernelGlobals *kg, float x, float y, int offset, int xsize, int ysize)
-{
- y = saturate(y) * (ysize - 1);
-
- int index = min(float_to_int(y), ysize - 1);
- int nindex = min(index + 1, ysize - 1);
- float t = y - index;
-
- float data0 = lookup_table_read(kg, x, offset + xsize * index, xsize);
- if (t == 0.0f)
- return data0;
-
- float data1 = lookup_table_read(kg, x, offset + xsize * nindex, xsize);
- return (1.0f - t) * data0 + t * data1;
-}
-
-CCL_NAMESPACE_END
-
-#endif /* __KERNEL_GLOBALS_H__ */
diff --git a/intern/cycles/kernel/kernel_id_passes.h b/intern/cycles/kernel/kernel_id_passes.h
index 1ca42e933d1..ed01f494f98 100644
--- a/intern/cycles/kernel/kernel_id_passes.h
+++ b/intern/cycles/kernel/kernel_id_passes.h
@@ -14,8 +14,18 @@
* limitations under the License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
+/* Element of ID pass stored in the render buffers.
+ * It is `float2` semantically, but it must be unaligned because the offset of ID passes in the
+ * render buffers might not satisfy the alignment expected by the compiler. */
+typedef struct IDPassBufferElement {
+ float x;
+ float y;
+} IDPassBufferElement;
+
ccl_device_inline void kernel_write_id_slots(ccl_global float *buffer,
int num_slots,
float id,
@@ -27,7 +37,7 @@ ccl_device_inline void kernel_write_id_slots(ccl_global float *buffer,
}
for (int slot = 0; slot < num_slots; slot++) {
- ccl_global float2 *id_buffer = (ccl_global float2 *)buffer;
+ ccl_global IDPassBufferElement *id_buffer = (ccl_global IDPassBufferElement *)buffer;
#ifdef __ATOMIC_PASS_WRITE__
/* If the loop reaches an empty slot, the ID isn't in any slot yet - so add it! */
if (id_buffer[slot].x == ID_NONE) {
@@ -65,7 +75,7 @@ ccl_device_inline void kernel_write_id_slots(ccl_global float *buffer,
ccl_device_inline void kernel_sort_id_slots(ccl_global float *buffer, int num_slots)
{
- ccl_global float2 *id_buffer = (ccl_global float2 *)buffer;
+ ccl_global IDPassBufferElement *id_buffer = (ccl_global IDPassBufferElement *)buffer;
for (int slot = 1; slot < num_slots; ++slot) {
if (id_buffer[slot].x == ID_NONE) {
return;
@@ -73,7 +83,7 @@ ccl_device_inline void kernel_sort_id_slots(ccl_global float *buffer, int num_sl
/* Since we're dealing with a tiny number of elements, insertion sort should be fine. */
int i = slot;
while (i > 0 && id_buffer[i].y > id_buffer[i - 1].y) {
- float2 swap = id_buffer[i];
+ const IDPassBufferElement swap = id_buffer[i];
id_buffer[i] = id_buffer[i - 1];
id_buffer[i - 1] = swap;
--i;
@@ -81,19 +91,16 @@ ccl_device_inline void kernel_sort_id_slots(ccl_global float *buffer, int num_sl
}
}
-#ifdef __KERNEL_GPU__
/* post-sorting for Cryptomatte */
-ccl_device void kernel_cryptomatte_post(
- KernelGlobals *kg, ccl_global float *buffer, uint sample, int x, int y, int offset, int stride)
+ccl_device_inline void kernel_cryptomatte_post(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int pixel_index)
{
- if (sample - 1 == kernel_data.integrator.aa_samples) {
- int index = offset + x + y * stride;
- int pass_stride = kernel_data.film.pass_stride;
- ccl_global float *cryptomatte_buffer = buffer + index * pass_stride +
- kernel_data.film.pass_cryptomatte;
- kernel_sort_id_slots(cryptomatte_buffer, 2 * kernel_data.film.cryptomatte_depth);
- }
+ const int pass_stride = kernel_data.film.pass_stride;
+ const uint64_t render_buffer_offset = (uint64_t)pixel_index * pass_stride;
+ ccl_global float *cryptomatte_buffer = render_buffer + render_buffer_offset +
+ kernel_data.film.pass_cryptomatte;
+ kernel_sort_id_slots(cryptomatte_buffer, 2 * kernel_data.film.cryptomatte_depth);
}
-#endif
CCL_NAMESPACE_END
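
For context, the Cryptomatte slots handled above are (id, weight) pairs that kernel_sort_id_slots keeps ordered by decreasing weight. A minimal host-side equivalent of that insertion sort, assuming a sentinel kIdNone marks empty slots (hypothetical names, mirroring IDPassBufferElement and ID_NONE):

/* Hypothetical host-side mirror of kernel_sort_id_slots. */
struct IdSlot {
  float id;
  float weight;
};

static const float kIdNone = -1.0f; /* assumed sentinel, stands in for ID_NONE */

static void sort_id_slots(IdSlot *slots, int num_slots)
{
  for (int slot = 1; slot < num_slots; ++slot) {
    if (slots[slot].id == kIdNone) {
      return; /* remaining slots are unused */
    }
    /* Insertion sort by decreasing weight; slot counts are tiny. */
    int i = slot;
    while (i > 0 && slots[i].weight > slots[i - 1].weight) {
      const IdSlot swap = slots[i];
      slots[i] = slots[i - 1];
      slots[i - 1] = swap;
      --i;
    }
  }
}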
diff --git a/intern/cycles/kernel/kernel_jitter.h b/intern/cycles/kernel/kernel_jitter.h
index f4e60a807f7..354e8115538 100644
--- a/intern/cycles/kernel/kernel_jitter.h
+++ b/intern/cycles/kernel/kernel_jitter.h
@@ -14,93 +14,27 @@
* limitations under the License.
*/
-/* TODO(sergey): Consider moving portable ctz/clz stuff to util. */
-
+#pragma once
CCL_NAMESPACE_BEGIN
-/* "Correlated Multi-Jittered Sampling"
- * Andrew Kensler, Pixar Technical Memo 13-01, 2013 */
-
-/* TODO: find good value, suggested 64 gives pattern on cornell box ceiling. */
-#define CMJ_RANDOM_OFFSET_LIMIT 4096
-
-ccl_device_inline bool cmj_is_pow2(int i)
+ccl_device_inline uint32_t laine_karras_permutation(uint32_t x, uint32_t seed)
{
- return (i > 1) && ((i & (i - 1)) == 0);
-}
+ x += seed;
+ x ^= (x * 0x6c50b47cu);
+ x ^= x * 0xb82f1e52u;
+ x ^= x * 0xc7afe638u;
+ x ^= x * 0x8d22f6e6u;
-ccl_device_inline int cmj_fast_mod_pow2(int a, int b)
-{
- return (a & (b - 1));
+ return x;
}
-/* b must be > 1 */
-ccl_device_inline int cmj_fast_div_pow2(int a, int b)
+ccl_device_inline uint32_t nested_uniform_scramble(uint32_t x, uint32_t seed)
{
- kernel_assert(b > 1);
- return a >> count_trailing_zeros(b);
-}
+ x = reverse_integer_bits(x);
+ x = laine_karras_permutation(x, seed);
+ x = reverse_integer_bits(x);
-ccl_device_inline uint cmj_w_mask(uint w)
-{
- kernel_assert(w > 1);
- return ((1 << (32 - count_leading_zeros(w))) - 1);
-}
-
-ccl_device_inline uint cmj_permute(uint i, uint l, uint p)
-{
- uint w = l - 1;
-
- if ((l & w) == 0) {
- /* l is a power of two (fast) */
- i ^= p;
- i *= 0xe170893d;
- i ^= p >> 16;
- i ^= (i & w) >> 4;
- i ^= p >> 8;
- i *= 0x0929eb3f;
- i ^= p >> 23;
- i ^= (i & w) >> 1;
- i *= 1 | p >> 27;
- i *= 0x6935fa69;
- i ^= (i & w) >> 11;
- i *= 0x74dcb303;
- i ^= (i & w) >> 2;
- i *= 0x9e501cc3;
- i ^= (i & w) >> 2;
- i *= 0xc860a3df;
- i &= w;
- i ^= i >> 5;
-
- return (i + p) & w;
- }
- else {
- /* l is not a power of two (slow) */
- w = cmj_w_mask(w);
-
- do {
- i ^= p;
- i *= 0xe170893d;
- i ^= p >> 16;
- i ^= (i & w) >> 4;
- i ^= p >> 8;
- i *= 0x0929eb3f;
- i ^= p >> 23;
- i ^= (i & w) >> 1;
- i *= 1 | p >> 27;
- i *= 0x6935fa69;
- i ^= (i & w) >> 11;
- i *= 0x74dcb303;
- i ^= (i & w) >> 2;
- i *= 0x9e501cc3;
- i ^= (i & w) >> 2;
- i *= 0xc860a3df;
- i &= w;
- i ^= i >> 5;
- } while (i >= l);
-
- return (i + p) % l;
- }
+ return x;
}
ccl_device_inline uint cmj_hash(uint i, uint p)
@@ -133,99 +67,101 @@ ccl_device_inline float cmj_randfloat(uint i, uint p)
return cmj_hash(i, p) * (1.0f / 4294967808.0f);
}
-#ifdef __CMJ__
-ccl_device float cmj_sample_1D(int s, int N, int p)
+ccl_device_inline float cmj_randfloat_simple(uint i, uint p)
{
- kernel_assert(s < N);
-
- uint x = cmj_permute(s, N, p * 0x68bc21eb);
- float jx = cmj_randfloat(s, p * 0x967a889b);
-
- float invN = 1.0f / N;
- return (x + jx) * invN;
+ return cmj_hash_simple(i, p) * (1.0f / (float)0xFFFFFFFF);
}
-/* TODO(sergey): Do some extra tests and consider moving to util_math.h. */
-ccl_device_inline int cmj_isqrt(int value)
+ccl_device float pmj_sample_1D(const KernelGlobals *kg, uint sample, uint rng_hash, uint dimension)
{
-# if defined(__KERNEL_CUDA__)
- return float_to_int(__fsqrt_ru(value));
-# elif defined(__KERNEL_GPU__)
- return float_to_int(sqrtf(value));
-# else
- /* This is a work around for fast-math on CPU which might replace sqrtf()
- * with am approximated version.
- */
- return float_to_int(sqrtf(value) + 1e-6f);
-# endif
-}
+  /* The PMJ sample sets contain NUM_PMJ_SAMPLES (x, y) sample pairs, so for 1D
+   * only the x part is used as the sample (TODO(@leesonw): add support for using the
+   * x and y parts independently). */
+
+ /* Perform Owen shuffle of the sample number to reorder the samples. */
+#ifdef _SIMPLE_HASH_
+ const uint rv = cmj_hash_simple(dimension, rng_hash);
+#else /* Use a _REGULAR_HASH_. */
+ const uint rv = cmj_hash(dimension, rng_hash);
+#endif
+#ifdef _XOR_SHUFFLE_
+# warning "Using XOR shuffle."
+ const uint s = sample ^ rv;
+#else /* Use _OWEN_SHUFFLE_ for reordering. */
+ const uint s = nested_uniform_scramble(sample, rv);
+#endif
-ccl_device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy)
-{
- kernel_assert(s < N);
+ /* Based on the sample number a sample pattern is selected and offset by the dimension. */
+ const uint sample_set = s / NUM_PMJ_SAMPLES;
+ const uint d = (dimension + sample_set);
+ const uint dim = d % NUM_PMJ_PATTERNS;
+ int index = 2 * (dim * NUM_PMJ_SAMPLES + (s % NUM_PMJ_SAMPLES));
+
+ float fx = kernel_tex_fetch(__sample_pattern_lut, index);
- int m = cmj_isqrt(N);
- int n = (N - 1) / m + 1;
- float invN = 1.0f / N;
- float invm = 1.0f / m;
- float invn = 1.0f / n;
+#ifndef _NO_CRANLEY_PATTERSON_ROTATION_
+ /* Use Cranley-Patterson rotation to displace the sample pattern. */
+# ifdef _SIMPLE_HASH_
+ float dx = cmj_randfloat_simple(d, rng_hash);
+# else
+ /* Only jitter within the grid interval. */
+ float dx = cmj_randfloat(d, rng_hash);
+# endif
+ fx = fx + dx * (1.0f / NUM_PMJ_SAMPLES);
+ fx = fx - floorf(fx);
- s = cmj_permute(s, N, p * 0x51633e2d);
+#else
+# warning "Not using Cranley-Patterson Rotation."
+#endif
- int sdivm, smodm;
+ return fx;
+}
- if (cmj_is_pow2(m)) {
- sdivm = cmj_fast_div_pow2(s, m);
- smodm = cmj_fast_mod_pow2(s, m);
- }
- else {
- /* Doing `s * inmv` gives precision issues here. */
- sdivm = s / m;
- smodm = s - sdivm * m;
- }
+ccl_device void pmj_sample_2D(
+ const KernelGlobals *kg, uint sample, uint rng_hash, uint dimension, float *x, float *y)
+{
+ /* Perform a shuffle on the sample number to reorder the samples. */
+#ifdef _SIMPLE_HASH_
+ const uint rv = cmj_hash_simple(dimension, rng_hash);
+#else /* Use a _REGULAR_HASH_. */
+ const uint rv = cmj_hash(dimension, rng_hash);
+#endif
+#ifdef _XOR_SHUFFLE_
+# warning "Using XOR shuffle."
+ const uint s = sample ^ rv;
+#else /* Use _OWEN_SHUFFLE_ for reordering. */
+ const uint s = nested_uniform_scramble(sample, rv);
+#endif
- uint sx = cmj_permute(smodm, m, p * 0x68bc21eb);
- uint sy = cmj_permute(sdivm, n, p * 0x02e5be93);
+ /* Based on the sample number a sample pattern is selected and offset by the dimension. */
+ const uint sample_set = s / NUM_PMJ_SAMPLES;
+ const uint d = (dimension + sample_set);
+ const uint dim = d % NUM_PMJ_PATTERNS;
+ int index = 2 * (dim * NUM_PMJ_SAMPLES + (s % NUM_PMJ_SAMPLES));
- float jx = cmj_randfloat(s, p * 0x967a889b);
- float jy = cmj_randfloat(s, p * 0x368cc8b7);
+ float fx = kernel_tex_fetch(__sample_pattern_lut, index);
+ float fy = kernel_tex_fetch(__sample_pattern_lut, index + 1);
- *fx = (sx + (sy + jx) * invn) * invm;
- *fy = (s + jy) * invN;
-}
+#ifndef _NO_CRANLEY_PATTERSON_ROTATION_
+ /* Use Cranley-Patterson rotation to displace the sample pattern. */
+# ifdef _SIMPLE_HASH_
+ float dx = cmj_randfloat_simple(d, rng_hash);
+ float dy = cmj_randfloat_simple(d + 1, rng_hash);
+# else
+ float dx = cmj_randfloat(d, rng_hash);
+ float dy = cmj_randfloat(d + 1, rng_hash);
+# endif
+ /* Only jitter within the grid cells. */
+ fx = fx + dx * (1.0f / NUM_PMJ_DIVISIONS);
+ fy = fy + dy * (1.0f / NUM_PMJ_DIVISIONS);
+ fx = fx - floorf(fx);
+ fy = fy - floorf(fy);
+#else
+#  warning "Not using Cranley-Patterson Rotation."
#endif
-ccl_device float pmj_sample_1D(KernelGlobals *kg, int sample, int rng_hash, int dimension)
-{
- /* Fallback to random */
- if (sample >= NUM_PMJ_SAMPLES) {
- const int p = rng_hash + dimension;
- return cmj_randfloat(sample, p);
- }
- else {
- const uint mask = cmj_hash_simple(dimension, rng_hash) & 0x007fffff;
- const int index = ((dimension % NUM_PMJ_PATTERNS) * NUM_PMJ_SAMPLES + sample) * 2;
- return __uint_as_float(kernel_tex_fetch(__sample_pattern_lut, index) ^ mask) - 1.0f;
- }
-}
-
-ccl_device float2 pmj_sample_2D(KernelGlobals *kg, int sample, int rng_hash, int dimension)
-{
- if (sample >= NUM_PMJ_SAMPLES) {
- const int p = rng_hash + dimension;
- const float fx = cmj_randfloat(sample, p);
- const float fy = cmj_randfloat(sample, p + 1);
- return make_float2(fx, fy);
- }
- else {
- const int index = ((dimension % NUM_PMJ_PATTERNS) * NUM_PMJ_SAMPLES + sample) * 2;
- const uint maskx = cmj_hash_simple(dimension, rng_hash) & 0x007fffff;
- const uint masky = cmj_hash_simple(dimension + 1, rng_hash) & 0x007fffff;
- const float fx = __uint_as_float(kernel_tex_fetch(__sample_pattern_lut, index) ^ maskx) - 1.0f;
- const float fy = __uint_as_float(kernel_tex_fetch(__sample_pattern_lut, index + 1) ^ masky) -
- 1.0f;
- return make_float2(fx, fy);
- }
+ (*x) = fx;
+ (*y) = fy;
}
CCL_NAMESPACE_END
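
Putting the pieces of pmj_sample_2D together, a host-side sketch of the table fetch plus Cranley-Patterson rotation might look as follows; the lut layout, counts, and jitter inputs are stand-ins for __sample_pattern_lut, the NUM_PMJ_* constants and cmj_randfloat (all names below are hypothetical):

/* Hypothetical host-side sketch of the 2D PMJ fetch plus Cranley-Patterson rotation. */
#include <cmath>
#include <cstdint>

static void pmj_fetch_2d(const float *lut, uint32_t num_samples, uint32_t num_patterns,
                         uint32_t shuffled_sample, uint32_t dimension,
                         float jitter_x, float jitter_y, /* random values in [0, 1) */
                         float grid_cell,                /* 1 / NUM_PMJ_DIVISIONS */
                         float *x, float *y)
{
  /* Select a pattern from the shuffled sample index, offset by the dimension. */
  const uint32_t d = dimension + shuffled_sample / num_samples;
  const uint32_t dim = d % num_patterns;
  const int index = 2 * (int)(dim * num_samples + (shuffled_sample % num_samples));

  /* Fetch the tabulated pair and jitter it within one grid cell, wrapping to [0, 1). */
  const float fx = lut[index] + jitter_x * grid_cell;
  const float fy = lut[index + 1] + jitter_y * grid_cell;
  *x = fx - std::floor(fx);
  *y = fy - std::floor(fy);
}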
diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h
index 42a834d2ce3..52f641634b9 100644
--- a/intern/cycles/kernel/kernel_light.h
+++ b/intern/cycles/kernel/kernel_light.h
@@ -14,7 +14,14 @@
* limitations under the License.
*/
+#pragma once
+
+#include "geom/geom.h"
+
#include "kernel_light_background.h"
+#include "kernel_montecarlo.h"
+#include "kernel_projection.h"
+#include "kernel_types.h"
CCL_NAMESPACE_BEGIN
@@ -37,10 +44,22 @@ typedef struct LightSample {
/* Regular Light */
-ccl_device_inline bool lamp_light_sample(
- KernelGlobals *kg, int lamp, float randu, float randv, float3 P, LightSample *ls)
+template<bool in_volume_segment>
+ccl_device_inline bool light_sample(const KernelGlobals *kg,
+ const int lamp,
+ const float randu,
+ const float randv,
+ const float3 P,
+ const int path_flag,
+ LightSample *ls)
{
const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, lamp);
+ if (path_flag & PATH_RAY_SHADOW_CATCHER_PASS) {
+ if (klight->shader_id & SHADER_EXCLUDE_SHADOW_CATCHER) {
+ return false;
+ }
+ }
+
LightType type = (LightType)klight->type;
ls->type = type;
ls->shader = klight->shader_id;
@@ -50,6 +69,18 @@ ccl_device_inline bool lamp_light_sample(
ls->u = randu;
ls->v = randv;
+ if (in_volume_segment && (type == LIGHT_DISTANT || type == LIGHT_BACKGROUND)) {
+ /* Distant lights in a volume get a dummy sample, position will not actually
+ * be used in that case. Only when sampling from a specific scatter position
+ * do we actually need to evaluate these. */
+ ls->P = zero_float3();
+ ls->Ng = zero_float3();
+ ls->D = zero_float3();
+ ls->pdf = true;
+ ls->t = FLT_MAX;
+ return true;
+ }
+
if (type == LIGHT_DISTANT) {
/* distant light */
float3 lightD = make_float3(klight->co[0], klight->co[1], klight->co[2]);
@@ -123,13 +154,15 @@ ccl_device_inline bool lamp_light_sample(
float invarea = fabsf(klight->area.invarea);
bool is_round = (klight->area.invarea < 0.0f);
- if (dot(ls->P - P, Ng) > 0.0f) {
- return false;
+ if (!in_volume_segment) {
+ if (dot(ls->P - P, Ng) > 0.0f) {
+ return false;
+ }
}
float3 inplane;
- if (is_round) {
+ if (is_round || in_volume_segment) {
inplane = ellipse_sample(axisu * 0.5f, axisv * 0.5f, randu, randv);
ls->P += inplane;
ls->pdf = invarea;
@@ -176,79 +209,180 @@ ccl_device_inline bool lamp_light_sample(
return (ls->pdf > 0.0f);
}
-ccl_device bool lamp_light_eval(
- KernelGlobals *kg, int lamp, float3 P, float3 D, float t, LightSample *ls)
+ccl_device bool lights_intersect(const KernelGlobals *ccl_restrict kg,
+ const Ray *ccl_restrict ray,
+ Intersection *ccl_restrict isect,
+ const int last_prim,
+ const int last_object,
+ const int last_type,
+ const int path_flag)
{
- const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, lamp);
- LightType type = (LightType)klight->type;
- ls->type = type;
- ls->shader = klight->shader_id;
- ls->object = PRIM_NONE;
- ls->prim = PRIM_NONE;
- ls->lamp = lamp;
- /* todo: missing texture coordinates */
- ls->u = 0.0f;
- ls->v = 0.0f;
+ for (int lamp = 0; lamp < kernel_data.integrator.num_all_lights; lamp++) {
+ const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, lamp);
- if (!(ls->shader & SHADER_USE_MIS))
- return false;
+ if (path_flag & PATH_RAY_CAMERA) {
+ if (klight->shader_id & SHADER_EXCLUDE_CAMERA) {
+ continue;
+ }
+ }
+ else {
+ if (!(klight->shader_id & SHADER_USE_MIS)) {
+ continue;
+ }
+ }
- if (type == LIGHT_DISTANT) {
- /* distant light */
- float radius = klight->distant.radius;
+ if (path_flag & PATH_RAY_SHADOW_CATCHER_PASS) {
+ if (klight->shader_id & SHADER_EXCLUDE_SHADOW_CATCHER) {
+ continue;
+ }
+ }
- if (radius == 0.0f)
- return false;
- if (t != FLT_MAX)
- return false;
+ LightType type = (LightType)klight->type;
+ float t = 0.0f, u = 0.0f, v = 0.0f;
- /* a distant light is infinitely far away, but equivalent to a disk
- * shaped light exactly 1 unit away from the current shading point.
- *
- * radius t^2/cos(theta)
- * <----------> t = sqrt(1^2 + tan(theta)^2)
- * tan(th) area = radius*radius*pi
- * <----->
- * \ | (1 + tan(theta)^2)/cos(theta)
- * \ | (1 + tan(acos(cos(theta)))^2)/cos(theta)
- * t \th| 1 simplifies to
- * \-| 1/(cos(theta)^3)
- * \| magic!
- * P
- */
+ if (type == LIGHT_POINT || type == LIGHT_SPOT) {
+ /* Sphere light. */
+ const float3 lightP = make_float3(klight->co[0], klight->co[1], klight->co[2]);
+ const float radius = klight->spot.radius;
+ if (radius == 0.0f) {
+ continue;
+ }
- float3 lightD = make_float3(klight->co[0], klight->co[1], klight->co[2]);
- float costheta = dot(-lightD, D);
- float cosangle = klight->distant.cosangle;
+ float3 P;
+ if (!ray_aligned_disk_intersect(ray->P, ray->D, ray->t, lightP, radius, &P, &t)) {
+ continue;
+ }
+ }
+ else if (type == LIGHT_AREA) {
+ /* Area light. */
+ const float invarea = fabsf(klight->area.invarea);
+ const bool is_round = (klight->area.invarea < 0.0f);
+ if (invarea == 0.0f) {
+ continue;
+ }
- if (costheta < cosangle)
- return false;
+ const float3 axisu = make_float3(
+ klight->area.axisu[0], klight->area.axisu[1], klight->area.axisu[2]);
+ const float3 axisv = make_float3(
+ klight->area.axisv[0], klight->area.axisv[1], klight->area.axisv[2]);
+ const float3 Ng = make_float3(klight->area.dir[0], klight->area.dir[1], klight->area.dir[2]);
- ls->P = -D;
- ls->Ng = -D;
- ls->D = D;
- ls->t = FLT_MAX;
+ /* One sided. */
+ if (dot(ray->D, Ng) >= 0.0f) {
+ continue;
+ }
- /* compute pdf */
- float invarea = klight->distant.invarea;
- ls->pdf = invarea / (costheta * costheta * costheta);
- ls->eval_fac = ls->pdf;
+ const float3 light_P = make_float3(klight->co[0], klight->co[1], klight->co[2]);
+
+ float3 P;
+ if (!ray_quad_intersect(
+ ray->P, ray->D, 0.0f, ray->t, light_P, axisu, axisv, Ng, &P, &t, &u, &v, is_round)) {
+ continue;
+ }
+ }
+ else {
+ continue;
+ }
+
+ if (t < isect->t &&
+ !(last_prim == lamp && last_object == OBJECT_NONE && last_type == PRIMITIVE_LAMP)) {
+ isect->t = t;
+ isect->u = u;
+ isect->v = v;
+ isect->type = PRIMITIVE_LAMP;
+ isect->prim = lamp;
+ isect->object = OBJECT_NONE;
+ }
+ }
+
+ return isect->prim != PRIM_NONE;
+}
+
+ccl_device bool light_sample_from_distant_ray(const KernelGlobals *ccl_restrict kg,
+ const float3 ray_D,
+ const int lamp,
+ LightSample *ccl_restrict ls)
+{
+ const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, lamp);
+ const int shader = klight->shader_id;
+ const float radius = klight->distant.radius;
+ const LightType type = (LightType)klight->type;
+
+ if (type != LIGHT_DISTANT) {
+ return false;
+ }
+ if (!(shader & SHADER_USE_MIS)) {
+ return false;
+ }
+ if (radius == 0.0f) {
+ return false;
}
- else if (type == LIGHT_POINT || type == LIGHT_SPOT) {
- float3 lightP = make_float3(klight->co[0], klight->co[1], klight->co[2]);
- float radius = klight->spot.radius;
+ /* a distant light is infinitely far away, but equivalent to a disk
+ * shaped light exactly 1 unit away from the current shading point.
+ *
+ * radius t^2/cos(theta)
+ * <----------> t = sqrt(1^2 + tan(theta)^2)
+ * tan(th) area = radius*radius*pi
+ * <----->
+ * \ | (1 + tan(theta)^2)/cos(theta)
+ * \ | (1 + tan(acos(cos(theta)))^2)/cos(theta)
+ * t \th| 1 simplifies to
+ * \-| 1/(cos(theta)^3)
+ * \| magic!
+ * P
+ */
+
+ float3 lightD = make_float3(klight->co[0], klight->co[1], klight->co[2]);
+ float costheta = dot(-lightD, ray_D);
+ float cosangle = klight->distant.cosangle;
+
+ if (costheta < cosangle)
+ return false;
- /* sphere light */
- if (radius == 0.0f)
- return false;
+ ls->type = type;
+ ls->shader = klight->shader_id;
+ ls->object = PRIM_NONE;
+ ls->prim = PRIM_NONE;
+ ls->lamp = lamp;
+ /* todo: missing texture coordinates */
+ ls->u = 0.0f;
+ ls->v = 0.0f;
+ ls->t = FLT_MAX;
+ ls->P = -ray_D;
+ ls->Ng = -ray_D;
+ ls->D = ray_D;
+
+ /* compute pdf */
+ float invarea = klight->distant.invarea;
+ ls->pdf = invarea / (costheta * costheta * costheta);
+ ls->pdf *= kernel_data.integrator.pdf_lights;
+ ls->eval_fac = ls->pdf;
- if (!ray_aligned_disk_intersect(P, D, t, lightP, radius, &ls->P, &ls->t)) {
- return false;
- }
+ return true;
+}
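
The ASCII diagram above explains why the distant-light PDF collapses to invarea / cos^3(theta): the equivalent disk sits one unit away, so its projected area scales by (1 + tan^2(theta)) / cos(theta) = 1 / cos^3(theta). As a trivial sketch of just that relation (illustrative name, not kernel code):

/* Illustrative only: solid-angle PDF of a distant light modeled as a disk of
 * area 1/invarea placed one unit from the shading point. */
static float distant_light_pdf_sketch(float invarea, float cos_theta)
{
  return invarea / (cos_theta * cos_theta * cos_theta);
}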
- ls->Ng = -D;
- ls->D = D;
+ccl_device bool light_sample_from_intersection(const KernelGlobals *ccl_restrict kg,
+ const Intersection *ccl_restrict isect,
+ const float3 ray_P,
+ const float3 ray_D,
+ LightSample *ccl_restrict ls)
+{
+ const int lamp = isect->prim;
+ const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, lamp);
+ LightType type = (LightType)klight->type;
+ ls->type = type;
+ ls->shader = klight->shader_id;
+ ls->object = PRIM_NONE;
+ ls->prim = PRIM_NONE;
+ ls->lamp = lamp;
+ /* todo: missing texture coordinates */
+ ls->t = isect->t;
+ ls->P = ray_P + ray_D * ls->t;
+ ls->D = ray_D;
+
+ if (type == LIGHT_POINT || type == LIGHT_SPOT) {
+ ls->Ng = -ray_D;
float invarea = klight->spot.invarea;
ls->eval_fac = (0.25f * M_1_PI_F) * invarea;
@@ -260,8 +394,9 @@ ccl_device bool lamp_light_eval(
ls->eval_fac *= spot_light_attenuation(
dir, klight->spot.spot_angle, klight->spot.spot_smooth, ls->Ng);
- if (ls->eval_fac == 0.0f)
+ if (ls->eval_fac == 0.0f) {
return false;
+ }
}
float2 uv = map_to_sphere(ls->Ng);
ls->u = uv.x;
@@ -274,31 +409,22 @@ ccl_device bool lamp_light_eval(
else if (type == LIGHT_AREA) {
/* area light */
float invarea = fabsf(klight->area.invarea);
- bool is_round = (klight->area.invarea < 0.0f);
- if (invarea == 0.0f)
- return false;
float3 axisu = make_float3(
klight->area.axisu[0], klight->area.axisu[1], klight->area.axisu[2]);
float3 axisv = make_float3(
klight->area.axisv[0], klight->area.axisv[1], klight->area.axisv[2]);
float3 Ng = make_float3(klight->area.dir[0], klight->area.dir[1], klight->area.dir[2]);
-
- /* one sided */
- if (dot(D, Ng) >= 0.0f)
- return false;
-
float3 light_P = make_float3(klight->co[0], klight->co[1], klight->co[2]);
- if (!ray_quad_intersect(
- P, D, 0.0f, t, light_P, axisu, axisv, Ng, &ls->P, &ls->t, &ls->u, &ls->v, is_round)) {
- return false;
- }
-
- ls->D = D;
+ ls->u = isect->u;
+ ls->v = isect->v;
+ ls->D = ray_D;
ls->Ng = Ng;
+
+ const bool is_round = (klight->area.invarea < 0.0f);
if (is_round) {
- ls->pdf = invarea * lamp_light_pdf(kg, Ng, -D, ls->t);
+ ls->pdf = invarea * lamp_light_pdf(kg, Ng, -ray_D, ls->t);
}
else {
float3 sample_axisu = axisu;
@@ -306,12 +432,12 @@ ccl_device bool lamp_light_eval(
if (klight->area.tan_spread > 0.0f) {
if (!light_spread_clamp_area_light(
- P, Ng, &light_P, &sample_axisu, &sample_axisv, klight->area.tan_spread)) {
+ ray_P, Ng, &light_P, &sample_axisu, &sample_axisv, klight->area.tan_spread)) {
return false;
}
}
- ls->pdf = rect_light_sample(P, &light_P, sample_axisu, sample_axisv, 0, 0, false);
+ ls->pdf = rect_light_sample(ray_P, &light_P, sample_axisu, sample_axisv, 0, 0, false);
}
ls->eval_fac = 0.25f * invarea;
@@ -325,6 +451,7 @@ ccl_device bool lamp_light_eval(
}
}
else {
+ kernel_assert(!"Invalid lamp type in light_sample_from_intersection");
return false;
}
@@ -337,7 +464,7 @@ ccl_device bool lamp_light_eval(
 /* returns true if the triangle has motion blur or an instancing transform applied */
ccl_device_inline bool triangle_world_space_vertices(
- KernelGlobals *kg, int object, int prim, float time, float3 V[3])
+ const KernelGlobals *kg, int object, int prim, float time, float3 V[3])
{
bool has_motion = false;
const int object_flag = kernel_tex_fetch(__object_flag, object);
@@ -365,7 +492,7 @@ ccl_device_inline bool triangle_world_space_vertices(
return has_motion;
}
-ccl_device_inline float triangle_light_pdf_area(KernelGlobals *kg,
+ccl_device_inline float triangle_light_pdf_area(const KernelGlobals *kg,
const float3 Ng,
const float3 I,
float t)
@@ -379,7 +506,9 @@ ccl_device_inline float triangle_light_pdf_area(KernelGlobals *kg,
return t * t * pdf / cos_pi;
}
-ccl_device_forceinline float triangle_light_pdf(KernelGlobals *kg, ShaderData *sd, float t)
+ccl_device_forceinline float triangle_light_pdf(const KernelGlobals *kg,
+ const ShaderData *sd,
+ float t)
{
/* A naive heuristic to decide between costly solid angle sampling
* and simple area sampling, comparing the distance to the triangle plane
@@ -448,7 +577,8 @@ ccl_device_forceinline float triangle_light_pdf(KernelGlobals *kg, ShaderData *s
}
}
-ccl_device_forceinline void triangle_light_sample(KernelGlobals *kg,
+template<bool in_volume_segment>
+ccl_device_forceinline void triangle_light_sample(const KernelGlobals *kg,
int prim,
int object,
float randu,
@@ -488,7 +618,7 @@ ccl_device_forceinline void triangle_light_sample(KernelGlobals *kg,
float distance_to_plane = fabsf(dot(N0, V[0] - P) / dot(N0, N0));
- if (longest_edge_squared > distance_to_plane * distance_to_plane) {
+ if (!in_volume_segment && (longest_edge_squared > distance_to_plane * distance_to_plane)) {
/* see James Arvo, "Stratified Sampling of Spherical Triangles"
* http://www.graphics.cornell.edu/pubs/1995/Arv95c.pdf */
@@ -617,7 +747,7 @@ ccl_device_forceinline void triangle_light_sample(KernelGlobals *kg,
/* Light Distribution */
-ccl_device int light_distribution_sample(KernelGlobals *kg, float *randu)
+ccl_device int light_distribution_sample(const KernelGlobals *kg, float *randu)
{
/* This is basically std::upper_bound as used by PBRT, to find a point light or
* triangle to emit from, proportional to area. a good improvement would be to
@@ -655,51 +785,93 @@ ccl_device int light_distribution_sample(KernelGlobals *kg, float *randu)
/* Generic Light */
-ccl_device_inline bool light_select_reached_max_bounces(KernelGlobals *kg, int index, int bounce)
+ccl_device_inline bool light_select_reached_max_bounces(const KernelGlobals *kg,
+ int index,
+ int bounce)
{
return (bounce > kernel_tex_fetch(__lights, index).max_bounces);
}
-ccl_device_noinline bool light_sample(KernelGlobals *kg,
- int lamp,
- float randu,
- float randv,
- float time,
- float3 P,
- int bounce,
- LightSample *ls)
+template<bool in_volume_segment>
+ccl_device_noinline bool light_distribution_sample(const KernelGlobals *kg,
+ float randu,
+ const float randv,
+ const float time,
+ const float3 P,
+ const int bounce,
+ const int path_flag,
+ LightSample *ls)
{
- if (lamp < 0) {
- /* sample index */
- int index = light_distribution_sample(kg, &randu);
-
- /* fetch light data */
- const ccl_global KernelLightDistribution *kdistribution = &kernel_tex_fetch(
- __light_distribution, index);
- int prim = kdistribution->prim;
-
- if (prim >= 0) {
- int object = kdistribution->mesh_light.object_id;
- int shader_flag = kdistribution->mesh_light.shader_flag;
-
- triangle_light_sample(kg, prim, object, randu, randv, time, ls, P);
- ls->shader |= shader_flag;
- return (ls->pdf > 0.0f);
+ /* Sample light index from distribution. */
+ const int index = light_distribution_sample(kg, &randu);
+ const ccl_global KernelLightDistribution *kdistribution = &kernel_tex_fetch(__light_distribution,
+ index);
+ const int prim = kdistribution->prim;
+
+ if (prim >= 0) {
+ /* Mesh light. */
+ const int object = kdistribution->mesh_light.object_id;
+
+ /* Exclude synthetic meshes from shadow catcher pass. */
+ if ((path_flag & PATH_RAY_SHADOW_CATCHER_PASS) &&
+ !(kernel_tex_fetch(__object_flag, object) & SD_OBJECT_SHADOW_CATCHER)) {
+ return false;
}
- lamp = -prim - 1;
+ const int shader_flag = kdistribution->mesh_light.shader_flag;
+ triangle_light_sample<in_volume_segment>(kg, prim, object, randu, randv, time, ls, P);
+ ls->shader |= shader_flag;
+ return (ls->pdf > 0.0f);
}
+ const int lamp = -prim - 1;
+
if (UNLIKELY(light_select_reached_max_bounces(kg, lamp, bounce))) {
return false;
}
- return lamp_light_sample(kg, lamp, randu, randv, P, ls);
+ return light_sample<in_volume_segment>(kg, lamp, randu, randv, P, path_flag, ls);
+}
+
+ccl_device_inline bool light_distribution_sample_from_volume_segment(const KernelGlobals *kg,
+ float randu,
+ const float randv,
+ const float time,
+ const float3 P,
+ const int bounce,
+ const int path_flag,
+ LightSample *ls)
+{
+ return light_distribution_sample<true>(kg, randu, randv, time, P, bounce, path_flag, ls);
+}
+
+ccl_device_inline bool light_distribution_sample_from_position(const KernelGlobals *kg,
+ float randu,
+ const float randv,
+ const float time,
+ const float3 P,
+ const int bounce,
+ const int path_flag,
+ LightSample *ls)
+{
+ return light_distribution_sample<false>(kg, randu, randv, time, P, bounce, path_flag, ls);
}
-ccl_device_inline int light_select_num_samples(KernelGlobals *kg, int index)
+ccl_device_inline bool light_distribution_sample_new_position(const KernelGlobals *kg,
+ const float randu,
+ const float randv,
+ const float time,
+ const float3 P,
+ LightSample *ls)
{
- return kernel_tex_fetch(__lights, index).samples;
+ /* Sample a new position on the same light, for volume sampling. */
+ if (ls->type == LIGHT_TRIANGLE) {
+ triangle_light_sample<false>(kg, ls->prim, ls->object, randu, randv, time, ls, P);
+ return (ls->pdf > 0.0f);
+ }
+ else {
+ return light_sample<false>(kg, ls->lamp, randu, randv, P, 0, ls);
+ }
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_light_background.h b/intern/cycles/kernel/kernel_light_background.h
index f0f64ce8704..493ed560bc6 100644
--- a/intern/cycles/kernel/kernel_light_background.h
+++ b/intern/cycles/kernel/kernel_light_background.h
@@ -14,6 +14,8 @@
* limitations under the License.
*/
+#pragma once
+
#include "kernel_light_common.h"
CCL_NAMESPACE_BEGIN
@@ -22,7 +24,10 @@ CCL_NAMESPACE_BEGIN
#ifdef __BACKGROUND_MIS__
-ccl_device float3 background_map_sample(KernelGlobals *kg, float randu, float randv, float *pdf)
+ccl_device float3 background_map_sample(const KernelGlobals *kg,
+ float randu,
+ float randv,
+ float *pdf)
{
/* for the following, the CDF values are actually a pair of floats, with the
* function value as X and the actual CDF as Y. The last entry's function
@@ -104,7 +109,7 @@ ccl_device float3 background_map_sample(KernelGlobals *kg, float randu, float ra
/* TODO(sergey): Same as above, after the release we should consider using
* 'noinline' for all devices.
*/
-ccl_device float background_map_pdf(KernelGlobals *kg, float3 direction)
+ccl_device float background_map_pdf(const KernelGlobals *kg, float3 direction)
{
float2 uv = direction_to_equirectangular(direction);
int res_x = kernel_data.background.map_res_x;
@@ -138,7 +143,7 @@ ccl_device float background_map_pdf(KernelGlobals *kg, float3 direction)
}
ccl_device_inline bool background_portal_data_fetch_and_check_side(
- KernelGlobals *kg, float3 P, int index, float3 *lightpos, float3 *dir)
+ const KernelGlobals *kg, float3 P, int index, float3 *lightpos, float3 *dir)
{
int portal = kernel_data.background.portal_offset + index;
const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, portal);
@@ -154,7 +159,7 @@ ccl_device_inline bool background_portal_data_fetch_and_check_side(
}
ccl_device_inline float background_portal_pdf(
- KernelGlobals *kg, float3 P, float3 direction, int ignore_portal, bool *is_possible)
+ const KernelGlobals *kg, float3 P, float3 direction, int ignore_portal, bool *is_possible)
{
float portal_pdf = 0.0f;
@@ -214,7 +219,7 @@ ccl_device_inline float background_portal_pdf(
return (num_possible > 0) ? portal_pdf / num_possible : 0.0f;
}
-ccl_device int background_num_possible_portals(KernelGlobals *kg, float3 P)
+ccl_device int background_num_possible_portals(const KernelGlobals *kg, float3 P)
{
int num_possible_portals = 0;
for (int p = 0; p < kernel_data.background.num_portals; p++) {
@@ -225,7 +230,7 @@ ccl_device int background_num_possible_portals(KernelGlobals *kg, float3 P)
return num_possible_portals;
}
-ccl_device float3 background_portal_sample(KernelGlobals *kg,
+ccl_device float3 background_portal_sample(const KernelGlobals *kg,
float3 P,
float randu,
float randv,
@@ -280,7 +285,7 @@ ccl_device float3 background_portal_sample(KernelGlobals *kg,
return zero_float3();
}
-ccl_device_inline float3 background_sun_sample(KernelGlobals *kg,
+ccl_device_inline float3 background_sun_sample(const KernelGlobals *kg,
float randu,
float randv,
float *pdf)
@@ -292,7 +297,7 @@ ccl_device_inline float3 background_sun_sample(KernelGlobals *kg,
return D;
}
-ccl_device_inline float background_sun_pdf(KernelGlobals *kg, float3 D)
+ccl_device_inline float background_sun_pdf(const KernelGlobals *kg, float3 D)
{
const float3 N = float4_to_float3(kernel_data.background.sun);
const float angle = kernel_data.background.sun.w;
@@ -300,7 +305,7 @@ ccl_device_inline float background_sun_pdf(KernelGlobals *kg, float3 D)
}
ccl_device_inline float3
-background_light_sample(KernelGlobals *kg, float3 P, float randu, float randv, float *pdf)
+background_light_sample(const KernelGlobals *kg, float3 P, float randu, float randv, float *pdf)
{
float portal_method_pdf = kernel_data.background.portal_weight;
float sun_method_pdf = kernel_data.background.sun_weight;
@@ -400,7 +405,7 @@ background_light_sample(KernelGlobals *kg, float3 P, float randu, float randv, f
return D;
}
-ccl_device float background_light_pdf(KernelGlobals *kg, float3 P, float3 direction)
+ccl_device float background_light_pdf(const KernelGlobals *kg, float3 P, float3 direction)
{
float portal_method_pdf = kernel_data.background.portal_weight;
float sun_method_pdf = kernel_data.background.sun_weight;
diff --git a/intern/cycles/kernel/kernel_light_common.h b/intern/cycles/kernel/kernel_light_common.h
index 4a683d36226..765d8f5338e 100644
--- a/intern/cycles/kernel/kernel_light_common.h
+++ b/intern/cycles/kernel/kernel_light_common.h
@@ -14,6 +14,10 @@
* limitations under the License.
*/
+#pragma once
+
+#include "kernel_montecarlo.h"
+
CCL_NAMESPACE_BEGIN
/* Area light sampling */
@@ -210,7 +214,7 @@ ccl_device bool light_spread_clamp_area_light(const float3 P,
return true;
}
-ccl_device float lamp_light_pdf(KernelGlobals *kg, const float3 Ng, const float3 I, float t)
+ccl_device float lamp_light_pdf(const KernelGlobals *kg, const float3 Ng, const float3 I, float t)
{
float cos_pi = dot(Ng, I);
diff --git a/intern/cycles/kernel/kernel_lookup_table.h b/intern/cycles/kernel/kernel_lookup_table.h
new file mode 100644
index 00000000000..33d9d5ae1f0
--- /dev/null
+++ b/intern/cycles/kernel/kernel_lookup_table.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Interpolated lookup table access */
+
+ccl_device float lookup_table_read(const KernelGlobals *kg, float x, int offset, int size)
+{
+ x = saturate(x) * (size - 1);
+
+ int index = min(float_to_int(x), size - 1);
+ int nindex = min(index + 1, size - 1);
+ float t = x - index;
+
+ float data0 = kernel_tex_fetch(__lookup_table, index + offset);
+ if (t == 0.0f)
+ return data0;
+
+ float data1 = kernel_tex_fetch(__lookup_table, nindex + offset);
+ return (1.0f - t) * data0 + t * data1;
+}
+
+ccl_device float lookup_table_read_2D(
+ const KernelGlobals *kg, float x, float y, int offset, int xsize, int ysize)
+{
+ y = saturate(y) * (ysize - 1);
+
+ int index = min(float_to_int(y), ysize - 1);
+ int nindex = min(index + 1, ysize - 1);
+ float t = y - index;
+
+ float data0 = lookup_table_read(kg, x, offset + xsize * index, xsize);
+ if (t == 0.0f)
+ return data0;
+
+ float data1 = lookup_table_read(kg, x, offset + xsize * nindex, xsize);
+ return (1.0f - t) * data0 + t * data1;
+}
+
+CCL_NAMESPACE_END
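
A host-side mirror of the interpolated table access above can be handy for sanity-checking offsets when packing lookup tables; here a flat float array stands in for __lookup_table (sketch only, hypothetical names):

/* Host-side mirror of lookup_table_read: linear interpolation over `size` entries
 * starting at `offset` in a flat float array. */
#include <algorithm>

static float lut_read(const float *table, float x, int offset, int size)
{
  x = std::min(std::max(x, 0.0f), 1.0f) * (size - 1);

  const int index = std::min((int)x, size - 1);
  const int nindex = std::min(index + 1, size - 1);
  const float t = x - index;

  const float data0 = table[offset + index];
  if (t == 0.0f) {
    return data0;
  }
  return (1.0f - t) * data0 + t * table[offset + nindex];
}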
diff --git a/intern/cycles/kernel/kernel_math.h b/intern/cycles/kernel/kernel_math.h
index 96391db7649..3c5ab95bbc8 100644
--- a/intern/cycles/kernel/kernel_math.h
+++ b/intern/cycles/kernel/kernel_math.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __KERNEL_MATH_H__
-#define __KERNEL_MATH_H__
+#pragma once
#include "util/util_color.h"
#include "util/util_math.h"
@@ -24,5 +23,3 @@
#include "util/util_projection.h"
#include "util/util_texture.h"
#include "util/util_transform.h"
-
-#endif /* __KERNEL_MATH_H__ */
diff --git a/intern/cycles/kernel/kernel_montecarlo.h b/intern/cycles/kernel/kernel_montecarlo.h
index ce37bd0b15e..b158f4c4fd3 100644
--- a/intern/cycles/kernel/kernel_montecarlo.h
+++ b/intern/cycles/kernel/kernel_montecarlo.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __KERNEL_MONTECARLO_CL__
-#define __KERNEL_MONTECARLO_CL__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -300,5 +299,3 @@ ccl_device float3 ensure_valid_reflection(float3 Ng, float3 I, float3 N)
}
CCL_NAMESPACE_END
-
-#endif /* __KERNEL_MONTECARLO_CL__ */
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index 8f58b8c3079..67466b28170 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -14,61 +14,52 @@
* limitations under the License.
*/
+#pragma once
+
+#include "kernel/geom/geom.h"
+
#include "kernel/kernel_id_passes.h"
+#include "kernel/kernel_write_passes.h"
CCL_NAMESPACE_BEGIN
-#ifdef __DENOISING_FEATURES__
-
-ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg,
- ccl_global float *buffer,
- int sample,
- float path_total,
- float path_total_shaded)
+/* Get pointer to pixel in render buffer. */
+ccl_device_forceinline ccl_global float *kernel_pass_pixel_render_buffer(
+ INTEGRATOR_STATE_CONST_ARGS, ccl_global float *ccl_restrict render_buffer)
{
- if (kernel_data.film.pass_denoising_data == 0)
- return;
-
- buffer += sample_is_even(kernel_data.integrator.sampling_pattern, sample) ?
- DENOISING_PASS_SHADOW_B :
- DENOISING_PASS_SHADOW_A;
-
- path_total = ensure_finite(path_total);
- path_total_shaded = ensure_finite(path_total_shaded);
-
- kernel_write_pass_float(buffer, path_total);
- kernel_write_pass_float(buffer + 1, path_total_shaded);
-
- float value = path_total_shaded / max(path_total, 1e-7f);
- kernel_write_pass_float(buffer + 2, value * value);
+ const uint32_t render_pixel_index = INTEGRATOR_STATE(path, render_pixel_index);
+ const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
+ kernel_data.film.pass_stride;
+ return render_buffer + render_buffer_offset;
}
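
The pointer arithmetic above is plain interleaved-buffer indexing: every pixel owns pass_stride floats and a given pass lives at a fixed offset inside that block. A minimal sketch (hypothetical names, not the kernel API):

/* Illustrative: locate one pass of one pixel in an interleaved render buffer. */
#include <cstdint>

static float *pixel_pass_ptr(float *render_buffer, uint32_t render_pixel_index,
                             uint64_t pass_stride, uint64_t pass_offset)
{
  const uint64_t buffer_offset = (uint64_t)render_pixel_index * pass_stride;
  return render_buffer + buffer_offset + pass_offset;
}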
-ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- PathRadiance *L)
+#ifdef __DENOISING_FEATURES__
+
+ccl_device_forceinline void kernel_write_denoising_features_surface(
+ INTEGRATOR_STATE_ARGS, const ShaderData *sd, ccl_global float *ccl_restrict render_buffer)
{
- if (state->denoising_feature_weight == 0.0f) {
+ if (!(INTEGRATOR_STATE(path, flag) & PATH_RAY_DENOISING_FEATURES)) {
return;
}
- L->denoising_depth += ensure_finite(state->denoising_feature_weight * sd->ray_length);
-
/* Skip implicitly transparent surfaces. */
if (sd->flag & SD_HAS_ONLY_VOLUME) {
return;
}
+ ccl_global float *buffer = kernel_pass_pixel_render_buffer(INTEGRATOR_STATE_PASS, render_buffer);
+
float3 normal = zero_float3();
float3 diffuse_albedo = zero_float3();
float3 specular_albedo = zero_float3();
float sum_weight = 0.0f, sum_nonspecular_weight = 0.0f;
for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ const ShaderClosure *sc = &sd->closure[i];
- if (!CLOSURE_IS_BSDF_OR_BSSRDF(sc->type))
+ if (!CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
continue;
+ }
/* All closures contribute to the normal feature, but only diffuse-like ones to the albedo. */
normal += sc->N * sc->sample_weight;
@@ -106,140 +97,208 @@ ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg,
normal /= sum_weight;
}
- /* Transform normal into camera space. */
- const Transform worldtocamera = kernel_data.cam.worldtocamera;
- normal = transform_direction(&worldtocamera, normal);
+ if (kernel_data.film.pass_denoising_normal != PASS_UNUSED) {
+ /* Transform normal into camera space. */
+ const Transform worldtocamera = kernel_data.cam.worldtocamera;
+ normal = transform_direction(&worldtocamera, normal);
+
+ const float3 denoising_normal = ensure_finite3(normal);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_normal, denoising_normal);
+ }
- L->denoising_normal += ensure_finite3(state->denoising_feature_weight * normal);
- L->denoising_albedo += ensure_finite3(state->denoising_feature_weight *
- state->denoising_feature_throughput * diffuse_albedo);
+ if (kernel_data.film.pass_denoising_albedo != PASS_UNUSED) {
+ const float3 denoising_feature_throughput = INTEGRATOR_STATE(path,
+ denoising_feature_throughput);
+ const float3 denoising_albedo = ensure_finite3(denoising_feature_throughput *
+ diffuse_albedo);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_albedo, denoising_albedo);
+ }
- state->denoising_feature_weight = 0.0f;
+ INTEGRATOR_STATE_WRITE(path, flag) &= ~PATH_RAY_DENOISING_FEATURES;
}
else {
- state->denoising_feature_throughput *= specular_albedo;
+ INTEGRATOR_STATE_WRITE(path, denoising_feature_throughput) *= specular_albedo;
+ }
+}
+
+ccl_device_forceinline void kernel_write_denoising_features_volume(INTEGRATOR_STATE_ARGS,
+ const float3 albedo,
+ const bool scatter,
+ ccl_global float *ccl_restrict
+ render_buffer)
+{
+ ccl_global float *buffer = kernel_pass_pixel_render_buffer(INTEGRATOR_STATE_PASS, render_buffer);
+ const float3 denoising_feature_throughput = INTEGRATOR_STATE(path, denoising_feature_throughput);
+
+ if (scatter && kernel_data.film.pass_denoising_normal != PASS_UNUSED) {
+ /* Assume scatter is sufficiently diffuse to stop writing denoising features. */
+ INTEGRATOR_STATE_WRITE(path, flag) &= ~PATH_RAY_DENOISING_FEATURES;
+
+ /* Write view direction as normal. */
+ const float3 denoising_normal = make_float3(0.0f, 0.0f, -1.0f);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_normal, denoising_normal);
+ }
+
+ if (kernel_data.film.pass_denoising_albedo != PASS_UNUSED) {
+ /* Write albedo. */
+ const float3 denoising_albedo = ensure_finite3(denoising_feature_throughput * albedo);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_albedo, denoising_albedo);
}
}
#endif /* __DENOISING_FEATURES__ */
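
Conceptually, the surface variant above averages closure normals by sample weight and only lets diffuse-like closures contribute to the denoising albedo. A toy version over a plain closure list (hypothetical types, no SVM/OSL specifics; the boolean stands in for the closure-type/roughness test):

/* Toy sketch of the denoising feature accumulation; `normal` and `diffuse_albedo`
 * must be zero-initialized by the caller. */
struct ToyClosure {
  float N[3];
  float albedo[3];
  float sample_weight;
  bool diffuse_like; /* stands in for CLOSURE_IS_* / roughness checks */
};

static void accumulate_denoising_features(const ToyClosure *closures, int num,
                                          float normal[3], float diffuse_albedo[3])
{
  float sum_weight = 0.0f;
  for (int i = 0; i < num; i++) {
    const ToyClosure &sc = closures[i];
    for (int k = 0; k < 3; k++) {
      normal[k] += sc.N[k] * sc.sample_weight; /* all closures shape the normal */
    }
    if (sc.diffuse_like) {
      for (int k = 0; k < 3; k++) {
        diffuse_albedo[k] += sc.albedo[k]; /* only diffuse-like ones shape the albedo */
      }
    }
    sum_weight += sc.sample_weight;
  }
  if (sum_weight > 0.0f) {
    for (int k = 0; k < 3; k++) {
      normal[k] /= sum_weight;
    }
  }
}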
-#ifdef __KERNEL_CPU__
-# define WRITE_ID_SLOT(buffer, depth, id, matte_weight, name) \
- kernel_write_id_pass_cpu(buffer, depth * 2, id, matte_weight, kg->coverage_##name)
-ccl_device_inline size_t kernel_write_id_pass_cpu(
- float *buffer, size_t depth, float id, float matte_weight, CoverageMap *map)
+#ifdef __SHADOW_CATCHER__
+
+/* Write shadow catcher passes on a bounce from the shadow catcher object. */
+ccl_device_forceinline void kernel_write_shadow_catcher_bounce_data(
+ INTEGRATOR_STATE_ARGS, const ShaderData *sd, ccl_global float *ccl_restrict render_buffer)
{
- if (map) {
- (*map)[id] += matte_weight;
- return 0;
+ if (!kernel_data.integrator.has_shadow_catcher) {
+ return;
+ }
+
+ kernel_assert(kernel_data.film.pass_shadow_catcher_sample_count != PASS_UNUSED);
+ kernel_assert(kernel_data.film.pass_shadow_catcher_matte != PASS_UNUSED);
+
+ if (!kernel_shadow_catcher_is_path_split_bounce(INTEGRATOR_STATE_PASS, sd->object_flag)) {
+ return;
}
-#else /* __KERNEL_CPU__ */
-# define WRITE_ID_SLOT(buffer, depth, id, matte_weight, name) \
- kernel_write_id_slots_gpu(buffer, depth * 2, id, matte_weight)
-ccl_device_inline size_t kernel_write_id_slots_gpu(ccl_global float *buffer,
- size_t depth,
- float id,
- float matte_weight)
+
+ ccl_global float *buffer = kernel_pass_pixel_render_buffer(INTEGRATOR_STATE_PASS, render_buffer);
+
+ /* Count sample for the shadow catcher object. */
+ kernel_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_sample_count, 1.0f);
+
+ /* Since the split is done, the sample does not contribute to the matte, so accumulate it as
+ * transparency to the matte. */
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ kernel_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_matte + 3,
+ average(throughput));
+}
+
+#endif /* __SHADOW_CATCHER__ */
+
+ccl_device_inline size_t kernel_write_id_pass(float *ccl_restrict buffer,
+ size_t depth,
+ float id,
+ float matte_weight)
{
-#endif /* __KERNEL_CPU__ */
- kernel_write_id_slots(buffer, depth, id, matte_weight);
- return depth * 2;
+ kernel_write_id_slots(buffer, depth * 2, id, matte_weight);
+ return depth * 4;
}
-ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg,
- ccl_global float *buffer,
- PathRadiance *L,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- float3 throughput)
+ccl_device_inline void kernel_write_data_passes(INTEGRATOR_STATE_ARGS,
+ const ShaderData *sd,
+ ccl_global float *ccl_restrict render_buffer)
{
#ifdef __PASSES__
- int path_flag = state->flag;
+ const int path_flag = INTEGRATOR_STATE(path, flag);
- if (!(path_flag & PATH_RAY_CAMERA))
+ if (!(path_flag & PATH_RAY_CAMERA)) {
return;
+ }
- int flag = kernel_data.film.pass_flag;
- int light_flag = kernel_data.film.light_pass_flag;
+ const int flag = kernel_data.film.pass_flag;
- if (!((flag | light_flag) & PASS_ANY))
+ if (!(flag & PASS_ANY)) {
return;
+ }
+
+ ccl_global float *buffer = kernel_pass_pixel_render_buffer(INTEGRATOR_STATE_PASS, render_buffer);
if (!(path_flag & PATH_RAY_SINGLE_PASS_DONE)) {
if (!(sd->flag & SD_TRANSPARENT) || kernel_data.film.pass_alpha_threshold == 0.0f ||
average(shader_bsdf_alpha(kg, sd)) >= kernel_data.film.pass_alpha_threshold) {
- if (state->sample == 0) {
+ if (INTEGRATOR_STATE(path, sample) == 0) {
if (flag & PASSMASK(DEPTH)) {
- float depth = camera_z_depth(kg, sd->P);
+ const float depth = camera_z_depth(kg, sd->P);
kernel_write_pass_float(buffer + kernel_data.film.pass_depth, depth);
}
if (flag & PASSMASK(OBJECT_ID)) {
- float id = object_pass_id(kg, sd->object);
+ const float id = object_pass_id(kg, sd->object);
kernel_write_pass_float(buffer + kernel_data.film.pass_object_id, id);
}
if (flag & PASSMASK(MATERIAL_ID)) {
- float id = shader_pass_id(kg, sd);
+ const float id = shader_pass_id(kg, sd);
kernel_write_pass_float(buffer + kernel_data.film.pass_material_id, id);
}
}
+ if (flag & PASSMASK(POSITION)) {
+ const float3 position = sd->P;
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_position, position);
+ }
if (flag & PASSMASK(NORMAL)) {
- float3 normal = shader_bsdf_average_normal(kg, sd);
+ const float3 normal = shader_bsdf_average_normal(kg, sd);
kernel_write_pass_float3(buffer + kernel_data.film.pass_normal, normal);
}
+ if (flag & PASSMASK(ROUGHNESS)) {
+ const float roughness = shader_bsdf_average_roughness(sd);
+ kernel_write_pass_float(buffer + kernel_data.film.pass_roughness, roughness);
+ }
if (flag & PASSMASK(UV)) {
- float3 uv = primitive_uv(kg, sd);
+ const float3 uv = primitive_uv(kg, sd);
kernel_write_pass_float3(buffer + kernel_data.film.pass_uv, uv);
}
if (flag & PASSMASK(MOTION)) {
- float4 speed = primitive_motion_vector(kg, sd);
+ const float4 speed = primitive_motion_vector(kg, sd);
kernel_write_pass_float4(buffer + kernel_data.film.pass_motion, speed);
kernel_write_pass_float(buffer + kernel_data.film.pass_motion_weight, 1.0f);
}
- state->flag |= PATH_RAY_SINGLE_PASS_DONE;
+ INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_SINGLE_PASS_DONE;
}
}
if (kernel_data.film.cryptomatte_passes) {
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
const float matte_weight = average(throughput) *
(1.0f - average(shader_bsdf_transparency(kg, sd)));
if (matte_weight > 0.0f) {
ccl_global float *cryptomatte_buffer = buffer + kernel_data.film.pass_cryptomatte;
if (kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) {
- float id = object_cryptomatte_id(kg, sd->object);
- cryptomatte_buffer += WRITE_ID_SLOT(
- cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight, object);
+ const float id = object_cryptomatte_id(kg, sd->object);
+ cryptomatte_buffer += kernel_write_id_pass(
+ cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight);
}
if (kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) {
- float id = shader_cryptomatte_id(kg, sd->shader);
- cryptomatte_buffer += WRITE_ID_SLOT(
- cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight, material);
+ const float id = shader_cryptomatte_id(kg, sd->shader);
+ cryptomatte_buffer += kernel_write_id_pass(
+ cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight);
}
if (kernel_data.film.cryptomatte_passes & CRYPT_ASSET) {
- float id = object_cryptomatte_asset_id(kg, sd->object);
- cryptomatte_buffer += WRITE_ID_SLOT(
- cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight, asset);
+ const float id = object_cryptomatte_asset_id(kg, sd->object);
+ cryptomatte_buffer += kernel_write_id_pass(
+ cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight);
}
}
}
- if (light_flag & PASSMASK_COMPONENT(DIFFUSE))
- L->color_diffuse += shader_bsdf_diffuse(kg, sd) * throughput;
- if (light_flag & PASSMASK_COMPONENT(GLOSSY))
- L->color_glossy += shader_bsdf_glossy(kg, sd) * throughput;
- if (light_flag & PASSMASK_COMPONENT(TRANSMISSION))
- L->color_transmission += shader_bsdf_transmission(kg, sd) * throughput;
-
- if (light_flag & PASSMASK(MIST)) {
- /* bring depth into 0..1 range */
- float mist_start = kernel_data.film.mist_start;
- float mist_inv_depth = kernel_data.film.mist_inv_depth;
+ if (flag & PASSMASK(DIFFUSE_COLOR)) {
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_color,
+ shader_bsdf_diffuse(kg, sd) * throughput);
+ }
+ if (flag & PASSMASK(GLOSSY_COLOR)) {
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_color,
+ shader_bsdf_glossy(kg, sd) * throughput);
+ }
+ if (flag & PASSMASK(TRANSMISSION_COLOR)) {
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_color,
+ shader_bsdf_transmission(kg, sd) * throughput);
+ }
+ if (flag & PASSMASK(MIST)) {
+ /* Bring depth into 0..1 range. */
+ const float mist_start = kernel_data.film.mist_start;
+ const float mist_inv_depth = kernel_data.film.mist_inv_depth;
- float depth = camera_distance(kg, sd->P);
+ const float depth = camera_distance(kg, sd->P);
float mist = saturate((depth - mist_start) * mist_inv_depth);
- /* falloff */
- float mist_falloff = kernel_data.film.mist_falloff;
+ /* Falloff */
+ const float mist_falloff = kernel_data.film.mist_falloff;
if (mist_falloff == 1.0f)
;
@@ -250,158 +309,17 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg,
else
mist = powf(mist, mist_falloff);
- /* modulate by transparency */
- float3 alpha = shader_bsdf_alpha(kg, sd);
- L->mist += (1.0f - mist) * average(throughput * alpha);
- }
-#endif
-}
+ /* Modulate by transparency */
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ const float3 alpha = shader_bsdf_alpha(kg, sd);
+ const float mist_output = (1.0f - mist) * average(throughput * alpha);
-ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg,
- ccl_global float *buffer,
- PathRadiance *L)
-{
-#ifdef __PASSES__
- int light_flag = kernel_data.film.light_pass_flag;
-
- if (!kernel_data.film.use_light_pass)
- return;
-
- if (light_flag & PASSMASK(DIFFUSE_INDIRECT))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_indirect, L->indirect_diffuse);
- if (light_flag & PASSMASK(GLOSSY_INDIRECT))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_indirect, L->indirect_glossy);
- if (light_flag & PASSMASK(TRANSMISSION_INDIRECT))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_indirect,
- L->indirect_transmission);
- if (light_flag & PASSMASK(VOLUME_INDIRECT))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_volume_indirect, L->indirect_volume);
- if (light_flag & PASSMASK(DIFFUSE_DIRECT))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_direct, L->direct_diffuse);
- if (light_flag & PASSMASK(GLOSSY_DIRECT))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_direct, L->direct_glossy);
- if (light_flag & PASSMASK(TRANSMISSION_DIRECT))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_direct,
- L->direct_transmission);
- if (light_flag & PASSMASK(VOLUME_DIRECT))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_volume_direct, L->direct_volume);
-
- if (light_flag & PASSMASK(EMISSION))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_emission, L->emission);
- if (light_flag & PASSMASK(BACKGROUND))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_background, L->background);
- if (light_flag & PASSMASK(AO))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_ao, L->ao);
-
- if (light_flag & PASSMASK(DIFFUSE_COLOR))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_color, L->color_diffuse);
- if (light_flag & PASSMASK(GLOSSY_COLOR))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_color, L->color_glossy);
- if (light_flag & PASSMASK(TRANSMISSION_COLOR))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_color,
- L->color_transmission);
- if (light_flag & PASSMASK(SHADOW)) {
- float3 shadow = L->shadow;
- kernel_write_pass_float4(
- buffer + kernel_data.film.pass_shadow,
- make_float4(shadow.x, shadow.y, shadow.z, kernel_data.film.pass_shadow_scale));
+ /* Note that the final value in the render buffer we want is 1 - mist_output;
+ * to avoid having to track this in the Integrator state, we do the negation
+ * after rendering. */
+ kernel_write_pass_float(buffer + kernel_data.film.pass_mist, mist_output);
}
- if (light_flag & PASSMASK(MIST))
- kernel_write_pass_float(buffer + kernel_data.film.pass_mist, 1.0f - L->mist);
#endif
}
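
For readers following the cryptomatte change above: both the old WRITE_ID_SLOT macro and the new kernel_write_id_pass accumulate an (id, weight) pair into a small per-pixel array of slots, one layer each for object, material and asset ids. The standalone C++ sketch below only illustrates that idea; the helper name write_id_slot, the slot-claiming rule and the sample values are assumptions for illustration, not the actual Cycles implementation (which additionally has to deal with atomic buffer writes and post-processing of the slots).

#include <cstdio>

// Simplified stand-in for an ID-pass slot writer. Each slot is a pair of
// floats (id, accumulated weight). A sample either adds its weight to the
// slot already holding the same id, or claims the first unused slot
// (weight == 0). Returns how many floats the layer occupies, so the caller
// can advance to the next cryptomatte layer, mirroring how the kernel
// advances cryptomatte_buffer.
static int write_id_slot(float *slots, int depth, float id, float weight)
{
  for (int i = 0; i < depth; i++) {
    float *pair = slots + 2 * i;
    if (pair[1] == 0.0f || pair[0] == id) {
      pair[0] = id;
      pair[1] += weight;
      break;
    }
  }
  return 2 * depth;
}

int main()
{
  const int depth = 2;
  float layer[2 * depth] = {0.0f};

  // Two samples of object id 7.0 and one of id 3.0 landing in the same pixel.
  write_id_slot(layer, depth, 7.0f, 0.5f);
  write_id_slot(layer, depth, 3.0f, 0.25f);
  write_id_slot(layer, depth, 7.0f, 0.25f);

  for (int i = 0; i < depth; i++) {
    std::printf("slot %d: id=%.1f weight=%.2f\n", i, layer[2 * i], layer[2 * i + 1]);
  }
  return 0;
}
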
-ccl_device_inline void kernel_write_result(KernelGlobals *kg,
- ccl_global float *buffer,
- int sample,
- PathRadiance *L)
-{
- PROFILING_INIT(kg, PROFILING_WRITE_RESULT);
- PROFILING_OBJECT(PRIM_NONE);
-
- float alpha;
- float3 L_sum = path_radiance_clamp_and_sum(kg, L, &alpha);
-
- if (kernel_data.film.pass_flag & PASSMASK(COMBINED)) {
- kernel_write_pass_float4(buffer, make_float4(L_sum.x, L_sum.y, L_sum.z, alpha));
- }
-
- kernel_write_light_passes(kg, buffer, L);
-
-#ifdef __DENOISING_FEATURES__
- if (kernel_data.film.pass_denoising_data) {
-# ifdef __SHADOW_TRICKS__
- kernel_write_denoising_shadow(kg,
- buffer + kernel_data.film.pass_denoising_data,
- sample,
- average(L->path_total),
- average(L->path_total_shaded));
-# else
- kernel_write_denoising_shadow(
- kg, buffer + kernel_data.film.pass_denoising_data, sample, 0.0f, 0.0f);
-# endif
- if (kernel_data.film.pass_denoising_clean) {
- float3 noisy, clean;
- path_radiance_split_denoising(kg, L, &noisy, &clean);
- kernel_write_pass_float3_variance(
- buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, noisy);
- kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean, clean);
- }
- else {
- kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data +
- DENOISING_PASS_COLOR,
- ensure_finite3(L_sum));
- }
-
- kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data +
- DENOISING_PASS_NORMAL,
- L->denoising_normal);
- kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data +
- DENOISING_PASS_ALBEDO,
- L->denoising_albedo);
- kernel_write_pass_float_variance(
- buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH, L->denoising_depth);
- }
-#endif /* __DENOISING_FEATURES__ */
-
- /* Adaptive Sampling. Fill the additional buffer with the odd samples and calculate our stopping
- criteria. This is the heuristic from "A hierarchical automatic stopping condition for Monte
- Carlo global illumination" except that here it is applied per pixel and not in hierarchical
- tiles. */
- if (kernel_data.film.pass_adaptive_aux_buffer &&
- kernel_data.integrator.adaptive_threshold > 0.0f) {
- if (sample_is_even(kernel_data.integrator.sampling_pattern, sample)) {
- kernel_write_pass_float4(buffer + kernel_data.film.pass_adaptive_aux_buffer,
- make_float4(L_sum.x * 2.0f, L_sum.y * 2.0f, L_sum.z * 2.0f, 0.0f));
- }
-#ifdef __KERNEL_CPU__
- if ((sample > kernel_data.integrator.adaptive_min_samples) &&
- kernel_data.integrator.adaptive_stop_per_sample) {
- const int step = kernel_data.integrator.adaptive_step;
-
- if ((sample & (step - 1)) == (step - 1)) {
- kernel_do_adaptive_stopping(kg, buffer, sample);
- }
- }
-#endif
- }
-
- /* Write the sample count as negative numbers initially to mark the samples as in progress.
- * Once the tile has finished rendering, the sign gets flipped and all the pixel values
- * are scaled as if they were taken at a uniform sample count. */
- if (kernel_data.film.pass_sample_count) {
- /* Make sure it's a negative number. In progressive refine mode, this bit gets flipped between
- * passes. */
-#ifdef __ATOMIC_PASS_WRITE__
- atomic_fetch_and_or_uint32((ccl_global uint *)(buffer + kernel_data.film.pass_sample_count),
- 0x80000000);
-#else
- if (buffer[kernel_data.film.pass_sample_count] > 0) {
- buffer[kernel_data.film.pass_sample_count] *= -1.0f;
- }
-#endif
- kernel_write_pass_float(buffer + kernel_data.film.pass_sample_count, -1.0f);
- }
-}
-
CCL_NAMESPACE_END
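
As a side note on the mist pass logic in the hunk above: depth is remapped into 0..1 with a start distance and inverse depth range, an optional falloff exponent is applied, and the result is modulated by path throughput and surface transparency. A minimal standalone sketch of that arithmetic follows (plain C++, scalar throughput/alpha standing in for the float3 averages, made-up parameter values). As the comment in the diff notes, the stored value is mist_output and the final 1 - mist_output negation is deferred until after rendering.

#include <algorithm>
#include <cmath>
#include <cstdio>

// Clamp to [0, 1], mirroring the kernel's saturate().
static float saturate(float x)
{
  return std::min(std::max(x, 0.0f), 1.0f);
}

// Mist value written to the render buffer for one sample.
static float mist_pass_value(
    float depth, float mist_start, float mist_inv_depth, float mist_falloff,
    float throughput, float alpha)
{
  // Bring depth into the 0..1 range.
  float mist = saturate((depth - mist_start) * mist_inv_depth);

  // Apply the falloff curve: 1.0 is linear, 2.0 quadratic, 0.5 square root.
  if (mist_falloff == 2.0f) {
    mist = mist * mist;
  }
  else if (mist_falloff == 0.5f) {
    mist = std::sqrt(mist);
  }
  else if (mist_falloff != 1.0f) {
    mist = std::pow(mist, mist_falloff);
  }

  // Modulate by transparency; the pass stores this value, negated later.
  return (1.0f - mist) * throughput * alpha;
}

int main()
{
  // Hypothetical camera distance of 12 units, mist ranging from 5 to 25 units.
  const float value = mist_pass_value(12.0f, 5.0f, 1.0f / 20.0f, 2.0f, 1.0f, 1.0f);
  std::printf("mist sample value: %.4f\n", value);  // 1 - 0.35^2 = 0.8775
  return 0;
}
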
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
deleted file mode 100644
index 92a097de9e1..00000000000
--- a/intern/cycles/kernel/kernel_path.h
+++ /dev/null
@@ -1,709 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef __OSL__
-# include "kernel/osl/osl_shader.h"
-#endif
-
-// clang-format off
-#include "kernel/kernel_random.h"
-#include "kernel/kernel_projection.h"
-#include "kernel/kernel_montecarlo.h"
-#include "kernel/kernel_differential.h"
-#include "kernel/kernel_camera.h"
-
-#include "kernel/geom/geom.h"
-#include "kernel/bvh/bvh.h"
-
-#include "kernel/kernel_write_passes.h"
-#include "kernel/kernel_accumulate.h"
-#include "kernel/kernel_shader.h"
-#include "kernel/kernel_light.h"
-#include "kernel/kernel_adaptive_sampling.h"
-#include "kernel/kernel_passes.h"
-
-#if defined(__VOLUME__) || defined(__SUBSURFACE__)
-# include "kernel/kernel_volume.h"
-#endif
-
-#ifdef __SUBSURFACE__
-# include "kernel/kernel_subsurface.h"
-#endif
-
-#include "kernel/kernel_path_state.h"
-#include "kernel/kernel_shadow.h"
-#include "kernel/kernel_emission.h"
-#include "kernel/kernel_path_common.h"
-#include "kernel/kernel_path_surface.h"
-#include "kernel/kernel_path_volume.h"
-#include "kernel/kernel_path_subsurface.h"
-// clang-format on
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device_forceinline bool kernel_path_scene_intersect(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- Ray *ray,
- Intersection *isect,
- PathRadiance *L,
- const int last_object)
-{
- PROFILING_INIT(kg, PROFILING_SCENE_INTERSECT);
-
- uint visibility = path_state_ray_visibility(kg, state);
-
- if (path_state_ao_bounce(kg, state)) {
- ray->t = kernel_data.background.ao_distance;
- if (last_object != OBJECT_NONE) {
- const float object_ao_distance = kernel_tex_fetch(__objects, last_object).ao_distance;
- if (object_ao_distance != 0.0f) {
- ray->t = object_ao_distance;
- }
- }
- }
-
- bool hit = scene_intersect(kg, ray, visibility, isect);
-
- return hit;
-}
-
-ccl_device_forceinline void kernel_path_lamp_emission(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- Ray *ray,
- float3 throughput,
- ccl_addr_space Intersection *isect,
- ShaderData *emission_sd,
- PathRadiance *L)
-{
- PROFILING_INIT(kg, PROFILING_INDIRECT_EMISSION);
-
-#ifdef __LAMP_MIS__
- if (kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) {
- /* ray starting from previous non-transparent bounce */
- Ray light_ray ccl_optional_struct_init;
-
- light_ray.P = ray->P - state->ray_t * ray->D;
- state->ray_t += isect->t;
- light_ray.D = ray->D;
- light_ray.t = state->ray_t;
- light_ray.time = ray->time;
- light_ray.dD = ray->dD;
- light_ray.dP = ray->dP;
-
- /* intersect with lamp */
- indirect_lamp_emission(kg, emission_sd, state, L, &light_ray, throughput);
- }
-#endif /* __LAMP_MIS__ */
-}
-
-ccl_device_forceinline void kernel_path_background(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- ccl_addr_space Ray *ray,
- float3 throughput,
- ShaderData *sd,
- ccl_global float *buffer,
- PathRadiance *L)
-{
- /* eval background shader if nothing hit */
- if (kernel_data.background.transparent && (state->flag & PATH_RAY_TRANSPARENT_BACKGROUND)) {
- L->transparent += average(throughput);
-
-#ifdef __PASSES__
- if (!(kernel_data.film.light_pass_flag & PASSMASK(BACKGROUND)))
-#endif /* __PASSES__ */
- return;
- }
-
- /* When using the ao bounces approximation, adjust background
- * shader intensity with ao factor. */
- if (path_state_ao_bounce(kg, state)) {
- throughput *= kernel_data.background.ao_bounces_factor;
- }
-
-#ifdef __BACKGROUND__
- /* sample background shader */
- float3 L_background = indirect_background(kg, sd, state, buffer, ray);
- path_radiance_accum_background(kg, L, state, throughput, L_background);
-#endif /* __BACKGROUND__ */
-}
-
-#ifndef __SPLIT_KERNEL__
-
-# ifdef __VOLUME__
-ccl_device_forceinline VolumeIntegrateResult kernel_path_volume(KernelGlobals *kg,
- ShaderData *sd,
- PathState *state,
- Ray *ray,
- float3 *throughput,
- ccl_addr_space Intersection *isect,
- bool hit,
- ShaderData *emission_sd,
- PathRadiance *L)
-{
- PROFILING_INIT(kg, PROFILING_VOLUME);
-
- /* Sanitize volume stack. */
- if (!hit) {
- kernel_volume_clean_stack(kg, state->volume_stack);
- }
-
- if (state->volume_stack[0].shader == SHADER_NONE) {
- return VOLUME_PATH_ATTENUATED;
- }
-
- /* volume attenuation, emission, scatter */
- Ray volume_ray = *ray;
- volume_ray.t = (hit) ? isect->t : FLT_MAX;
-
- float step_size = volume_stack_step_size(kg, state->volume_stack);
-
-# ifdef __VOLUME_DECOUPLED__
- int sampling_method = volume_stack_sampling_method(kg, state->volume_stack);
- bool direct = (state->flag & PATH_RAY_CAMERA) != 0;
- bool decoupled = kernel_volume_use_decoupled(kg, step_size, direct, sampling_method);
-
- if (decoupled) {
- /* cache steps along volume for repeated sampling */
- VolumeSegment volume_segment;
-
- shader_setup_from_volume(kg, sd, &volume_ray);
- kernel_volume_decoupled_record(kg, state, &volume_ray, sd, &volume_segment, step_size);
-
- volume_segment.sampling_method = sampling_method;
-
- /* emission */
- if (volume_segment.closure_flag & SD_EMISSION)
- path_radiance_accum_emission(kg, L, state, *throughput, volume_segment.accum_emission);
-
- /* scattering */
- VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
-
- if (volume_segment.closure_flag & SD_SCATTER) {
- int all = kernel_data.integrator.sample_all_lights_indirect;
-
- /* direct light sampling */
- kernel_branched_path_volume_connect_light(
- kg, sd, emission_sd, *throughput, state, L, all, &volume_ray, &volume_segment);
-
- /* indirect sample. if we use distance sampling and take just
- * one sample for direct and indirect light, we could share
- * this computation, but makes code a bit complex */
- float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
- float rscatter = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
-
- result = kernel_volume_decoupled_scatter(
- kg, state, &volume_ray, sd, throughput, rphase, rscatter, &volume_segment, NULL, true);
- }
-
- /* free cached steps */
- kernel_volume_decoupled_free(kg, &volume_segment);
-
- if (result == VOLUME_PATH_SCATTERED) {
- if (kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray))
- return VOLUME_PATH_SCATTERED;
- else
- return VOLUME_PATH_MISSED;
- }
- else {
- *throughput *= volume_segment.accum_transmittance;
- }
- }
- else
-# endif /* __VOLUME_DECOUPLED__ */
- {
- /* integrate along volume segment with distance sampling */
- VolumeIntegrateResult result = kernel_volume_integrate(
- kg, state, sd, &volume_ray, L, throughput, step_size);
-
-# ifdef __VOLUME_SCATTER__
- if (result == VOLUME_PATH_SCATTERED) {
- /* direct lighting */
- kernel_path_volume_connect_light(kg, sd, emission_sd, *throughput, state, L);
-
- /* indirect light bounce */
- if (kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray))
- return VOLUME_PATH_SCATTERED;
- else
- return VOLUME_PATH_MISSED;
- }
-# endif /* __VOLUME_SCATTER__ */
- }
-
- return VOLUME_PATH_ATTENUATED;
-}
-# endif /* __VOLUME__ */
-
-#endif /* __SPLIT_KERNEL__ */
-
-ccl_device_forceinline bool kernel_path_shader_apply(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- ccl_addr_space Ray *ray,
- float3 throughput,
- ShaderData *emission_sd,
- PathRadiance *L,
- ccl_global float *buffer)
-{
- PROFILING_INIT(kg, PROFILING_SHADER_APPLY);
-
-#ifdef __SHADOW_TRICKS__
- if (sd->object_flag & SD_OBJECT_SHADOW_CATCHER) {
- if (state->flag & PATH_RAY_TRANSPARENT_BACKGROUND) {
- state->flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_STORE_SHADOW_INFO);
-
- float3 bg = zero_float3();
- if (!kernel_data.background.transparent) {
- bg = indirect_background(kg, emission_sd, state, NULL, ray);
- }
- path_radiance_accum_shadowcatcher(L, throughput, bg);
- }
- }
- else if (state->flag & PATH_RAY_SHADOW_CATCHER) {
- /* Only update transparency after shadow catcher bounce. */
- L->shadow_transparency *= average(shader_bsdf_transparency(kg, sd));
- }
-#endif /* __SHADOW_TRICKS__ */
-
- /* holdout */
-#ifdef __HOLDOUT__
- if (((sd->flag & SD_HOLDOUT) || (sd->object_flag & SD_OBJECT_HOLDOUT_MASK)) &&
- (state->flag & PATH_RAY_TRANSPARENT_BACKGROUND)) {
- const float3 holdout_weight = shader_holdout_apply(kg, sd);
- if (kernel_data.background.transparent) {
- L->transparent += average(holdout_weight * throughput);
- }
- if (isequal_float3(holdout_weight, one_float3())) {
- return false;
- }
- }
-#endif /* __HOLDOUT__ */
-
- /* holdout mask objects do not write data passes */
- kernel_write_data_passes(kg, buffer, L, sd, state, throughput);
-
- /* blurring of bsdf after bounces, for rays that have a small likelihood
- * of following this particular path (diffuse, rough glossy) */
- if (kernel_data.integrator.filter_glossy != FLT_MAX) {
- float blur_pdf = kernel_data.integrator.filter_glossy * state->min_ray_pdf;
-
- if (blur_pdf < 1.0f) {
- float blur_roughness = sqrtf(1.0f - blur_pdf) * 0.5f;
- shader_bsdf_blur(kg, sd, blur_roughness);
- }
- }
-
-#ifdef __EMISSION__
- /* emission */
- if (sd->flag & SD_EMISSION) {
- float3 emission = indirect_primitive_emission(
- kg, sd, sd->ray_length, state->flag, state->ray_pdf);
- path_radiance_accum_emission(kg, L, state, throughput, emission);
- }
-#endif /* __EMISSION__ */
-
- return true;
-}
-
-#ifdef __KERNEL_OPTIX__
-ccl_device_inline /* inline trace calls */
-#else
-ccl_device_noinline
-#endif
- void
- kernel_path_ao(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *emission_sd,
- PathRadiance *L,
- ccl_addr_space PathState *state,
- float3 throughput,
- float3 ao_alpha)
-{
- PROFILING_INIT(kg, PROFILING_AO);
-
- /* todo: solve correlation */
- float bsdf_u, bsdf_v;
-
- path_state_rng_2D(kg, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
-
- float ao_factor = kernel_data.background.ao_factor;
- float3 ao_N;
- float3 ao_bsdf = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
- float3 ao_D;
- float ao_pdf;
-
- sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
-
- if (dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
- Ray light_ray;
- float3 ao_shadow;
-
- light_ray.P = ray_offset(sd->P, sd->Ng);
- light_ray.D = ao_D;
- light_ray.t = kernel_data.background.ao_distance;
- light_ray.time = sd->time;
- light_ray.dP = sd->dP;
- light_ray.dD = differential3_zero();
-
- if (!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &ao_shadow)) {
- path_radiance_accum_ao(kg, L, state, throughput, ao_alpha, ao_bsdf, ao_shadow);
- }
- else {
- path_radiance_accum_total_ao(L, state, throughput, ao_bsdf);
- }
- }
-}
-
-#ifndef __SPLIT_KERNEL__
-
-# if defined(__BRANCHED_PATH__) || defined(__BAKING__)
-
-ccl_device void kernel_path_indirect(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *emission_sd,
- Ray *ray,
- float3 throughput,
- PathState *state,
- PathRadiance *L,
- const int last_object)
-{
-# ifdef __SUBSURFACE__
- SubsurfaceIndirectRays ss_indirect;
- kernel_path_subsurface_init_indirect(&ss_indirect);
-
- for (;;) {
-# endif /* __SUBSURFACE__ */
-
- /* path iteration */
- for (;;) {
- /* Find intersection with objects in scene. */
- Intersection isect;
- bool hit = kernel_path_scene_intersect(kg, state, ray, &isect, L, last_object);
-
- /* Find intersection with lamps and compute emission for MIS. */
- kernel_path_lamp_emission(kg, state, ray, throughput, &isect, sd, L);
-
-# ifdef __VOLUME__
- /* Volume integration. */
- VolumeIntegrateResult result = kernel_path_volume(
- kg, sd, state, ray, &throughput, &isect, hit, emission_sd, L);
-
- if (result == VOLUME_PATH_SCATTERED) {
- continue;
- }
- else if (result == VOLUME_PATH_MISSED) {
- break;
- }
-# endif /* __VOLUME__*/
-
- /* Shade background. */
- if (!hit) {
- kernel_path_background(kg, state, ray, throughput, sd, NULL, L);
- break;
- }
- else if (path_state_ao_bounce(kg, state)) {
- if (intersection_get_shader_flags(kg, &isect) &
- (SD_HAS_TRANSPARENT_SHADOW | SD_HAS_EMISSION)) {
- state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
- }
- else {
- break;
- }
- }
-
- /* Setup shader data. */
- shader_setup_from_ray(kg, sd, &isect, ray);
-
- /* Skip most work for volume bounding surface. */
-# ifdef __VOLUME__
- if (!(sd->flag & SD_HAS_ONLY_VOLUME)) {
-# endif
-
- /* Evaluate shader. */
- shader_eval_surface(kg, sd, state, NULL, state->flag);
- shader_prepare_closures(sd, state);
-
- /* Apply shadow catcher, holdout, emission. */
- if (!kernel_path_shader_apply(kg, sd, state, ray, throughput, emission_sd, L, NULL)) {
- break;
- }
-
- /* path termination. this is a strange place to put the termination, it's
- * mainly due to the mixed in MIS that we use. gives too many unneeded
- * shader evaluations, only need emission if we are going to terminate */
- float probability = path_state_continuation_probability(kg, state, throughput);
-
- if (probability == 0.0f) {
- break;
- }
- else if (probability != 1.0f) {
- float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE);
-
- if (terminate >= probability)
- break;
-
- throughput /= probability;
- }
-
-# ifdef __DENOISING_FEATURES__
- kernel_update_denoising_features(kg, sd, state, L);
-# endif
-
-# ifdef __AO__
- /* ambient occlusion */
- if (kernel_data.integrator.use_ambient_occlusion) {
- kernel_path_ao(kg, sd, emission_sd, L, state, throughput, zero_float3());
- }
-# endif /* __AO__ */
-
-# ifdef __SUBSURFACE__
- /* bssrdf scatter to a different location on the same object, replacing
- * the closures with a diffuse BSDF */
- if (sd->flag & SD_BSSRDF) {
- if (kernel_path_subsurface_scatter(
- kg, sd, emission_sd, L, state, ray, &throughput, &ss_indirect)) {
- break;
- }
- }
-# endif /* __SUBSURFACE__ */
-
-# if defined(__EMISSION__)
- int all = (kernel_data.integrator.sample_all_lights_indirect) ||
- (state->flag & PATH_RAY_SHADOW_CATCHER);
- kernel_branched_path_surface_connect_light(
- kg, sd, emission_sd, state, throughput, 1.0f, L, all);
-# endif /* defined(__EMISSION__) */
-
-# ifdef __VOLUME__
- }
-# endif
-
- if (!kernel_path_surface_bounce(kg, sd, &throughput, state, &L->state, ray))
- break;
- }
-
-# ifdef __SUBSURFACE__
- /* Trace indirect subsurface rays by restarting the loop. this uses less
- * stack memory than invoking kernel_path_indirect.
- */
- if (ss_indirect.num_rays) {
- kernel_path_subsurface_setup_indirect(kg, &ss_indirect, state, ray, L, &throughput);
- }
- else {
- break;
- }
- }
-# endif /* __SUBSURFACE__ */
-}
-
-# endif /* defined(__BRANCHED_PATH__) || defined(__BAKING__) */
-
-ccl_device_forceinline void kernel_path_integrate(KernelGlobals *kg,
- PathState *state,
- float3 throughput,
- Ray *ray,
- PathRadiance *L,
- ccl_global float *buffer,
- ShaderData *emission_sd)
-{
- PROFILING_INIT(kg, PROFILING_PATH_INTEGRATE);
-
- /* Shader data memory used for both volumes and surfaces, saves stack space. */
- ShaderData sd;
-
-# ifdef __SUBSURFACE__
- SubsurfaceIndirectRays ss_indirect;
- kernel_path_subsurface_init_indirect(&ss_indirect);
-
- for (;;) {
-# endif /* __SUBSURFACE__ */
-
- /* path iteration */
- for (;;) {
- /* Find intersection with objects in scene. */
- Intersection isect;
- bool hit = kernel_path_scene_intersect(kg, state, ray, &isect, L, sd.object);
-
- /* Find intersection with lamps and compute emission for MIS. */
- kernel_path_lamp_emission(kg, state, ray, throughput, &isect, &sd, L);
-
-# ifdef __VOLUME__
- /* Volume integration. */
- VolumeIntegrateResult result = kernel_path_volume(
- kg, &sd, state, ray, &throughput, &isect, hit, emission_sd, L);
-
- if (result == VOLUME_PATH_SCATTERED) {
- continue;
- }
- else if (result == VOLUME_PATH_MISSED) {
- break;
- }
-# endif /* __VOLUME__*/
-
- /* Shade background. */
- if (!hit) {
- kernel_path_background(kg, state, ray, throughput, &sd, buffer, L);
- break;
- }
- else if (path_state_ao_bounce(kg, state)) {
- if (intersection_get_shader_flags(kg, &isect) &
- (SD_HAS_TRANSPARENT_SHADOW | SD_HAS_EMISSION)) {
- state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
- }
- else {
- break;
- }
- }
-
- /* Setup shader data. */
- shader_setup_from_ray(kg, &sd, &isect, ray);
-
- /* Skip most work for volume bounding surface. */
-# ifdef __VOLUME__
- if (!(sd.flag & SD_HAS_ONLY_VOLUME)) {
-# endif
-
- /* Evaluate shader. */
- shader_eval_surface(kg, &sd, state, buffer, state->flag);
- shader_prepare_closures(&sd, state);
-
- /* Apply shadow catcher, holdout, emission. */
- if (!kernel_path_shader_apply(kg, &sd, state, ray, throughput, emission_sd, L, buffer)) {
- break;
- }
-
- /* path termination. this is a strange place to put the termination, it's
- * mainly due to the mixed in MIS that we use. gives too many unneeded
- * shader evaluations, only need emission if we are going to terminate */
- float probability = path_state_continuation_probability(kg, state, throughput);
-
- if (probability == 0.0f) {
- break;
- }
- else if (probability != 1.0f) {
- float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE);
- if (terminate >= probability)
- break;
-
- throughput /= probability;
- }
-
-# ifdef __DENOISING_FEATURES__
- kernel_update_denoising_features(kg, &sd, state, L);
-# endif
-
-# ifdef __AO__
- /* ambient occlusion */
- if (kernel_data.integrator.use_ambient_occlusion) {
- kernel_path_ao(kg, &sd, emission_sd, L, state, throughput, shader_bsdf_alpha(kg, &sd));
- }
-# endif /* __AO__ */
-
-# ifdef __SUBSURFACE__
- /* bssrdf scatter to a different location on the same object, replacing
- * the closures with a diffuse BSDF */
- if (sd.flag & SD_BSSRDF) {
- if (kernel_path_subsurface_scatter(
- kg, &sd, emission_sd, L, state, ray, &throughput, &ss_indirect)) {
- break;
- }
- }
-# endif /* __SUBSURFACE__ */
-
-# ifdef __EMISSION__
- /* direct lighting */
- kernel_path_surface_connect_light(kg, &sd, emission_sd, throughput, state, L);
-# endif /* __EMISSION__ */
-
-# ifdef __VOLUME__
- }
-# endif
-
- /* compute direct lighting and next bounce */
- if (!kernel_path_surface_bounce(kg, &sd, &throughput, state, &L->state, ray))
- break;
- }
-
-# ifdef __SUBSURFACE__
- /* Trace indirect subsurface rays by restarting the loop. this uses less
- * stack memory than invoking kernel_path_indirect.
- */
- if (ss_indirect.num_rays) {
- kernel_path_subsurface_setup_indirect(kg, &ss_indirect, state, ray, L, &throughput);
- }
- else {
- break;
- }
- }
-# endif /* __SUBSURFACE__ */
-}
-
-ccl_device void kernel_path_trace(
- KernelGlobals *kg, ccl_global float *buffer, int sample, int x, int y, int offset, int stride)
-{
- PROFILING_INIT(kg, PROFILING_RAY_SETUP);
-
- /* buffer offset */
- int index = offset + x + y * stride;
- int pass_stride = kernel_data.film.pass_stride;
-
- buffer += index * pass_stride;
-
- if (kernel_data.film.pass_adaptive_aux_buffer) {
- ccl_global float4 *aux = (ccl_global float4 *)(buffer +
- kernel_data.film.pass_adaptive_aux_buffer);
- if ((*aux).w > 0.0f) {
- return;
- }
- }
-
- /* Initialize random numbers and sample ray. */
- uint rng_hash;
- Ray ray;
-
- kernel_path_trace_setup(kg, sample, x, y, &rng_hash, &ray);
-
- if (ray.t == 0.0f) {
- return;
- }
-
- /* Initialize state. */
- float3 throughput = one_float3();
-
- PathRadiance L;
- path_radiance_init(kg, &L);
-
- ShaderDataTinyStorage emission_sd_storage;
- ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
-
- PathState state;
- path_state_init(kg, emission_sd, &state, rng_hash, sample, &ray);
-
-# ifdef __KERNEL_OPTIX__
- /* Force struct into local memory to avoid costly spilling on trace calls. */
- if (pass_stride < 0) /* This is never executed and just prevents the compiler from doing SROA. */
- for (int i = 0; i < sizeof(L); ++i)
- reinterpret_cast<unsigned char *>(&L)[-pass_stride + i] = 0;
-# endif
-
- /* Integrate. */
- kernel_path_integrate(kg, &state, throughput, &ray, &L, buffer, emission_sd);
-
- kernel_write_result(kg, buffer, sample, &L);
-}
-
-#endif /* __SPLIT_KERNEL__ */
-
-CCL_NAMESPACE_END
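
A recurring step in both the deleted megakernel above and its replacement is probabilistic path termination (Russian roulette): the continuation probability is min(sqrt(max |throughput| component), 1), a uniform random number decides whether the path dies, and surviving paths divide their throughput by that probability so the estimator stays unbiased. A small self-contained sketch of just that step, with std::array standing in for float3 and a caller-supplied random number, might look like this:

#include <algorithm>
#include <array>
#include <cmath>
#include <cstdio>

// Continuation probability: sqrt of the largest absolute throughput
// component, clamped to 1, so termination happens a bit later on average
// (roughly matching a typical view transform).
static float continuation_probability(const std::array<float, 3> &throughput)
{
  const float m = std::max({std::fabs(throughput[0]),
                            std::fabs(throughput[1]),
                            std::fabs(throughput[2])});
  return std::min(std::sqrt(m), 1.0f);
}

// Apply Russian roulette. Returns false if the path terminates; otherwise
// rescales throughput so the estimate stays unbiased.
static bool russian_roulette(std::array<float, 3> &throughput, float rand01)
{
  const float probability = continuation_probability(throughput);
  if (probability == 0.0f || rand01 >= probability) {
    return false;
  }
  for (float &c : throughput) {
    c /= probability;
  }
  return true;
}

int main()
{
  std::array<float, 3> throughput = {0.16f, 0.04f, 0.01f};
  const bool alive = russian_roulette(throughput, 0.3f);  // p = sqrt(0.16) = 0.4
  std::printf("alive=%d throughput=(%.2f, %.2f, %.2f)\n",
              alive, throughput[0], throughput[1], throughput[2]);
  return 0;
}
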
diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h
deleted file mode 100644
index a1ee1bc107e..00000000000
--- a/intern/cycles/kernel/kernel_path_branched.h
+++ /dev/null
@@ -1,556 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#ifdef __BRANCHED_PATH__
-
-ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *emission_sd,
- PathRadiance *L,
- ccl_addr_space PathState *state,
- float3 throughput)
-{
- int num_samples = kernel_data.integrator.ao_samples;
- float num_samples_inv = 1.0f / num_samples;
- float ao_factor = kernel_data.background.ao_factor;
- float3 ao_N;
- float3 ao_bsdf = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
- float3 ao_alpha = shader_bsdf_alpha(kg, sd);
-
- for (int j = 0; j < num_samples; j++) {
- float bsdf_u, bsdf_v;
- path_branched_rng_2D(
- kg, state->rng_hash, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
-
- float3 ao_D;
- float ao_pdf;
-
- sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
-
- if (dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
- Ray light_ray;
- float3 ao_shadow;
-
- light_ray.P = ray_offset(sd->P, sd->Ng);
- light_ray.D = ao_D;
- light_ray.t = kernel_data.background.ao_distance;
- light_ray.time = sd->time;
- light_ray.dP = sd->dP;
- light_ray.dD = differential3_zero();
-
- if (!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &ao_shadow)) {
- path_radiance_accum_ao(
- kg, L, state, throughput * num_samples_inv, ao_alpha, ao_bsdf, ao_shadow);
- }
- else {
- path_radiance_accum_total_ao(L, state, throughput * num_samples_inv, ao_bsdf);
- }
- }
- }
-}
-
-# ifndef __SPLIT_KERNEL__
-
-# ifdef __VOLUME__
-ccl_device_forceinline void kernel_branched_path_volume(KernelGlobals *kg,
- ShaderData *sd,
- PathState *state,
- Ray *ray,
- float3 *throughput,
- ccl_addr_space Intersection *isect,
- bool hit,
- ShaderData *indirect_sd,
- ShaderData *emission_sd,
- PathRadiance *L)
-{
- /* Sanitize volume stack. */
- if (!hit) {
- kernel_volume_clean_stack(kg, state->volume_stack);
- }
-
- if (state->volume_stack[0].shader == SHADER_NONE) {
- return;
- }
-
- /* volume attenuation, emission, scatter */
- Ray volume_ray = *ray;
- volume_ray.t = (hit) ? isect->t : FLT_MAX;
-
- float step_size = volume_stack_step_size(kg, state->volume_stack);
- const int object = sd->object;
-
-# ifdef __VOLUME_DECOUPLED__
- /* decoupled ray marching only supported on CPU */
- if (kernel_data.integrator.volume_decoupled) {
- /* cache steps along volume for repeated sampling */
- VolumeSegment volume_segment;
-
- shader_setup_from_volume(kg, sd, &volume_ray);
- kernel_volume_decoupled_record(kg, state, &volume_ray, sd, &volume_segment, step_size);
-
- /* direct light sampling */
- if (volume_segment.closure_flag & SD_SCATTER) {
- volume_segment.sampling_method = volume_stack_sampling_method(kg, state->volume_stack);
-
- int all = kernel_data.integrator.sample_all_lights_direct;
-
- kernel_branched_path_volume_connect_light(
- kg, sd, emission_sd, *throughput, state, L, all, &volume_ray, &volume_segment);
-
- /* indirect light sampling */
- int num_samples = kernel_data.integrator.volume_samples;
- float num_samples_inv = 1.0f / num_samples;
-
- for (int j = 0; j < num_samples; j++) {
- PathState ps = *state;
- Ray pray = *ray;
- float3 tp = *throughput;
-
- /* branch RNG state */
- path_state_branch(&ps, j, num_samples);
-
- /* scatter sample. if we use distance sampling and take just one
- * sample for direct and indirect light, we could share this
- * computation, but makes code a bit complex */
- float rphase = path_state_rng_1D(kg, &ps, PRNG_PHASE_CHANNEL);
- float rscatter = path_state_rng_1D(kg, &ps, PRNG_SCATTER_DISTANCE);
-
- VolumeIntegrateResult result = kernel_volume_decoupled_scatter(
- kg, &ps, &pray, sd, &tp, rphase, rscatter, &volume_segment, NULL, false);
-
- if (result == VOLUME_PATH_SCATTERED &&
- kernel_path_volume_bounce(kg, sd, &tp, &ps, &L->state, &pray)) {
- kernel_path_indirect(
- kg, indirect_sd, emission_sd, &pray, tp * num_samples_inv, &ps, L, object);
-
- /* for render passes, sum and reset indirect light pass variables
- * for the next samples */
- path_radiance_sum_indirect(L);
- path_radiance_reset_indirect(L);
- }
- }
- }
-
- /* emission and transmittance */
- if (volume_segment.closure_flag & SD_EMISSION)
- path_radiance_accum_emission(kg, L, state, *throughput, volume_segment.accum_emission);
- *throughput *= volume_segment.accum_transmittance;
-
- /* free cached steps */
- kernel_volume_decoupled_free(kg, &volume_segment);
- }
- else
-# endif /* __VOLUME_DECOUPLED__ */
- {
- /* GPU: no decoupled ray marching, scatter probabilistically. */
- int num_samples = kernel_data.integrator.volume_samples;
- float num_samples_inv = 1.0f / num_samples;
-
- /* todo: we should cache the shader evaluations from stepping
- * through the volume, for now we redo them multiple times */
-
- for (int j = 0; j < num_samples; j++) {
- PathState ps = *state;
- Ray pray = *ray;
- float3 tp = (*throughput) * num_samples_inv;
-
- /* branch RNG state */
- path_state_branch(&ps, j, num_samples);
-
- VolumeIntegrateResult result = kernel_volume_integrate(
- kg, &ps, sd, &volume_ray, L, &tp, step_size);
-
-# ifdef __VOLUME_SCATTER__
- if (result == VOLUME_PATH_SCATTERED) {
- /* todo: support equiangular, MIS and all light sampling.
- * alternatively get decoupled ray marching working on the GPU */
- kernel_path_volume_connect_light(kg, sd, emission_sd, tp, state, L);
-
- if (kernel_path_volume_bounce(kg, sd, &tp, &ps, &L->state, &pray)) {
- kernel_path_indirect(kg, indirect_sd, emission_sd, &pray, tp, &ps, L, object);
-
- /* for render passes, sum and reset indirect light pass variables
- * for the next samples */
- path_radiance_sum_indirect(L);
- path_radiance_reset_indirect(L);
- }
- }
-# endif /* __VOLUME_SCATTER__ */
- }
-
- /* todo: avoid this calculation using decoupled ray marching */
- kernel_volume_shadow(kg, emission_sd, state, &volume_ray, throughput);
- }
-}
-# endif /* __VOLUME__ */
-
-/* bounce off surface and integrate indirect light */
-ccl_device_noinline_cpu void kernel_branched_path_surface_indirect_light(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *indirect_sd,
- ShaderData *emission_sd,
- float3 throughput,
- float num_samples_adjust,
- PathState *state,
- PathRadiance *L)
-{
- float sum_sample_weight = 0.0f;
-# ifdef __DENOISING_FEATURES__
- if (state->denoising_feature_weight > 0.0f) {
- for (int i = 0; i < sd->num_closure; i++) {
- const ShaderClosure *sc = &sd->closure[i];
-
- /* transparency is not handled here, but in outer loop */
- if (!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
- continue;
- }
-
- sum_sample_weight += sc->sample_weight;
- }
- }
- else {
- sum_sample_weight = 1.0f;
- }
-# endif /* __DENOISING_FEATURES__ */
-
- for (int i = 0; i < sd->num_closure; i++) {
- const ShaderClosure *sc = &sd->closure[i];
-
- /* transparency is not handled here, but in outer loop */
- if (!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
- continue;
- }
-
- int num_samples;
-
- if (CLOSURE_IS_BSDF_DIFFUSE(sc->type))
- num_samples = kernel_data.integrator.diffuse_samples;
- else if (CLOSURE_IS_BSDF_BSSRDF(sc->type))
- num_samples = 1;
- else if (CLOSURE_IS_BSDF_GLOSSY(sc->type))
- num_samples = kernel_data.integrator.glossy_samples;
- else
- num_samples = kernel_data.integrator.transmission_samples;
-
- num_samples = ceil_to_int(num_samples_adjust * num_samples);
-
- float num_samples_inv = num_samples_adjust / num_samples;
-
- for (int j = 0; j < num_samples; j++) {
- PathState ps = *state;
- float3 tp = throughput;
- Ray bsdf_ray;
-# ifdef __SHADOW_TRICKS__
- float shadow_transparency = L->shadow_transparency;
-# endif
-
- ps.rng_hash = cmj_hash(state->rng_hash, i);
-
- if (!kernel_branched_path_surface_bounce(
- kg, sd, sc, j, num_samples, &tp, &ps, &L->state, &bsdf_ray, sum_sample_weight)) {
- continue;
- }
-
- ps.rng_hash = state->rng_hash;
-
- kernel_path_indirect(
- kg, indirect_sd, emission_sd, &bsdf_ray, tp * num_samples_inv, &ps, L, sd->object);
-
- /* for render passes, sum and reset indirect light pass variables
- * for the next samples */
- path_radiance_sum_indirect(L);
- path_radiance_reset_indirect(L);
-
-# ifdef __SHADOW_TRICKS__
- L->shadow_transparency = shadow_transparency;
-# endif
- }
- }
-}
-
-# ifdef __SUBSURFACE__
-ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *indirect_sd,
- ShaderData *emission_sd,
- PathRadiance *L,
- PathState *state,
- Ray *ray,
- float3 throughput)
-{
- for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
-
- if (!CLOSURE_IS_BSSRDF(sc->type))
- continue;
-
- /* set up random number generator */
- uint lcg_state = lcg_state_init(state, 0x68bc21eb);
- int num_samples = kernel_data.integrator.subsurface_samples * 3;
- float num_samples_inv = 1.0f / num_samples;
- uint bssrdf_rng_hash = cmj_hash(state->rng_hash, i);
-
- /* do subsurface scatter step with copy of shader data, this will
- * replace the BSSRDF with a diffuse BSDF closure */
- for (int j = 0; j < num_samples; j++) {
- PathState hit_state = *state;
- path_state_branch(&hit_state, j, num_samples);
- hit_state.rng_hash = bssrdf_rng_hash;
-
- LocalIntersection ss_isect;
- float bssrdf_u, bssrdf_v;
- path_state_rng_2D(kg, &hit_state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
- int num_hits = subsurface_scatter_multi_intersect(
- kg, &ss_isect, sd, &hit_state, sc, &lcg_state, bssrdf_u, bssrdf_v, true);
-
- hit_state.rng_offset += PRNG_BOUNCE_NUM;
-
-# ifdef __VOLUME__
- Ray volume_ray = *ray;
- bool need_update_volume_stack = kernel_data.integrator.use_volumes &&
- sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME;
-# endif /* __VOLUME__ */
-
- /* compute lighting with the BSDF closure */
- for (int hit = 0; hit < num_hits; hit++) {
- ShaderData bssrdf_sd = *sd;
- Bssrdf *bssrdf = (Bssrdf *)sc;
- ClosureType bssrdf_type = sc->type;
- float bssrdf_roughness = bssrdf->roughness;
- subsurface_scatter_multi_setup(
- kg, &ss_isect, hit, &bssrdf_sd, &hit_state, bssrdf_type, bssrdf_roughness);
-
-# ifdef __VOLUME__
- if (need_update_volume_stack) {
- /* Setup ray from previous surface point to the new one. */
- float3 P = ray_offset(bssrdf_sd.P, -bssrdf_sd.Ng);
- volume_ray.D = normalize_len(P - volume_ray.P, &volume_ray.t);
-
- for (int k = 0; k < VOLUME_STACK_SIZE; k++) {
- hit_state.volume_stack[k] = state->volume_stack[k];
- }
-
- kernel_volume_stack_update_for_subsurface(
- kg, emission_sd, &volume_ray, hit_state.volume_stack);
- }
-# endif /* __VOLUME__ */
-
-# ifdef __EMISSION__
- /* direct light */
- if (kernel_data.integrator.use_direct_light) {
- int all = (kernel_data.integrator.sample_all_lights_direct) ||
- (hit_state.flag & PATH_RAY_SHADOW_CATCHER);
- kernel_branched_path_surface_connect_light(
- kg, &bssrdf_sd, emission_sd, &hit_state, throughput, num_samples_inv, L, all);
- }
-# endif /* __EMISSION__ */
-
- /* indirect light */
- kernel_branched_path_surface_indirect_light(
- kg, &bssrdf_sd, indirect_sd, emission_sd, throughput, num_samples_inv, &hit_state, L);
- }
- }
- }
-}
-# endif /* __SUBSURFACE__ */
-
-ccl_device void kernel_branched_path_integrate(KernelGlobals *kg,
- uint rng_hash,
- int sample,
- Ray ray,
- ccl_global float *buffer,
- PathRadiance *L)
-{
- /* initialize */
- float3 throughput = one_float3();
-
- path_radiance_init(kg, L);
-
- /* shader data memory used for both volumes and surfaces, saves stack space */
- ShaderData sd;
- /* shader data used by emission, shadows, volume stacks, indirect path */
- ShaderDataTinyStorage emission_sd_storage;
- ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
- ShaderData indirect_sd;
-
- PathState state;
- path_state_init(kg, emission_sd, &state, rng_hash, sample, &ray);
-
- /* Main Loop
- * Here we only handle transparency intersections from the camera ray.
- * Indirect bounces are handled in kernel_branched_path_surface_indirect_light().
- */
- for (;;) {
- /* Find intersection with objects in scene. */
- Intersection isect;
- bool hit = kernel_path_scene_intersect(kg, &state, &ray, &isect, L, sd.object);
-
-# ifdef __VOLUME__
- /* Volume integration. */
- kernel_branched_path_volume(
- kg, &sd, &state, &ray, &throughput, &isect, hit, &indirect_sd, emission_sd, L);
-# endif /* __VOLUME__ */
-
- /* Shade background. */
- if (!hit) {
- kernel_path_background(kg, &state, &ray, throughput, &sd, buffer, L);
- break;
- }
-
- /* Setup and evaluate shader. */
- shader_setup_from_ray(kg, &sd, &isect, &ray);
-
- /* Skip most work for volume bounding surface. */
-# ifdef __VOLUME__
- if (!(sd.flag & SD_HAS_ONLY_VOLUME)) {
-# endif
-
- shader_eval_surface(kg, &sd, &state, buffer, state.flag);
- shader_merge_closures(&sd);
-
- /* Apply shadow catcher, holdout, emission. */
- if (!kernel_path_shader_apply(kg, &sd, &state, &ray, throughput, emission_sd, L, buffer)) {
- break;
- }
-
- /* transparency termination */
- if (state.flag & PATH_RAY_TRANSPARENT) {
- /* path termination. this is a strange place to put the termination, it's
- * mainly due to the mixed in MIS that we use. gives too many unneeded
- * shader evaluations, only need emission if we are going to terminate */
- float probability = path_state_continuation_probability(kg, &state, throughput);
-
- if (probability == 0.0f) {
- break;
- }
- else if (probability != 1.0f) {
- float terminate = path_state_rng_1D(kg, &state, PRNG_TERMINATE);
-
- if (terminate >= probability)
- break;
-
- throughput /= probability;
- }
- }
-
-# ifdef __DENOISING_FEATURES__
- kernel_update_denoising_features(kg, &sd, &state, L);
-# endif
-
-# ifdef __AO__
- /* ambient occlusion */
- if (kernel_data.integrator.use_ambient_occlusion) {
- kernel_branched_path_ao(kg, &sd, emission_sd, L, &state, throughput);
- }
-# endif /* __AO__ */
-
-# ifdef __SUBSURFACE__
- /* bssrdf scatter to a different location on the same object */
- if (sd.flag & SD_BSSRDF) {
- kernel_branched_path_subsurface_scatter(
- kg, &sd, &indirect_sd, emission_sd, L, &state, &ray, throughput);
- }
-# endif /* __SUBSURFACE__ */
-
- PathState hit_state = state;
-
-# ifdef __EMISSION__
- /* direct light */
- if (kernel_data.integrator.use_direct_light) {
- int all = (kernel_data.integrator.sample_all_lights_direct) ||
- (state.flag & PATH_RAY_SHADOW_CATCHER);
- kernel_branched_path_surface_connect_light(
- kg, &sd, emission_sd, &hit_state, throughput, 1.0f, L, all);
- }
-# endif /* __EMISSION__ */
-
- /* indirect light */
- kernel_branched_path_surface_indirect_light(
- kg, &sd, &indirect_sd, emission_sd, throughput, 1.0f, &hit_state, L);
-
- /* continue in case of transparency */
- throughput *= shader_bsdf_transparency(kg, &sd);
-
- if (is_zero(throughput))
- break;
-
- /* Update Path State */
- path_state_next(kg, &state, LABEL_TRANSPARENT);
-
-# ifdef __VOLUME__
- }
- else {
- if (!path_state_volume_next(kg, &state)) {
- break;
- }
- }
-# endif
-
- ray.P = ray_offset(sd.P, -sd.Ng);
- ray.t -= sd.ray_length; /* clipping works through transparent */
-
-# ifdef __RAY_DIFFERENTIALS__
- ray.dP = sd.dP;
- ray.dD.dx = -sd.dI.dx;
- ray.dD.dy = -sd.dI.dy;
-# endif /* __RAY_DIFFERENTIALS__ */
-
-# ifdef __VOLUME__
- /* enter/exit volume */
- kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack);
-# endif /* __VOLUME__ */
- }
-}
-
-ccl_device void kernel_branched_path_trace(
- KernelGlobals *kg, ccl_global float *buffer, int sample, int x, int y, int offset, int stride)
-{
- /* buffer offset */
- int index = offset + x + y * stride;
- int pass_stride = kernel_data.film.pass_stride;
-
- buffer += index * pass_stride;
-
- if (kernel_data.film.pass_adaptive_aux_buffer) {
- ccl_global float4 *aux = (ccl_global float4 *)(buffer +
- kernel_data.film.pass_adaptive_aux_buffer);
- if ((*aux).w > 0.0f) {
- return;
- }
- }
-
- /* initialize random numbers and ray */
- uint rng_hash;
- Ray ray;
-
- kernel_path_trace_setup(kg, sample, x, y, &rng_hash, &ray);
-
- /* integrate */
- PathRadiance L;
-
- if (ray.t != 0.0f) {
- kernel_branched_path_integrate(kg, rng_hash, sample, ray, buffer, &L);
- kernel_write_result(kg, buffer, sample, &L);
- }
-}
-
-# endif /* __SPLIT_KERNEL__ */
-
-#endif /* __BRANCHED_PATH__ */
-
-CCL_NAMESPACE_END
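
The AO code in the deleted branched kernel (and in kernel_path_ao earlier) draws directions with sample_cos_hemisphere and averages several shadow rays weighted by 1/num_samples. Below is a standalone sketch of cosine-weighted hemisphere sampling around a normal, using a minimal vector type; the Cycles version differs in its exact mapping and helpers, but follows the same cosine-weighted scheme with PDF cos(theta)/pi.

#include <cmath>
#include <cstdio>

struct Vec3 {
  float x, y, z;
};

static Vec3 cross(const Vec3 &a, const Vec3 &b)
{
  return {a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x};
}

static Vec3 normalize(const Vec3 &v)
{
  const float len = std::sqrt(v.x * v.x + v.y * v.y + v.z * v.z);
  return {v.x / len, v.y / len, v.z / len};
}

// Cosine-weighted hemisphere sample around normal N from two uniform random
// numbers u, v in [0, 1). Also returns the PDF cos(theta) / pi, which an AO
// estimator divides by (and which cancels against the cosine term).
static Vec3 sample_cos_hemisphere(const Vec3 &N, float u, float v, float *pdf)
{
  const float kPi = 3.14159265358979323846f;

  // Map the unit square to a disk (radius sqrt(u), angle 2*pi*v), then
  // project up onto the hemisphere.
  const float r = std::sqrt(u);
  const float phi = 2.0f * kPi * v;
  const float local_x = r * std::cos(phi);
  const float local_y = r * std::sin(phi);
  const float local_z = std::sqrt(std::max(0.0f, 1.0f - u));

  // Build an orthonormal basis around N.
  const Vec3 helper = (std::fabs(N.z) < 0.999f) ? Vec3{0.0f, 0.0f, 1.0f} : Vec3{1.0f, 0.0f, 0.0f};
  const Vec3 t = normalize(cross(helper, N));
  const Vec3 b = cross(N, t);

  *pdf = local_z / kPi;
  return {t.x * local_x + b.x * local_y + N.x * local_z,
          t.y * local_x + b.y * local_y + N.y * local_z,
          t.z * local_x + b.z * local_y + N.z * local_z};
}

int main()
{
  const Vec3 N = {0.0f, 0.0f, 1.0f};
  float pdf = 0.0f;
  const Vec3 d = sample_cos_hemisphere(N, 0.25f, 0.125f, &pdf);
  std::printf("dir=(%.3f, %.3f, %.3f) pdf=%.3f\n", d.x, d.y, d.z, pdf);
  return 0;
}
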
diff --git a/intern/cycles/kernel/kernel_path_common.h b/intern/cycles/kernel/kernel_path_common.h
deleted file mode 100644
index 815767595a9..00000000000
--- a/intern/cycles/kernel/kernel_path_common.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "util/util_hash.h"
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device_inline void kernel_path_trace_setup(
- KernelGlobals *kg, int sample, int x, int y, uint *rng_hash, ccl_addr_space Ray *ray)
-{
- float filter_u;
- float filter_v;
-
- int num_samples = kernel_data.integrator.aa_samples;
-
- path_rng_init(kg, sample, num_samples, rng_hash, x, y, &filter_u, &filter_v);
-
- /* sample camera ray */
-
- float lens_u = 0.0f, lens_v = 0.0f;
-
- if (kernel_data.cam.aperturesize > 0.0f)
- path_rng_2D(kg, *rng_hash, sample, num_samples, PRNG_LENS_U, &lens_u, &lens_v);
-
- float time = 0.0f;
-
-#ifdef __CAMERA_MOTION__
- if (kernel_data.cam.shuttertime != -1.0f)
- time = path_rng_1D(kg, *rng_hash, sample, num_samples, PRNG_TIME);
-#endif
-
- camera_sample(kg, x, y, filter_u, filter_v, lens_u, lens_v, time, ray);
-}
-
-CCL_NAMESPACE_END
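
kernel_path_trace_setup above turns a pixel coordinate, sample index and RNG hash into filter (and optionally lens/time) random numbers before sampling the camera ray. The sketch below shows the general idea with a deliberately simple hash-based RNG standing in for Cycles' actual sampling patterns, which it does not reproduce: each (pixel, sample, dimension) triple maps to a reproducible number in [0, 1) that jitters the subpixel position.

#include <cstdint>
#include <cstdio>

// Small 32-bit integer hash; a crude stand-in for the decorrelation that
// path_rng_init/path_rng_2D get from proper low-discrepancy patterns.
static uint32_t hash_u32(uint32_t v)
{
  v ^= v >> 16;
  v *= 0x7feb352dU;
  v ^= v >> 15;
  v *= 0x846ca68bU;
  v ^= v >> 16;
  return v;
}

// Reproducible random number in [0, 1) for one (pixel, sample, dimension).
static float rng_1d(uint32_t pixel_index, uint32_t sample, uint32_t dimension)
{
  const uint32_t h = hash_u32(pixel_index * 0x9e3779b9U + sample * 0x85ebca6bU + dimension);
  return float(h >> 8) * (1.0f / 16777216.0f);  // Keep a 24-bit mantissa range.
}

int main()
{
  const int x = 17, y = 42, stride = 1920;
  const uint32_t pixel_index = uint32_t(y) * stride + x;

  // Jittered subpixel raster positions for the first few samples of this pixel.
  for (uint32_t sample = 0; sample < 4; sample++) {
    const float filter_u = rng_1d(pixel_index, sample, 0);
    const float filter_v = rng_1d(pixel_index, sample, 1);
    std::printf("sample %u: raster = (%.3f, %.3f)\n", sample, x + filter_u, y + filter_v);
  }
  return 0;
}
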
diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h
index bf601580cd0..ebb2c0df4f1 100644
--- a/intern/cycles/kernel/kernel_path_state.h
+++ b/intern/cycles/kernel/kernel_path_state.h
@@ -14,99 +14,116 @@
* limitations under the License.
*/
-CCL_NAMESPACE_BEGIN
+#pragma once
-ccl_device_inline void path_state_init(KernelGlobals *kg,
- ShaderData *stack_sd,
- ccl_addr_space PathState *state,
- uint rng_hash,
- int sample,
- ccl_addr_space Ray *ray)
-{
- state->flag = PATH_RAY_CAMERA | PATH_RAY_MIS_SKIP | PATH_RAY_TRANSPARENT_BACKGROUND;
+#include "kernel_random.h"
- state->rng_hash = rng_hash;
- state->rng_offset = PRNG_BASE_NUM;
- state->sample = sample;
- state->num_samples = kernel_data.integrator.aa_samples;
- state->branch_factor = 1.0f;
+CCL_NAMESPACE_BEGIN
- state->bounce = 0;
- state->diffuse_bounce = 0;
- state->glossy_bounce = 0;
- state->transmission_bounce = 0;
- state->transparent_bounce = 0;
+/* Initialize queues, so that this path is considered terminated.
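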
+ * Used for early outputs in the camera ray initialization, as well as initialization of split
+ * states for shadow catcher. */
+ccl_device_inline void path_state_init_queues(INTEGRATOR_STATE_ARGS)
+{
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0;
+ INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0;
+}
-#ifdef __DENOISING_FEATURES__
- if (kernel_data.film.pass_denoising_data) {
- state->flag |= PATH_RAY_STORE_SHADOW_INFO;
- state->denoising_feature_weight = 1.0f;
- state->denoising_feature_throughput = one_float3();
- }
- else {
- state->denoising_feature_weight = 0.0f;
- state->denoising_feature_throughput = zero_float3();
- }
-#endif /* __DENOISING_FEATURES__ */
+/* Minimalistic initialization of the path state, which is needed for early outputs in the
+ * integrator initialization to work. */
+ccl_device_inline void path_state_init(INTEGRATOR_STATE_ARGS,
+ const ccl_global KernelWorkTile *ccl_restrict tile,
+ const int x,
+ const int y)
+{
+ const uint render_pixel_index = (uint)tile->offset + x + y * tile->stride;
- state->min_ray_pdf = FLT_MAX;
- state->ray_pdf = 0.0f;
-#ifdef __LAMP_MIS__
- state->ray_t = 0.0f;
-#endif
+ INTEGRATOR_STATE_WRITE(path, render_pixel_index) = render_pixel_index;
-#ifdef __VOLUME__
- state->volume_bounce = 0;
- state->volume_bounds_bounce = 0;
+ path_state_init_queues(INTEGRATOR_STATE_PASS);
+}
- if (kernel_data.integrator.use_volumes) {
- /* Initialize volume stack with volume we are inside of. */
- kernel_volume_stack_init(kg, stack_sd, state, ray, state->volume_stack);
+/* Initialize the rest of the path state needed to continue the path integration. */
+ccl_device_inline void path_state_init_integrator(INTEGRATOR_STATE_ARGS,
+ const int sample,
+ const uint rng_hash)
+{
+ INTEGRATOR_STATE_WRITE(path, sample) = sample;
+ INTEGRATOR_STATE_WRITE(path, bounce) = 0;
+ INTEGRATOR_STATE_WRITE(path, diffuse_bounce) = 0;
+ INTEGRATOR_STATE_WRITE(path, glossy_bounce) = 0;
+ INTEGRATOR_STATE_WRITE(path, transmission_bounce) = 0;
+ INTEGRATOR_STATE_WRITE(path, transparent_bounce) = 0;
+ INTEGRATOR_STATE_WRITE(path, volume_bounce) = 0;
+ INTEGRATOR_STATE_WRITE(path, volume_bounds_bounce) = 0;
+ INTEGRATOR_STATE_WRITE(path, rng_hash) = rng_hash;
+ INTEGRATOR_STATE_WRITE(path, rng_offset) = PRNG_BASE_NUM;
+ INTEGRATOR_STATE_WRITE(path, flag) = PATH_RAY_CAMERA | PATH_RAY_MIS_SKIP |
+ PATH_RAY_TRANSPARENT_BACKGROUND;
+ INTEGRATOR_STATE_WRITE(path, mis_ray_pdf) = 0.0f;
+ INTEGRATOR_STATE_WRITE(path, mis_ray_t) = 0.0f;
+ INTEGRATOR_STATE_WRITE(path, min_ray_pdf) = FLT_MAX;
+ INTEGRATOR_STATE_WRITE(path, throughput) = make_float3(1.0f, 1.0f, 1.0f);
+
+ if (kernel_data.kernel_features & KERNEL_FEATURE_VOLUME) {
+ INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, 0, object) = OBJECT_NONE;
+ INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, 0, shader) = kernel_data.background.volume_shader;
+ INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, 1, object) = OBJECT_NONE;
+ INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, 1, shader) = SHADER_NONE;
}
- else {
- state->volume_stack[0].shader = SHADER_NONE;
+
+#ifdef __DENOISING_FEATURES__
+ if (kernel_data.kernel_features & KERNEL_FEATURE_DENOISING) {
+ INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_DENOISING_FEATURES;
+ INTEGRATOR_STATE_WRITE(path, denoising_feature_throughput) = one_float3();
}
#endif
}
-ccl_device_inline void path_state_next(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- int label)
+ccl_device_inline void path_state_next(INTEGRATOR_STATE_ARGS, int label)
{
+ uint32_t flag = INTEGRATOR_STATE(path, flag);
+
/* ray through transparent keeps same flags from previous ray and is
* not counted as a regular bounce, transparent has separate max */
if (label & LABEL_TRANSPARENT) {
- state->flag |= PATH_RAY_TRANSPARENT;
- state->transparent_bounce++;
- if (state->transparent_bounce >= kernel_data.integrator.transparent_max_bounce) {
- state->flag |= PATH_RAY_TERMINATE_IMMEDIATE;
+ uint32_t transparent_bounce = INTEGRATOR_STATE(path, transparent_bounce) + 1;
+
+ flag |= PATH_RAY_TRANSPARENT;
+ if (transparent_bounce >= kernel_data.integrator.transparent_max_bounce) {
+ flag |= PATH_RAY_TERMINATE_ON_NEXT_SURFACE;
}
if (!kernel_data.integrator.transparent_shadows)
- state->flag |= PATH_RAY_MIS_SKIP;
-
- /* random number generator next bounce */
- state->rng_offset += PRNG_BOUNCE_NUM;
+ flag |= PATH_RAY_MIS_SKIP;
+ INTEGRATOR_STATE_WRITE(path, flag) = flag;
+ INTEGRATOR_STATE_WRITE(path, transparent_bounce) = transparent_bounce;
+ /* Random number generator next bounce. */
+ INTEGRATOR_STATE_WRITE(path, rng_offset) += PRNG_BOUNCE_NUM;
return;
}
- state->bounce++;
- if (state->bounce >= kernel_data.integrator.max_bounce) {
- state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
+ uint32_t bounce = INTEGRATOR_STATE(path, bounce) + 1;
+ if (bounce >= kernel_data.integrator.max_bounce) {
+ flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
}
- state->flag &= ~(PATH_RAY_ALL_VISIBILITY | PATH_RAY_MIS_SKIP);
+ flag &= ~(PATH_RAY_ALL_VISIBILITY | PATH_RAY_MIS_SKIP);
#ifdef __VOLUME__
if (label & LABEL_VOLUME_SCATTER) {
/* volume scatter */
- state->flag |= PATH_RAY_VOLUME_SCATTER;
- state->flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND;
+ flag |= PATH_RAY_VOLUME_SCATTER;
+ flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND;
+ if (bounce == 1) {
+ flag |= PATH_RAY_VOLUME_PASS;
+ }
- state->volume_bounce++;
- if (state->volume_bounce >= kernel_data.integrator.max_volume_bounce) {
- state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
+ const int volume_bounce = INTEGRATOR_STATE(path, volume_bounce) + 1;
+ INTEGRATOR_STATE_WRITE(path, volume_bounce) = volume_bounce;
+ if (volume_bounce >= kernel_data.integrator.max_volume_bounce) {
+ flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
}
}
else
@@ -114,163 +131,237 @@ ccl_device_inline void path_state_next(KernelGlobals *kg,
{
/* surface reflection/transmission */
if (label & LABEL_REFLECT) {
- state->flag |= PATH_RAY_REFLECT;
- state->flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND;
+ flag |= PATH_RAY_REFLECT;
+ flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND;
if (label & LABEL_DIFFUSE) {
- state->diffuse_bounce++;
- if (state->diffuse_bounce >= kernel_data.integrator.max_diffuse_bounce) {
- state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
+ const int diffuse_bounce = INTEGRATOR_STATE(path, diffuse_bounce) + 1;
+ INTEGRATOR_STATE_WRITE(path, diffuse_bounce) = diffuse_bounce;
+ if (diffuse_bounce >= kernel_data.integrator.max_diffuse_bounce) {
+ flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
}
}
else {
- state->glossy_bounce++;
- if (state->glossy_bounce >= kernel_data.integrator.max_glossy_bounce) {
- state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
+ const int glossy_bounce = INTEGRATOR_STATE(path, glossy_bounce) + 1;
+ INTEGRATOR_STATE_WRITE(path, glossy_bounce) = glossy_bounce;
+ if (glossy_bounce >= kernel_data.integrator.max_glossy_bounce) {
+ flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
}
}
}
else {
kernel_assert(label & LABEL_TRANSMIT);
- state->flag |= PATH_RAY_TRANSMIT;
+ flag |= PATH_RAY_TRANSMIT;
if (!(label & LABEL_TRANSMIT_TRANSPARENT)) {
- state->flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND;
+ flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND;
}
- state->transmission_bounce++;
- if (state->transmission_bounce >= kernel_data.integrator.max_transmission_bounce) {
- state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
+ const int transmission_bounce = INTEGRATOR_STATE(path, transmission_bounce) + 1;
+ INTEGRATOR_STATE_WRITE(path, transmission_bounce) = transmission_bounce;
+ if (transmission_bounce >= kernel_data.integrator.max_transmission_bounce) {
+ flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
}
}
/* diffuse/glossy/singular */
if (label & LABEL_DIFFUSE) {
- state->flag |= PATH_RAY_DIFFUSE | PATH_RAY_DIFFUSE_ANCESTOR;
+ flag |= PATH_RAY_DIFFUSE | PATH_RAY_DIFFUSE_ANCESTOR;
}
else if (label & LABEL_GLOSSY) {
- state->flag |= PATH_RAY_GLOSSY;
+ flag |= PATH_RAY_GLOSSY;
}
else {
kernel_assert(label & LABEL_SINGULAR);
- state->flag |= PATH_RAY_GLOSSY | PATH_RAY_SINGULAR | PATH_RAY_MIS_SKIP;
+ flag |= PATH_RAY_GLOSSY | PATH_RAY_SINGULAR | PATH_RAY_MIS_SKIP;
+ }
+
+ /* Render pass categories. */
+ if (bounce == 1) {
+ flag |= (label & LABEL_TRANSMIT) ? PATH_RAY_TRANSMISSION_PASS : PATH_RAY_REFLECT_PASS;
}
}
- /* random number generator next bounce */
- state->rng_offset += PRNG_BOUNCE_NUM;
+ INTEGRATOR_STATE_WRITE(path, flag) = flag;
+ INTEGRATOR_STATE_WRITE(path, bounce) = bounce;
-#ifdef __DENOISING_FEATURES__
- if ((state->denoising_feature_weight == 0.0f) && !(state->flag & PATH_RAY_SHADOW_CATCHER)) {
- state->flag &= ~PATH_RAY_STORE_SHADOW_INFO;
- }
-#endif
+ /* Random number generator next bounce. */
+ INTEGRATOR_STATE_WRITE(path, rng_offset) += PRNG_BOUNCE_NUM;
}
#ifdef __VOLUME__
-ccl_device_inline bool path_state_volume_next(KernelGlobals *kg, ccl_addr_space PathState *state)
+ccl_device_inline bool path_state_volume_next(INTEGRATOR_STATE_ARGS)
{
/* For volume bounding meshes we pass through without counting transparent
* bounces, only sanity check in case self intersection gets us stuck. */
- state->volume_bounds_bounce++;
- if (state->volume_bounds_bounce > VOLUME_BOUNDS_MAX) {
+ uint32_t volume_bounds_bounce = INTEGRATOR_STATE(path, volume_bounds_bounce) + 1;
+ INTEGRATOR_STATE_WRITE(path, volume_bounds_bounce) = volume_bounds_bounce;
+ if (volume_bounds_bounce > VOLUME_BOUNDS_MAX) {
return false;
}
/* Random number generator next bounce. */
- if (state->volume_bounds_bounce > 1) {
- state->rng_offset += PRNG_BOUNCE_NUM;
+ if (volume_bounds_bounce > 1) {
+ INTEGRATOR_STATE_WRITE(path, rng_offset) += PRNG_BOUNCE_NUM;
}
return true;
}
#endif
-ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg,
- ccl_addr_space PathState *state)
+ccl_device_inline uint path_state_ray_visibility(INTEGRATOR_STATE_CONST_ARGS)
{
- uint flag = state->flag & PATH_RAY_ALL_VISIBILITY;
+ const uint32_t path_flag = INTEGRATOR_STATE(path, flag);
- /* for visibility, diffuse/glossy are for reflection only */
- if (flag & PATH_RAY_TRANSMIT)
- flag &= ~(PATH_RAY_DIFFUSE | PATH_RAY_GLOSSY);
- /* todo: this is not supported as its own ray visibility yet */
- if (state->flag & PATH_RAY_VOLUME_SCATTER)
- flag |= PATH_RAY_DIFFUSE;
+ uint32_t visibility = path_flag & PATH_RAY_ALL_VISIBILITY;
- return flag;
+ /* For visibility, diffuse/glossy are for reflection only. */
+ if (visibility & PATH_RAY_TRANSMIT) {
+ visibility &= ~(PATH_RAY_DIFFUSE | PATH_RAY_GLOSSY);
+ }
+
+ /* todo: this is not supported as its own ray visibility yet. */
+ if (path_flag & PATH_RAY_VOLUME_SCATTER) {
+ visibility |= PATH_RAY_DIFFUSE;
+ }
+
+ visibility = SHADOW_CATCHER_PATH_VISIBILITY(path_flag, visibility);
+
+ return visibility;
}
-ccl_device_inline float path_state_continuation_probability(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- const float3 throughput)
+ccl_device_inline float path_state_continuation_probability(INTEGRATOR_STATE_CONST_ARGS,
+ const uint32_t path_flag)
{
- if (state->flag & PATH_RAY_TERMINATE_IMMEDIATE) {
- /* Ray is to be terminated immediately. */
- return 0.0f;
- }
- else if (state->flag & PATH_RAY_TRANSPARENT) {
+ if (path_flag & PATH_RAY_TRANSPARENT) {
+ const uint32_t transparent_bounce = INTEGRATOR_STATE(path, transparent_bounce);
/* Do at least specified number of bounces without RR. */
- if (state->transparent_bounce <= kernel_data.integrator.transparent_min_bounce) {
- return 1.0f;
- }
-#ifdef __SHADOW_TRICKS__
- /* Exception for shadow catcher not working correctly with RR. */
- else if ((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->transparent_bounce <= 8)) {
+ if (transparent_bounce <= kernel_data.integrator.transparent_min_bounce) {
return 1.0f;
}
-#endif
}
else {
+ const uint32_t bounce = INTEGRATOR_STATE(path, bounce);
/* Do at least specified number of bounces without RR. */
- if (state->bounce <= kernel_data.integrator.min_bounce) {
+ if (bounce <= kernel_data.integrator.min_bounce) {
return 1.0f;
}
-#ifdef __SHADOW_TRICKS__
- /* Exception for shadow catcher not working correctly with RR. */
- else if ((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->bounce <= 3)) {
- return 1.0f;
- }
-#endif
}
/* Probabilistic termination: use sqrt() to roughly match typical view
* transform and do path termination a bit later on average. */
- return min(sqrtf(max3(fabs(throughput)) * state->branch_factor), 1.0f);
+ return min(sqrtf(max3(fabs(INTEGRATOR_STATE(path, throughput)))), 1.0f);
}
-/* TODO(DingTo): Find more meaningful name for this */
-ccl_device_inline void path_state_modify_bounce(ccl_addr_space PathState *state, bool increase)
+ccl_device_inline bool path_state_ao_bounce(INTEGRATOR_STATE_CONST_ARGS)
{
- /* Modify bounce temporarily for shader eval */
- if (increase)
- state->bounce += 1;
- else
- state->bounce -= 1;
-}
-
-ccl_device_inline bool path_state_ao_bounce(KernelGlobals *kg, ccl_addr_space PathState *state)
-{
- if (state->bounce <= kernel_data.integrator.ao_bounces) {
+ if (!kernel_data.integrator.ao_bounces) {
return false;
}
- int bounce = state->bounce - state->transmission_bounce - (state->glossy_bounce > 0);
+ const int bounce = INTEGRATOR_STATE(path, bounce) - INTEGRATOR_STATE(path, transmission_bounce) -
+ (INTEGRATOR_STATE(path, glossy_bounce) > 0) + 1;
return (bounce > kernel_data.integrator.ao_bounces);
}
-ccl_device_inline void path_state_branch(ccl_addr_space PathState *state,
- int branch,
- int num_branches)
+/* Random Number Sampling Utility Functions
+ *
+ * For each random number in each step of the path we must have a unique
+ * dimension to avoid using the same sequence twice.
+ *
+ * For branches in the path we must be careful not to reuse the same number
+ * in a sequence and offset accordingly.
+ */
+
+/* RNG State loaded onto stack. */
+typedef struct RNGState {
+ uint rng_hash;
+ uint rng_offset;
+ int sample;
+} RNGState;
+
+ccl_device_inline void path_state_rng_load(INTEGRATOR_STATE_CONST_ARGS, RNGState *rng_state)
+{
+ rng_state->rng_hash = INTEGRATOR_STATE(path, rng_hash);
+ rng_state->rng_offset = INTEGRATOR_STATE(path, rng_offset);
+ rng_state->sample = INTEGRATOR_STATE(path, sample);
+}
+
+ccl_device_inline void shadow_path_state_rng_load(INTEGRATOR_STATE_CONST_ARGS, RNGState *rng_state)
+{
+ const uint shadow_bounces = INTEGRATOR_STATE(shadow_path, transparent_bounce) -
+ INTEGRATOR_STATE(path, transparent_bounce);
+
+ rng_state->rng_hash = INTEGRATOR_STATE(path, rng_hash);
+ rng_state->rng_offset = INTEGRATOR_STATE(path, rng_offset) + PRNG_BOUNCE_NUM * shadow_bounces;
+ rng_state->sample = INTEGRATOR_STATE(path, sample);
+}
+
+ccl_device_inline float path_state_rng_1D(const KernelGlobals *kg,
+ const RNGState *rng_state,
+ int dimension)
+{
+ return path_rng_1D(
+ kg, rng_state->rng_hash, rng_state->sample, rng_state->rng_offset + dimension);
+}
+
+ccl_device_inline void path_state_rng_2D(
+ const KernelGlobals *kg, const RNGState *rng_state, int dimension, float *fx, float *fy)
+{
+ path_rng_2D(
+ kg, rng_state->rng_hash, rng_state->sample, rng_state->rng_offset + dimension, fx, fy);
+}
+
+ccl_device_inline float path_state_rng_1D_hash(const KernelGlobals *kg,
+ const RNGState *rng_state,
+ uint hash)
+{
+ /* Use a hash instead of a dimension. This is not great, but it avoids adding
+ * more dimensions to each bounce, which would reduce the quality of the
+ * dimensions we are already using. */
+ return path_rng_1D(
+ kg, cmj_hash_simple(rng_state->rng_hash, hash), rng_state->sample, rng_state->rng_offset);
+}
+
+ccl_device_inline float path_branched_rng_1D(const KernelGlobals *kg,
+ const RNGState *rng_state,
+ int branch,
+ int num_branches,
+ int dimension)
+{
+ return path_rng_1D(kg,
+ rng_state->rng_hash,
+ rng_state->sample * num_branches + branch,
+ rng_state->rng_offset + dimension);
+}
+
+ccl_device_inline void path_branched_rng_2D(const KernelGlobals *kg,
+ const RNGState *rng_state,
+ int branch,
+ int num_branches,
+ int dimension,
+ float *fx,
+ float *fy)
+{
+ path_rng_2D(kg,
+ rng_state->rng_hash,
+ rng_state->sample * num_branches + branch,
+ rng_state->rng_offset + dimension,
+ fx,
+ fy);
+}
+
+/* Utility function to get the light termination value, which is computed
+ * lazily since it is not needed in many cases.
+ */
+ccl_device_inline float path_state_rng_light_termination(const KernelGlobals *kg,
+ const RNGState *state)
{
- if (num_branches > 1) {
- /* Path is splitting into a branch, adjust so that each branch
- * still gets a unique sample from the same sequence. */
- state->sample = state->sample * num_branches + branch;
- state->num_samples = state->num_samples * num_branches;
- state->branch_factor *= num_branches;
+ if (kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
+ return path_state_rng_1D(kg, state, PRNG_LIGHT_TERMINATE);
}
+ return 0.0f;
}
CCL_NAMESPACE_END
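
The RNG helpers above hinge on one indexing scheme: every random draw along a path uses a unique dimension, obtained by adding the per-path rng_offset (advanced by PRNG_BOUNCE_NUM at each bounce) to the draw's dimension constant, while branched draws fold the branch index into the sample index as sample * num_branches + branch. The sketch below illustrates only that indexing; the toy hash and the PRNG_* values are placeholders and do not stand in for the kernel's actual Sobol/PMJ samplers or enum values.

#include <cstdint>
#include <cstdio>

/* Placeholder dimension constants; the real PRNG_* enum values differ. */
enum { PRNG_BSDF_U = 0, PRNG_LIGHT_U = 2, PRNG_BOUNCE_NUM = 8 };

struct ToyRNGState {
  uint32_t rng_hash;   /* per-pixel seed */
  uint32_t rng_offset; /* advanced by PRNG_BOUNCE_NUM after every bounce */
  int sample;          /* sample index within the pixel */
};

/* Toy integer mix standing in for the Sobol/PMJ samplers, for illustration only. */
static float toy_rng_1d(uint32_t hash, int sample, int dimension)
{
  uint32_t n = hash ^ ((uint32_t)sample * 0x9e3779b9u) ^ ((uint32_t)dimension * 0x85ebca6bu);
  n ^= n >> 16;
  n *= 0x7feb352du;
  n ^= n >> 15;
  n *= 0x846ca68bu;
  n ^= n >> 16;
  return (float)(n >> 8) * (1.0f / 16777216.0f);
}

/* Like path_state_rng_1D: the dimension is offset by the per-path rng_offset. */
static float toy_state_rng_1d(const ToyRNGState &s, int dimension)
{
  return toy_rng_1d(s.rng_hash, s.sample, (int)s.rng_offset + dimension);
}

/* Like path_branched_rng_1D: the branch index is folded into the sample index,
 * so each branch draws a distinct value from the same dimension. */
static float toy_branched_rng_1d(const ToyRNGState &s, int branch, int num_branches, int dimension)
{
  return toy_rng_1d(s.rng_hash, s.sample * num_branches + branch, (int)s.rng_offset + dimension);
}

int main()
{
  ToyRNGState s = {0xdeadbeefu, 0, 3};
  printf("bounce 0, BSDF u: %f\n", toy_state_rng_1d(s, PRNG_BSDF_U));
  s.rng_offset += PRNG_BOUNCE_NUM; /* what path_state_next does after a bounce */
  printf("bounce 1, BSDF u: %f\n", toy_state_rng_1d(s, PRNG_BSDF_U));
  printf("branch 1 of 4:    %f\n", toy_branched_rng_1d(s, 1, 4, PRNG_LIGHT_U));
  return 0;
}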
diff --git a/intern/cycles/kernel/kernel_path_subsurface.h b/intern/cycles/kernel/kernel_path_subsurface.h
deleted file mode 100644
index 97d3f292ca3..00000000000
--- a/intern/cycles/kernel/kernel_path_subsurface.h
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Copyright 2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#ifdef __SUBSURFACE__
-# ifndef __KERNEL_CUDA__
-ccl_device
-# else
-ccl_device_inline
-# endif
- bool
- kernel_path_subsurface_scatter(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *emission_sd,
- PathRadiance *L,
- ccl_addr_space PathState *state,
- ccl_addr_space Ray *ray,
- ccl_addr_space float3 *throughput,
- ccl_addr_space SubsurfaceIndirectRays *ss_indirect)
-{
- PROFILING_INIT(kg, PROFILING_SUBSURFACE);
-
- float bssrdf_u, bssrdf_v;
- path_state_rng_2D(kg, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
-
- const ShaderClosure *sc = shader_bssrdf_pick(sd, throughput, &bssrdf_u);
-
- /* do bssrdf scatter step if we picked a bssrdf closure */
- if (sc) {
- /* We should never have two consecutive BSSRDF bounces,
- * the second one should be converted to a diffuse BSDF to
- * avoid this.
- */
- kernel_assert(!(state->flag & PATH_RAY_DIFFUSE_ANCESTOR));
-
- uint lcg_state = lcg_state_init_addrspace(state, 0x68bc21eb);
-
- LocalIntersection ss_isect;
- int num_hits = subsurface_scatter_multi_intersect(
- kg, &ss_isect, sd, state, sc, &lcg_state, bssrdf_u, bssrdf_v, false);
-# ifdef __VOLUME__
- bool need_update_volume_stack = kernel_data.integrator.use_volumes &&
- sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME;
-# endif /* __VOLUME__ */
-
- /* Closure memory will be overwritten, so read required variables now. */
- Bssrdf *bssrdf = (Bssrdf *)sc;
- ClosureType bssrdf_type = sc->type;
- float bssrdf_roughness = bssrdf->roughness;
-
- /* compute lighting with the BSDF closure */
- for (int hit = 0; hit < num_hits; hit++) {
- /* NOTE: We reuse the existing ShaderData, we assume the path
- * integration loop stops when this function returns true.
- */
- subsurface_scatter_multi_setup(kg, &ss_isect, hit, sd, state, bssrdf_type, bssrdf_roughness);
-
- kernel_path_surface_connect_light(kg, sd, emission_sd, *throughput, state, L);
-
- ccl_addr_space PathState *hit_state = &ss_indirect->state[ss_indirect->num_rays];
- ccl_addr_space Ray *hit_ray = &ss_indirect->rays[ss_indirect->num_rays];
- ccl_addr_space float3 *hit_tp = &ss_indirect->throughputs[ss_indirect->num_rays];
- PathRadianceState *hit_L_state = &ss_indirect->L_state[ss_indirect->num_rays];
-
- *hit_state = *state;
- *hit_ray = *ray;
- *hit_tp = *throughput;
- *hit_L_state = L->state;
-
- hit_state->rng_offset += PRNG_BOUNCE_NUM;
-
- if (kernel_path_surface_bounce(kg, sd, hit_tp, hit_state, hit_L_state, hit_ray)) {
-# ifdef __LAMP_MIS__
- hit_state->ray_t = 0.0f;
-# endif /* __LAMP_MIS__ */
-
-# ifdef __VOLUME__
- if (need_update_volume_stack) {
- Ray volume_ray = *ray;
- /* Setup ray from previous surface point to the new one. */
- volume_ray.D = normalize_len(hit_ray->P - volume_ray.P, &volume_ray.t);
-
- kernel_volume_stack_update_for_subsurface(
- kg, emission_sd, &volume_ray, hit_state->volume_stack);
- }
-# endif /* __VOLUME__ */
- ss_indirect->num_rays++;
- }
- }
- return true;
- }
- return false;
-}
-
-ccl_device_inline void kernel_path_subsurface_init_indirect(
- ccl_addr_space SubsurfaceIndirectRays *ss_indirect)
-{
- ss_indirect->num_rays = 0;
-}
-
-ccl_device void kernel_path_subsurface_setup_indirect(
- KernelGlobals *kg,
- ccl_addr_space SubsurfaceIndirectRays *ss_indirect,
- ccl_addr_space PathState *state,
- ccl_addr_space Ray *ray,
- PathRadiance *L,
- ccl_addr_space float3 *throughput)
-{
- /* Setup state, ray and throughput for indirect SSS rays. */
- ss_indirect->num_rays--;
-
- path_radiance_sum_indirect(L);
- path_radiance_reset_indirect(L);
-
- *state = ss_indirect->state[ss_indirect->num_rays];
- *ray = ss_indirect->rays[ss_indirect->num_rays];
- L->state = ss_indirect->L_state[ss_indirect->num_rays];
- *throughput = ss_indirect->throughputs[ss_indirect->num_rays];
-
- state->rng_offset += ss_indirect->num_rays * PRNG_BOUNCE_NUM;
-}
-
-#endif /* __SUBSURFACE__ */
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h
deleted file mode 100644
index ba48c0bdfc4..00000000000
--- a/intern/cycles/kernel/kernel_path_surface.h
+++ /dev/null
@@ -1,360 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__) || defined(__SHADOW_TRICKS__) || \
- defined(__BAKING__)
-/* branched path tracing: connect path directly to position on one or more lights and add it to L
- */
-ccl_device_noinline_cpu void kernel_branched_path_surface_connect_light(
- KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *emission_sd,
- ccl_addr_space PathState *state,
- float3 throughput,
- float num_samples_adjust,
- PathRadiance *L,
- int sample_all_lights)
-{
-# ifdef __EMISSION__
- /* sample illumination from lights to find path contribution */
- BsdfEval L_light ccl_optional_struct_init;
-
- int num_lights = 0;
- if (kernel_data.integrator.use_direct_light) {
- if (sample_all_lights) {
- num_lights = kernel_data.integrator.num_all_lights;
- if (kernel_data.integrator.pdf_triangles != 0.0f) {
- num_lights += 1;
- }
- }
- else {
- num_lights = 1;
- }
- }
-
- for (int i = 0; i < num_lights; i++) {
- /* sample one light at random */
- int num_samples = 1;
- int num_all_lights = 1;
- uint lamp_rng_hash = state->rng_hash;
- bool double_pdf = false;
- bool is_mesh_light = false;
- bool is_lamp = false;
-
- if (sample_all_lights) {
- /* lamp sampling */
- is_lamp = i < kernel_data.integrator.num_all_lights;
- if (is_lamp) {
- if (UNLIKELY(light_select_reached_max_bounces(kg, i, state->bounce))) {
- continue;
- }
- num_samples = ceil_to_int(num_samples_adjust * light_select_num_samples(kg, i));
- num_all_lights = kernel_data.integrator.num_all_lights;
- lamp_rng_hash = cmj_hash(state->rng_hash, i);
- double_pdf = kernel_data.integrator.pdf_triangles != 0.0f;
- }
- /* mesh light sampling */
- else {
- num_samples = ceil_to_int(num_samples_adjust * kernel_data.integrator.mesh_light_samples);
- double_pdf = kernel_data.integrator.num_all_lights != 0;
- is_mesh_light = true;
- }
- }
-
- float num_samples_inv = num_samples_adjust / (num_samples * num_all_lights);
-
- for (int j = 0; j < num_samples; j++) {
- Ray light_ray ccl_optional_struct_init;
- light_ray.t = 0.0f; /* reset ray */
-# ifdef __OBJECT_MOTION__
- light_ray.time = sd->time;
-# endif
- bool has_emission = false;
-
- if (kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL)) {
- float light_u, light_v;
- path_branched_rng_2D(
- kg, lamp_rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
- float terminate = path_branched_rng_light_termination(
- kg, lamp_rng_hash, state, j, num_samples);
-
- /* only sample triangle lights */
- if (is_mesh_light && double_pdf) {
- light_u = 0.5f * light_u;
- }
-
- LightSample ls ccl_optional_struct_init;
- const int lamp = is_lamp ? i : -1;
- if (light_sample(kg, lamp, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
- /* The sampling probability returned by lamp_light_sample assumes that all lights were
- * sampled. However, this code only samples lamps, so if the scene also had mesh lights,
- * the real probability is twice as high. */
- if (double_pdf) {
- ls.pdf *= 2.0f;
- }
-
- has_emission = direct_emission(
- kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate);
- }
- }
-
- /* trace shadow ray */
- float3 shadow;
-
- const bool blocked = shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow);
-
- if (has_emission) {
- if (!blocked) {
- /* accumulate */
- path_radiance_accum_light(kg,
- L,
- state,
- throughput * num_samples_inv,
- &L_light,
- shadow,
- num_samples_inv,
- is_lamp);
- }
- else {
- path_radiance_accum_total_light(L, state, throughput * num_samples_inv, &L_light);
- }
- }
- }
- }
-# endif
-}
-
-/* branched path tracing: bounce off or through surface to with new direction stored in ray */
-ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg,
- ShaderData *sd,
- const ShaderClosure *sc,
- int sample,
- int num_samples,
- ccl_addr_space float3 *throughput,
- ccl_addr_space PathState *state,
- PathRadianceState *L_state,
- ccl_addr_space Ray *ray,
- float sum_sample_weight)
-{
- /* sample BSDF */
- float bsdf_pdf;
- BsdfEval bsdf_eval ccl_optional_struct_init;
- float3 bsdf_omega_in ccl_optional_struct_init;
- differential3 bsdf_domega_in ccl_optional_struct_init;
- float bsdf_u, bsdf_v;
- path_branched_rng_2D(
- kg, state->rng_hash, state, sample, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
- int label;
-
- label = shader_bsdf_sample_closure(
- kg, sd, sc, bsdf_u, bsdf_v, &bsdf_eval, &bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf);
-
- if (bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval))
- return false;
-
- /* modify throughput */
- path_radiance_bsdf_bounce(kg, L_state, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
-
-# ifdef __DENOISING_FEATURES__
- state->denoising_feature_weight *= sc->sample_weight / (sum_sample_weight * num_samples);
-# endif
-
- /* modify path state */
- path_state_next(kg, state, label);
-
- /* setup ray */
- ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT) ? -sd->Ng : sd->Ng);
- ray->D = normalize(bsdf_omega_in);
- ray->t = FLT_MAX;
-# ifdef __RAY_DIFFERENTIALS__
- ray->dP = sd->dP;
- ray->dD = bsdf_domega_in;
-# endif
-# ifdef __OBJECT_MOTION__
- ray->time = sd->time;
-# endif
-
-# ifdef __VOLUME__
- /* enter/exit volume */
- if (label & LABEL_TRANSMIT)
- kernel_volume_stack_enter_exit(kg, sd, state->volume_stack);
-# endif
-
- /* branch RNG state */
- path_state_branch(state, sample, num_samples);
-
- /* set MIS state */
- state->min_ray_pdf = fminf(bsdf_pdf, FLT_MAX);
- state->ray_pdf = bsdf_pdf;
-# ifdef __LAMP_MIS__
- state->ray_t = 0.0f;
-# endif
-
- return true;
-}
-
-#endif
-
-/* path tracing: connect path directly to position on a light and add it to L */
-ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *emission_sd,
- float3 throughput,
- ccl_addr_space PathState *state,
- PathRadiance *L)
-{
- PROFILING_INIT(kg, PROFILING_CONNECT_LIGHT);
-
-#ifdef __EMISSION__
-# ifdef __SHADOW_TRICKS__
- int all = (state->flag & PATH_RAY_SHADOW_CATCHER);
- kernel_branched_path_surface_connect_light(kg, sd, emission_sd, state, throughput, 1.0f, L, all);
-# else
- /* sample illumination from lights to find path contribution */
- Ray light_ray ccl_optional_struct_init;
- BsdfEval L_light ccl_optional_struct_init;
- bool is_lamp = false;
- bool has_emission = false;
-
- light_ray.t = 0.0f;
-# ifdef __OBJECT_MOTION__
- light_ray.time = sd->time;
-# endif
-
- if (kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL)) {
- float light_u, light_v;
- path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v);
-
- LightSample ls ccl_optional_struct_init;
- if (light_sample(kg, -1, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
- float terminate = path_state_rng_light_termination(kg, state);
- has_emission = direct_emission(
- kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate);
- }
- }
-
- /* trace shadow ray */
- float3 shadow;
-
- const bool blocked = shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow);
-
- if (has_emission) {
- if (!blocked) {
- /* accumulate */
- path_radiance_accum_light(kg, L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
- }
- else {
- path_radiance_accum_total_light(L, state, throughput, &L_light);
- }
- }
-# endif
-#endif
-}
-
-/* path tracing: bounce off or through surface to with new direction stored in ray */
-ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space float3 *throughput,
- ccl_addr_space PathState *state,
- PathRadianceState *L_state,
- ccl_addr_space Ray *ray)
-{
- PROFILING_INIT(kg, PROFILING_SURFACE_BOUNCE);
-
- /* no BSDF? we can stop here */
- if (sd->flag & SD_BSDF) {
- /* sample BSDF */
- float bsdf_pdf;
- BsdfEval bsdf_eval ccl_optional_struct_init;
- float3 bsdf_omega_in ccl_optional_struct_init;
- differential3 bsdf_domega_in ccl_optional_struct_init;
- float bsdf_u, bsdf_v;
- path_state_rng_2D(kg, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
- int label;
-
- label = shader_bsdf_sample(
- kg, sd, bsdf_u, bsdf_v, &bsdf_eval, &bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf);
-
- if (bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval))
- return false;
-
- /* modify throughput */
- path_radiance_bsdf_bounce(kg, L_state, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
-
- /* set labels */
- if (!(label & LABEL_TRANSPARENT)) {
- state->ray_pdf = bsdf_pdf;
-#ifdef __LAMP_MIS__
- state->ray_t = 0.0f;
-#endif
- state->min_ray_pdf = fminf(bsdf_pdf, state->min_ray_pdf);
- }
-
- /* update path state */
- path_state_next(kg, state, label);
-
- /* setup ray */
- ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT) ? -sd->Ng : sd->Ng);
- ray->D = normalize(bsdf_omega_in);
-
- if (state->bounce == 0)
- ray->t -= sd->ray_length; /* clipping works through transparent */
- else
- ray->t = FLT_MAX;
-
-#ifdef __RAY_DIFFERENTIALS__
- ray->dP = sd->dP;
- ray->dD = bsdf_domega_in;
-#endif
-
-#ifdef __VOLUME__
- /* enter/exit volume */
- if (label & LABEL_TRANSMIT)
- kernel_volume_stack_enter_exit(kg, sd, state->volume_stack);
-#endif
- return true;
- }
-#ifdef __VOLUME__
- else if (sd->flag & SD_HAS_ONLY_VOLUME) {
- if (!path_state_volume_next(kg, state)) {
- return false;
- }
-
- if (state->bounce == 0)
- ray->t -= sd->ray_length; /* clipping works through transparent */
- else
- ray->t = FLT_MAX;
-
- /* setup ray position, direction stays unchanged */
- ray->P = ray_offset(sd->P, -sd->Ng);
-# ifdef __RAY_DIFFERENTIALS__
- ray->dP = sd->dP;
-# endif
-
- /* enter/exit volume */
- kernel_volume_stack_enter_exit(kg, sd, state->volume_stack);
- return true;
- }
-#endif
- else {
- /* no bsdf or volume? */
- return false;
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h
deleted file mode 100644
index a787910e65c..00000000000
--- a/intern/cycles/kernel/kernel_path_volume.h
+++ /dev/null
@@ -1,260 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#ifdef __VOLUME_SCATTER__
-
-ccl_device_inline void kernel_path_volume_connect_light(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *emission_sd,
- float3 throughput,
- ccl_addr_space PathState *state,
- PathRadiance *L)
-{
-# ifdef __EMISSION__
- /* sample illumination from lights to find path contribution */
- Ray light_ray ccl_optional_struct_init;
- BsdfEval L_light ccl_optional_struct_init;
- bool is_lamp = false;
- bool has_emission = false;
-
- light_ray.t = 0.0f;
-# ifdef __OBJECT_MOTION__
- /* connect to light from given point where shader has been evaluated */
- light_ray.time = sd->time;
-# endif
-
- if (kernel_data.integrator.use_direct_light) {
- float light_u, light_v;
- path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v);
-
- LightSample ls ccl_optional_struct_init;
- if (light_sample(kg, -1, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
- float terminate = path_state_rng_light_termination(kg, state);
- has_emission = direct_emission(
- kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate);
- }
- }
-
- /* trace shadow ray */
- float3 shadow;
-
- const bool blocked = shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow);
-
- if (has_emission && !blocked) {
- /* accumulate */
- path_radiance_accum_light(kg, L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
- }
-# endif /* __EMISSION__ */
-}
-
-ccl_device_noinline_cpu bool kernel_path_volume_bounce(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space float3 *throughput,
- ccl_addr_space PathState *state,
- PathRadianceState *L_state,
- ccl_addr_space Ray *ray)
-{
- /* sample phase function */
- float phase_pdf;
- BsdfEval phase_eval ccl_optional_struct_init;
- float3 phase_omega_in ccl_optional_struct_init;
- differential3 phase_domega_in ccl_optional_struct_init;
- float phase_u, phase_v;
- path_state_rng_2D(kg, state, PRNG_BSDF_U, &phase_u, &phase_v);
- int label;
-
- label = shader_volume_phase_sample(
- kg, sd, phase_u, phase_v, &phase_eval, &phase_omega_in, &phase_domega_in, &phase_pdf);
-
- if (phase_pdf == 0.0f || bsdf_eval_is_zero(&phase_eval))
- return false;
-
- /* modify throughput */
- path_radiance_bsdf_bounce(kg, L_state, throughput, &phase_eval, phase_pdf, state->bounce, label);
-
- /* set labels */
- state->ray_pdf = phase_pdf;
-# ifdef __LAMP_MIS__
- state->ray_t = 0.0f;
-# endif
- state->min_ray_pdf = fminf(phase_pdf, state->min_ray_pdf);
-
- /* update path state */
- path_state_next(kg, state, label);
-
- /* Russian roulette termination of volume ray scattering. */
- float probability = path_state_continuation_probability(kg, state, *throughput);
-
- if (probability == 0.0f) {
- return false;
- }
- else if (probability != 1.0f) {
- /* Use dimension from the previous bounce, has not been used yet. */
- float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE - PRNG_BOUNCE_NUM);
-
- if (terminate >= probability) {
- return false;
- }
-
- *throughput /= probability;
- }
-
- /* setup ray */
- ray->P = sd->P;
- ray->D = phase_omega_in;
- ray->t = FLT_MAX;
-
-# ifdef __RAY_DIFFERENTIALS__
- ray->dP = sd->dP;
- ray->dD = phase_domega_in;
-# endif
-
- return true;
-}
-
-# if !defined(__SPLIT_KERNEL__) && (defined(__BRANCHED_PATH__) || defined(__VOLUME_DECOUPLED__))
-ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *emission_sd,
- float3 throughput,
- ccl_addr_space PathState *state,
- PathRadiance *L,
- bool sample_all_lights,
- Ray *ray,
- const VolumeSegment *segment)
-{
-# ifdef __EMISSION__
- BsdfEval L_light ccl_optional_struct_init;
-
- int num_lights = 1;
- if (sample_all_lights) {
- num_lights = kernel_data.integrator.num_all_lights;
- if (kernel_data.integrator.pdf_triangles != 0.0f) {
- num_lights += 1;
- }
- }
-
- for (int i = 0; i < num_lights; ++i) {
- /* sample one light at random */
- int num_samples = 1;
- int num_all_lights = 1;
- uint lamp_rng_hash = state->rng_hash;
- bool double_pdf = false;
- bool is_mesh_light = false;
- bool is_lamp = false;
-
- if (sample_all_lights) {
- /* lamp sampling */
- is_lamp = i < kernel_data.integrator.num_all_lights;
- if (is_lamp) {
- if (UNLIKELY(light_select_reached_max_bounces(kg, i, state->bounce))) {
- continue;
- }
- num_samples = light_select_num_samples(kg, i);
- num_all_lights = kernel_data.integrator.num_all_lights;
- lamp_rng_hash = cmj_hash(state->rng_hash, i);
- double_pdf = kernel_data.integrator.pdf_triangles != 0.0f;
- }
- /* mesh light sampling */
- else {
- num_samples = kernel_data.integrator.mesh_light_samples;
- double_pdf = kernel_data.integrator.num_all_lights != 0;
- is_mesh_light = true;
- }
- }
-
- float num_samples_inv = 1.0f / (num_samples * num_all_lights);
-
- for (int j = 0; j < num_samples; j++) {
- Ray light_ray ccl_optional_struct_init;
- light_ray.t = 0.0f; /* reset ray */
-# ifdef __OBJECT_MOTION__
- light_ray.time = sd->time;
-# endif
- bool has_emission = false;
-
- float3 tp = throughput;
-
- if (kernel_data.integrator.use_direct_light) {
- /* sample random position on random light/triangle */
- float light_u, light_v;
- path_branched_rng_2D(
- kg, lamp_rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
-
- /* only sample triangle lights */
- if (is_mesh_light && double_pdf) {
- light_u = 0.5f * light_u;
- }
-
- LightSample ls ccl_optional_struct_init;
- const int lamp = is_lamp ? i : -1;
- light_sample(kg, lamp, light_u, light_v, sd->time, ray->P, state->bounce, &ls);
-
- /* sample position on volume segment */
- float rphase = path_branched_rng_1D(
- kg, state->rng_hash, state, j, num_samples, PRNG_PHASE_CHANNEL);
- float rscatter = path_branched_rng_1D(
- kg, state->rng_hash, state, j, num_samples, PRNG_SCATTER_DISTANCE);
-
- VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
- state,
- ray,
- sd,
- &tp,
- rphase,
- rscatter,
- segment,
- (ls.t != FLT_MAX) ? &ls.P :
- NULL,
- false);
-
- if (result == VOLUME_PATH_SCATTERED) {
- /* todo: split up light_sample so we don't have to call it again with new position */
- if (light_sample(kg, lamp, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
- if (double_pdf) {
- ls.pdf *= 2.0f;
- }
-
- /* sample random light */
- float terminate = path_branched_rng_light_termination(
- kg, state->rng_hash, state, j, num_samples);
- has_emission = direct_emission(
- kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate);
- }
- }
- }
-
- /* trace shadow ray */
- float3 shadow;
-
- const bool blocked = shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow);
-
- if (has_emission && !blocked) {
- /* accumulate */
- path_radiance_accum_light(
- kg, L, state, tp * num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
- }
- }
- }
-# endif /* __EMISSION__ */
-}
-# endif /* __SPLIT_KERNEL__ */
-
-#endif /* __VOLUME_SCATTER__ */
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_profiling.h b/intern/cycles/kernel/kernel_profiling.h
index 780830879d8..db8644005ea 100644
--- a/intern/cycles/kernel/kernel_profiling.h
+++ b/intern/cycles/kernel/kernel_profiling.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __KERNEL_PROFILING_H__
-#define __KERNEL_PROFILING_H__
+#pragma once
#ifdef __KERNEL_CPU__
# include "util/util_profiling.h"
@@ -24,23 +23,18 @@
CCL_NAMESPACE_BEGIN
#ifdef __KERNEL_CPU__
-# define PROFILING_INIT(kg, event) ProfilingHelper profiling_helper(&kg->profiler, event)
+# define PROFILING_INIT(kg, event) \
+ ProfilingHelper profiling_helper((ProfilingState *)&kg->profiler, event)
# define PROFILING_EVENT(event) profiling_helper.set_event(event)
-# define PROFILING_SHADER(shader) \
- if ((shader) != SHADER_NONE) { \
- profiling_helper.set_shader((shader)&SHADER_MASK); \
- }
-# define PROFILING_OBJECT(object) \
- if ((object) != PRIM_NONE) { \
- profiling_helper.set_object(object); \
- }
+# define PROFILING_INIT_FOR_SHADER(kg, event) \
+ ProfilingWithShaderHelper profiling_helper((ProfilingState *)&kg->profiler, event)
+# define PROFILING_SHADER(object, shader) \
+ profiling_helper.set_shader(object, (shader)&SHADER_MASK);
#else
# define PROFILING_INIT(kg, event)
# define PROFILING_EVENT(event)
-# define PROFILING_SHADER(shader)
-# define PROFILING_OBJECT(object)
+# define PROFILING_INIT_FOR_SHADER(kg, event)
+# define PROFILING_SHADER(object, shader)
#endif /* __KERNEL_CPU__ */
CCL_NAMESPACE_END
-
-#endif /* __KERNEL_PROFILING_H__ */
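
On the CPU, PROFILING_INIT now declares a scoped helper that marks which profiling event the kernel is executing for the lifetime of the enclosing scope, with PROFILING_EVENT switching the event mid-function and PROFILING_SHADER attaching object/shader information. A rough standalone sketch of that scoped-marker pattern follows; the ToyProfiling* names and the printf reporting are stand-ins, not the actual ProfilingHelper/ProfilingState API.

#include <cstdio>

/* Hypothetical profiler state: only remembers the current event name. */
struct ToyProfilingState {
  const char *current_event = "unknown";
};

/* Scoped helper in the spirit of PROFILING_INIT / PROFILING_EVENT: the constructor
 * records the event being executed, set_event() switches it mid-function, and the
 * destructor restores the previous event when the scope ends. */
class ToyProfilingHelper {
 public:
  ToyProfilingHelper(ToyProfilingState *state, const char *event)
      : state_(state), previous_(state->current_event)
  {
    state_->current_event = event;
  }
  ~ToyProfilingHelper()
  {
    state_->current_event = previous_;
  }
  void set_event(const char *event)
  {
    state_->current_event = event;
  }

 private:
  ToyProfilingState *state_;
  const char *previous_;
};

static void toy_kernel(ToyProfilingState *state)
{
  ToyProfilingHelper profiling(state, "intersect"); /* like PROFILING_INIT(kg, event) */
  printf("in: %s\n", state->current_event);
  profiling.set_event("shade"); /* like PROFILING_EVENT(event) */
  printf("in: %s\n", state->current_event);
} /* previous event restored when the helper goes out of scope */

int main()
{
  ToyProfilingState state;
  toy_kernel(&state);
  printf("after: %s\n", state.current_event);
  return 0;
}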
diff --git a/intern/cycles/kernel/kernel_projection.h b/intern/cycles/kernel/kernel_projection.h
index c33d7150b5c..192bf7ca5aa 100644
--- a/intern/cycles/kernel/kernel_projection.h
+++ b/intern/cycles/kernel/kernel_projection.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __KERNEL_PROJECTION_CL__
-#define __KERNEL_PROJECTION_CL__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -257,5 +256,3 @@ ccl_device_inline void spherical_stereo_transform(ccl_constant KernelCamera *cam
}
CCL_NAMESPACE_END
-
-#endif /* __KERNEL_PROJECTION_CL__ */
diff --git a/intern/cycles/kernel/kernel_queues.h b/intern/cycles/kernel/kernel_queues.h
deleted file mode 100644
index d8cc08b3e85..00000000000
--- a/intern/cycles/kernel/kernel_queues.h
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __KERNEL_QUEUE_H__
-#define __KERNEL_QUEUE_H__
-
-CCL_NAMESPACE_BEGIN
-
-/*
- * Queue utility functions for split kernel
- */
-#ifdef __KERNEL_OPENCL__
-# pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
-# pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
-#endif
-
-/*
- * Enqueue ray index into the queue
- */
-ccl_device void enqueue_ray_index(
- int ray_index, /* Ray index to be enqueued. */
- int queue_number, /* Queue in which the ray index should be enqueued. */
- ccl_global int *queues, /* Buffer of all queues. */
- int queue_size, /* Size of each queue. */
- ccl_global int *queue_index) /* Array of size num_queues; Used for atomic increment. */
-{
- /* This thread's queue index. */
- int my_queue_index = atomic_fetch_and_inc_uint32((ccl_global uint *)&queue_index[queue_number]) +
- (queue_number * queue_size);
- queues[my_queue_index] = ray_index;
-}
-
-/*
- * Get the ray index for this thread
- * Returns a positive ray_index for threads that have to do some work;
- * Returns 'QUEUE_EMPTY_SLOT' for threads that don't have any work
- * i.e All ray's in the queue has been successfully allocated and there
- * is no more ray to allocate to other threads.
- */
-ccl_device int get_ray_index(
- KernelGlobals *kg,
- int thread_index, /* Global thread index. */
- int queue_number, /* Queue to operate on. */
- ccl_global int *queues, /* Buffer of all queues. */
- int queuesize, /* Size of a queue. */
- int empty_queue) /* Empty the queue slot as soon as we fetch the ray index. */
-{
- int ray_index = queues[queue_number * queuesize + thread_index];
- if (empty_queue && ray_index != QUEUE_EMPTY_SLOT) {
- queues[queue_number * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
- }
- return ray_index;
-}
-
-/* The following functions are to realize Local memory variant of enqueue ray index function. */
-
-/* All threads should call this function. */
-ccl_device void enqueue_ray_index_local(
- int ray_index, /* Ray index to enqueue. */
- int queue_number, /* Queue in which to enqueue ray index. */
- char enqueue_flag, /* True for threads whose ray index has to be enqueued. */
- int queuesize, /* queue size. */
- ccl_local_param unsigned int *local_queue_atomics, /* To do local queue atomics. */
- ccl_global int *Queue_data, /* Queues. */
- ccl_global int *Queue_index) /* To do global queue atomics. */
-{
- int lidx = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0);
-
- /* Get local queue id. */
- unsigned int lqidx;
- if (enqueue_flag) {
- lqidx = atomic_fetch_and_inc_uint32(local_queue_atomics);
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- /* Get global queue offset. */
- if (lidx == 0) {
- *local_queue_atomics = atomic_fetch_and_add_uint32(
- (ccl_global uint *)&Queue_index[queue_number], *local_queue_atomics);
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- /* Get global queue index and enqueue ray. */
- if (enqueue_flag) {
- unsigned int my_gqidx = queue_number * queuesize + (*local_queue_atomics) + lqidx;
- Queue_data[my_gqidx] = ray_index;
- }
-}
-
-ccl_device unsigned int get_local_queue_index(
- int queue_number, /* Queue in which to enqueue the ray; -1 if no queue */
- ccl_local_param unsigned int *local_queue_atomics)
-{
- int my_lqidx = atomic_fetch_and_inc_uint32(&local_queue_atomics[queue_number]);
- return my_lqidx;
-}
-
-ccl_device unsigned int get_global_per_queue_offset(
- int queue_number,
- ccl_local_param unsigned int *local_queue_atomics,
- ccl_global int *global_queue_atomics)
-{
- unsigned int queue_offset = atomic_fetch_and_add_uint32(
- (ccl_global uint *)&global_queue_atomics[queue_number], local_queue_atomics[queue_number]);
- return queue_offset;
-}
-
-ccl_device unsigned int get_global_queue_index(
- int queue_number,
- int queuesize,
- unsigned int lqidx,
- ccl_local_param unsigned int *global_per_queue_offset)
-{
- int my_gqidx = queuesize * queue_number + lqidx + global_per_queue_offset[queue_number];
- return my_gqidx;
-}
-
-ccl_device int dequeue_ray_index(int queue_number,
- ccl_global int *queues,
- int queue_size,
- ccl_global int *queue_index)
-{
- int index = atomic_fetch_and_dec_uint32((ccl_global uint *)&queue_index[queue_number]) - 1;
-
- if (index < 0) {
- return QUEUE_EMPTY_SLOT;
- }
-
- return queues[index + queue_number * queue_size];
-}
-
-CCL_NAMESPACE_END
-
-#endif // __KERNEL_QUEUE_H__
diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h
index 49e5e25c2e0..240c92bf9d0 100644
--- a/intern/cycles/kernel/kernel_random.h
+++ b/intern/cycles/kernel/kernel_random.h
@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+#pragma once
#include "kernel/kernel_jitter.h"
#include "util/util_hash.h"
@@ -37,38 +38,34 @@ CCL_NAMESPACE_BEGIN
*/
# define SOBOL_SKIP 64
-ccl_device uint sobol_dimension(KernelGlobals *kg, int index, int dimension)
+ccl_device uint sobol_dimension(const KernelGlobals *kg, int index, int dimension)
{
uint result = 0;
uint i = index + SOBOL_SKIP;
for (int j = 0, x; (x = find_first_set(i)); i >>= x) {
j += x;
- result ^= kernel_tex_fetch(__sample_pattern_lut, 32 * dimension + j - 1);
+ result ^= __float_as_uint(kernel_tex_fetch(__sample_pattern_lut, 32 * dimension + j - 1));
}
return result;
}
#endif /* __SOBOL__ */
-ccl_device_forceinline float path_rng_1D(
- KernelGlobals *kg, uint rng_hash, int sample, int num_samples, int dimension)
+ccl_device_forceinline float path_rng_1D(const KernelGlobals *kg,
+ uint rng_hash,
+ int sample,
+ int dimension)
{
#ifdef __DEBUG_CORRELATION__
return (float)drand48();
#endif
- if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_PMJ) {
- return pmj_sample_1D(kg, sample, rng_hash, dimension);
- }
-#ifdef __CMJ__
-# ifdef __SOBOL__
- if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ)
-# endif
+
+#ifdef __SOBOL__
+ if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_PMJ)
+#endif
{
- /* Correlated multi-jitter. */
- int p = rng_hash + dimension;
- return cmj_sample_1D(sample, num_samples, p);
+ return pmj_sample_1D(kg, sample, rng_hash, dimension);
}
-#endif
#ifdef __SOBOL__
/* Sobol sequence value using direction vectors. */
@@ -88,68 +85,72 @@ ccl_device_forceinline float path_rng_1D(
#endif
}
-ccl_device_forceinline void path_rng_2D(KernelGlobals *kg,
- uint rng_hash,
- int sample,
- int num_samples,
- int dimension,
- float *fx,
- float *fy)
+ccl_device_forceinline void path_rng_2D(
+ const KernelGlobals *kg, uint rng_hash, int sample, int dimension, float *fx, float *fy)
{
#ifdef __DEBUG_CORRELATION__
*fx = (float)drand48();
*fy = (float)drand48();
return;
#endif
- if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_PMJ) {
- const float2 f = pmj_sample_2D(kg, sample, rng_hash, dimension);
- *fx = f.x;
- *fy = f.y;
- return;
- }
-#ifdef __CMJ__
-# ifdef __SOBOL__
- if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ)
-# endif
+
+#ifdef __SOBOL__
+ if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_PMJ)
+#endif
{
- /* Correlated multi-jitter. */
- int p = rng_hash + dimension;
- cmj_sample_2D(sample, num_samples, p, fx, fy);
+ pmj_sample_2D(kg, sample, rng_hash, dimension, fx, fy);
+
return;
}
-#endif
#ifdef __SOBOL__
/* Sobol. */
- *fx = path_rng_1D(kg, rng_hash, sample, num_samples, dimension);
- *fy = path_rng_1D(kg, rng_hash, sample, num_samples, dimension + 1);
+ *fx = path_rng_1D(kg, rng_hash, sample, dimension);
+ *fy = path_rng_1D(kg, rng_hash, sample, dimension + 1);
#endif
}
-ccl_device_inline void path_rng_init(KernelGlobals *kg,
- int sample,
- int num_samples,
- uint *rng_hash,
- int x,
- int y,
- float *fx,
- float *fy)
+/**
+ * 1D hash recommended in "Hash Functions for GPU Rendering", JCGT Vol. 9, No. 3, 2020
+ * See https://www.shadertoy.com/view/4tXyWN and https://www.shadertoy.com/view/XlGcRh
+ * http://www.jcgt.org/published/0009/03/02/paper.pdf
+ */
+ccl_device_inline uint hash_iqint1(uint n)
+{
+ n = (n << 13U) ^ n;
+ n = n * (n * n * 15731U + 789221U) + 1376312589U;
+
+ return n;
+}
+
+/**
+ * 2D hash recommended in "Hash Functions for GPU Rendering", JCGT Vol. 9, No. 3, 2020
+ * See https://www.shadertoy.com/view/4tXyWN and https://www.shadertoy.com/view/XlGcRh
+ * http://www.jcgt.org/published/0009/03/02/paper.pdf
+ */
+ccl_device_inline uint hash_iqnt2d(const uint x, const uint y)
{
- /* load state */
- *rng_hash = hash_uint2(x, y);
- *rng_hash ^= kernel_data.integrator.seed;
+ const uint qx = 1103515245U * ((x >> 1U) ^ (y));
+ const uint qy = 1103515245U * ((y >> 1U) ^ (x));
+ const uint n = 1103515245U * ((qx) ^ (qy >> 3U));
+
+ return n;
+}
+
+ccl_device_inline uint path_rng_hash_init(const KernelGlobals *ccl_restrict kg,
+ const int sample,
+ const int x,
+ const int y)
+{
+ const uint rng_hash = hash_iqnt2d(x, y) ^ kernel_data.integrator.seed;
#ifdef __DEBUG_CORRELATION__
- srand48(*rng_hash + sample);
+ srand48(rng_hash + sample);
+#else
+ (void)sample;
#endif
- if (sample == 0) {
- *fx = 0.5f;
- *fy = 0.5f;
- }
- else {
- path_rng_2D(kg, *rng_hash, sample, num_samples, PRNG_FILTER_U, fx, fy);
- }
+ return rng_hash;
}
/* Linear Congruential Generator */
@@ -175,113 +176,12 @@ ccl_device uint lcg_init(uint seed)
return rng;
}
-/* Path Tracing Utility Functions
- *
- * For each random number in each step of the path we must have a unique
- * dimension to avoid using the same sequence twice.
- *
- * For branches in the path we must be careful not to reuse the same number
- * in a sequence and offset accordingly.
- */
-
-ccl_device_inline float path_state_rng_1D(KernelGlobals *kg,
- const ccl_addr_space PathState *state,
- int dimension)
-{
- return path_rng_1D(
- kg, state->rng_hash, state->sample, state->num_samples, state->rng_offset + dimension);
-}
-
-ccl_device_inline void path_state_rng_2D(
- KernelGlobals *kg, const ccl_addr_space PathState *state, int dimension, float *fx, float *fy)
-{
- path_rng_2D(kg,
- state->rng_hash,
- state->sample,
- state->num_samples,
- state->rng_offset + dimension,
- fx,
- fy);
-}
-
-ccl_device_inline float path_state_rng_1D_hash(KernelGlobals *kg,
- const ccl_addr_space PathState *state,
- uint hash)
-{
- /* Use a hash instead of dimension, this is not great but avoids adding
- * more dimensions to each bounce which reduces quality of dimensions we
- * are already using. */
- return path_rng_1D(kg,
- cmj_hash_simple(state->rng_hash, hash),
- state->sample,
- state->num_samples,
- state->rng_offset);
-}
-
-ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg,
- uint rng_hash,
- const ccl_addr_space PathState *state,
- int branch,
- int num_branches,
- int dimension)
-{
- return path_rng_1D(kg,
- rng_hash,
- state->sample * num_branches + branch,
- state->num_samples * num_branches,
- state->rng_offset + dimension);
-}
-
-ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg,
- uint rng_hash,
- const ccl_addr_space PathState *state,
- int branch,
- int num_branches,
- int dimension,
- float *fx,
- float *fy)
-{
- path_rng_2D(kg,
- rng_hash,
- state->sample * num_branches + branch,
- state->num_samples * num_branches,
- state->rng_offset + dimension,
- fx,
- fy);
-}
-
-/* Utility functions to get light termination value,
- * since it might not be needed in many cases.
- */
-ccl_device_inline float path_state_rng_light_termination(KernelGlobals *kg,
- const ccl_addr_space PathState *state)
-{
- if (kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
- return path_state_rng_1D(kg, state, PRNG_LIGHT_TERMINATE);
- }
- return 0.0f;
-}
-
-ccl_device_inline float path_branched_rng_light_termination(KernelGlobals *kg,
- uint rng_hash,
- const ccl_addr_space PathState *state,
- int branch,
- int num_branches)
-{
- if (kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
- return path_branched_rng_1D(kg, rng_hash, state, branch, num_branches, PRNG_LIGHT_TERMINATE);
- }
- return 0.0f;
-}
-
-ccl_device_inline uint lcg_state_init(PathState *state, uint scramble)
-{
- return lcg_init(state->rng_hash + state->rng_offset + state->sample * scramble);
-}
-
-ccl_device_inline uint lcg_state_init_addrspace(ccl_addr_space PathState *state, uint scramble)
+ccl_device_inline uint lcg_state_init(const uint rng_hash,
+ const uint rng_offset,
+ const uint sample,
+ const uint scramble)
{
- return lcg_init(state->rng_hash + state->rng_offset + state->sample * scramble);
+ return lcg_init(rng_hash + rng_offset + sample * scramble);
}
ccl_device float lcg_step_float_addrspace(ccl_addr_space uint *rng)
@@ -301,8 +201,6 @@ ccl_device_inline bool sample_is_even(int pattern, int sample)
return __builtin_popcount(sample & 0xaaaaaaaa) & 1;
#elif defined(__NVCC__)
return __popc(sample & 0xaaaaaaaa) & 1;
-#elif defined(__KERNEL_OPENCL__)
- return popcount(sample & 0xaaaaaaaa) & 1;
#else
/* TODO(Stefan): pop-count intrinsic for Windows with fallback for older CPUs. */
int i = sample & 0xaaaaaaaa;
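
With path_rng_init replaced by path_rng_hash_init, each pixel's RNG is now seeded by hashing the pixel coordinates with hash_iqnt2d and XORing in the integrator seed. A small self-contained sketch of that seeding, reusing the same integer mix so the output can be inspected directly (the seed value below is arbitrary):

#include <cstdint>
#include <cstdio>

/* Same mix as hash_iqnt2d above ("Hash Functions for GPU Rendering", JCGT 9(3), 2020). */
static uint32_t iqnt2d(uint32_t x, uint32_t y)
{
  const uint32_t qx = 1103515245u * ((x >> 1u) ^ y);
  const uint32_t qy = 1103515245u * ((y >> 1u) ^ x);
  return 1103515245u * (qx ^ (qy >> 3u));
}

int main()
{
  const uint32_t seed = 0x12345678u; /* stands in for kernel_data.integrator.seed */
  for (uint32_t y = 0; y < 2; y++) {
    for (uint32_t x = 0; x < 2; x++) {
      /* Per-pixel hash as in path_rng_hash_init: hash_iqnt2d(x, y) ^ seed. */
      printf("pixel (%u, %u): rng_hash = 0x%08x\n", (unsigned)x, (unsigned)y, iqnt2d(x, y) ^ seed);
    }
  }
  return 0;
}

Neighbouring pixels get decorrelated hashes, and changing the scene seed re-randomizes every pixel without touching the sampling pattern itself.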
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index 7f02e6fc7b3..3052bb53040 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -14,14 +14,9 @@
* limitations under the License.
*/
-/*
- * ShaderData, used in four steps:
- *
- * Setup from incoming ray, sampled position and background.
- * Execute for surface, volume or displacement.
- * Evaluate one or more closures.
- * Release.
- */
+/* Functions to evaluate shaders and use the resulting shader closures. */
+
+#pragma once
// clang-format off
#include "kernel/closure/alloc.h"
@@ -30,479 +25,39 @@
#include "kernel/closure/emissive.h"
// clang-format on
+#include "kernel/kernel_accumulate.h"
#include "kernel/svm/svm.h"
-CCL_NAMESPACE_BEGIN
-
-/* ShaderData setup from incoming ray */
-
-#ifdef __OBJECT_MOTION__
-ccl_device void shader_setup_object_transforms(KernelGlobals *kg, ShaderData *sd, float time)
-{
- if (sd->object_flag & SD_OBJECT_MOTION) {
- sd->ob_tfm = object_fetch_transform_motion(kg, sd->object, time);
- sd->ob_itfm = transform_quick_inverse(sd->ob_tfm);
- }
- else {
- sd->ob_tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
- sd->ob_itfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
- }
-}
-#endif
-
-#ifdef __KERNEL_OPTIX__
-ccl_device_inline
-#else
-ccl_device_noinline
-#endif
- void
- shader_setup_from_ray(KernelGlobals *kg,
- ShaderData *sd,
- const Intersection *isect,
- const Ray *ray)
-{
- PROFILING_INIT(kg, PROFILING_SHADER_SETUP);
-
- sd->object = (isect->object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, isect->prim) :
- isect->object;
- sd->lamp = LAMP_NONE;
-
- sd->type = isect->type;
- sd->flag = 0;
- sd->object_flag = kernel_tex_fetch(__object_flag, sd->object);
-
- /* matrices and time */
-#ifdef __OBJECT_MOTION__
- shader_setup_object_transforms(kg, sd, ray->time);
-#endif
- sd->time = ray->time;
-
- sd->prim = kernel_tex_fetch(__prim_index, isect->prim);
- sd->ray_length = isect->t;
-
- sd->u = isect->u;
- sd->v = isect->v;
-
-#ifdef __HAIR__
- if (sd->type & PRIMITIVE_ALL_CURVE) {
- /* curve */
- curve_shader_setup(kg, sd, isect, ray);
- }
- else
-#endif
- if (sd->type & PRIMITIVE_TRIANGLE) {
- /* static triangle */
- float3 Ng = triangle_normal(kg, sd);
- sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
-
- /* vectors */
- sd->P = triangle_refine(kg, sd, isect, ray);
- sd->Ng = Ng;
- sd->N = Ng;
-
- /* smooth normal */
- if (sd->shader & SHADER_SMOOTH_NORMAL)
- sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
-
-#ifdef __DPDU__
- /* dPdu/dPdv */
- triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
-#endif
- }
- else {
- /* motion triangle */
- motion_triangle_shader_setup(kg, sd, isect, ray, false);
- }
-
- sd->I = -ray->D;
-
- sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
-
- if (isect->object != OBJECT_NONE) {
- /* instance transform */
- object_normal_transform_auto(kg, sd, &sd->N);
- object_normal_transform_auto(kg, sd, &sd->Ng);
-#ifdef __DPDU__
- object_dir_transform_auto(kg, sd, &sd->dPdu);
- object_dir_transform_auto(kg, sd, &sd->dPdv);
-#endif
- }
-
- /* backfacing test */
- bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
-
- if (backfacing) {
- sd->flag |= SD_BACKFACING;
- sd->Ng = -sd->Ng;
- sd->N = -sd->N;
-#ifdef __DPDU__
- sd->dPdu = -sd->dPdu;
- sd->dPdv = -sd->dPdv;
-#endif
- }
-
-#ifdef __RAY_DIFFERENTIALS__
- /* differentials */
- differential_transfer(&sd->dP, ray->dP, ray->D, ray->dD, sd->Ng, isect->t);
- differential_incoming(&sd->dI, ray->dD);
- differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng);
-#endif
-
- PROFILING_SHADER(sd->shader);
- PROFILING_OBJECT(sd->object);
-}
-
-/* ShaderData setup from BSSRDF scatter */
-
-#ifdef __SUBSURFACE__
-# ifndef __KERNEL_CUDA__
-ccl_device
-# else
-ccl_device_inline
-# endif
- void
- shader_setup_from_subsurface(KernelGlobals *kg,
- ShaderData *sd,
- const Intersection *isect,
- const Ray *ray)
-{
- PROFILING_INIT(kg, PROFILING_SHADER_SETUP);
-
- const bool backfacing = sd->flag & SD_BACKFACING;
-
- /* object, matrices, time, ray_length stay the same */
- sd->flag = 0;
- sd->object_flag = kernel_tex_fetch(__object_flag, sd->object);
- sd->prim = kernel_tex_fetch(__prim_index, isect->prim);
- sd->type = isect->type;
-
- sd->u = isect->u;
- sd->v = isect->v;
-
- /* fetch triangle data */
- if (sd->type == PRIMITIVE_TRIANGLE) {
- float3 Ng = triangle_normal(kg, sd);
- sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
-
- /* static triangle */
- sd->P = triangle_refine_local(kg, sd, isect, ray);
- sd->Ng = Ng;
- sd->N = Ng;
-
- if (sd->shader & SHADER_SMOOTH_NORMAL)
- sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
-
-# ifdef __DPDU__
- /* dPdu/dPdv */
- triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
-# endif
- }
- else {
- /* motion triangle */
- motion_triangle_shader_setup(kg, sd, isect, ray, true);
- }
-
- sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
-
- if (isect->object != OBJECT_NONE) {
- /* instance transform */
- object_normal_transform_auto(kg, sd, &sd->N);
- object_normal_transform_auto(kg, sd, &sd->Ng);
-# ifdef __DPDU__
- object_dir_transform_auto(kg, sd, &sd->dPdu);
- object_dir_transform_auto(kg, sd, &sd->dPdv);
-# endif
- }
-
- /* backfacing test */
- if (backfacing) {
- sd->flag |= SD_BACKFACING;
- sd->Ng = -sd->Ng;
- sd->N = -sd->N;
-# ifdef __DPDU__
- sd->dPdu = -sd->dPdu;
- sd->dPdv = -sd->dPdv;
-# endif
- }
-
- /* should not get used in principle as the shading will only use a diffuse
- * BSDF, but the shader might still access it */
- sd->I = sd->N;
-
-# ifdef __RAY_DIFFERENTIALS__
- /* differentials */
- differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng);
- /* don't modify dP and dI */
-# endif
-
- PROFILING_SHADER(sd->shader);
-}
-#endif
-
-/* ShaderData setup from position sampled on mesh */
-
-ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg,
- ShaderData *sd,
- const float3 P,
- const float3 Ng,
- const float3 I,
- int shader,
- int object,
- int prim,
- float u,
- float v,
- float t,
- float time,
- bool object_space,
- int lamp)
-{
- PROFILING_INIT(kg, PROFILING_SHADER_SETUP);
-
- /* vectors */
- sd->P = P;
- sd->N = Ng;
- sd->Ng = Ng;
- sd->I = I;
- sd->shader = shader;
- if (prim != PRIM_NONE)
- sd->type = PRIMITIVE_TRIANGLE;
- else if (lamp != LAMP_NONE)
- sd->type = PRIMITIVE_LAMP;
- else
- sd->type = PRIMITIVE_NONE;
-
- /* primitive */
- sd->object = object;
- sd->lamp = LAMP_NONE;
- /* Currently no access to bvh prim index for strand sd->prim. */
- sd->prim = prim;
- sd->u = u;
- sd->v = v;
- sd->time = time;
- sd->ray_length = t;
-
- sd->flag = kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
- sd->object_flag = 0;
- if (sd->object != OBJECT_NONE) {
- sd->object_flag |= kernel_tex_fetch(__object_flag, sd->object);
-
-#ifdef __OBJECT_MOTION__
- shader_setup_object_transforms(kg, sd, time);
- }
- else if (lamp != LAMP_NONE) {
- sd->ob_tfm = lamp_fetch_transform(kg, lamp, false);
- sd->ob_itfm = lamp_fetch_transform(kg, lamp, true);
- sd->lamp = lamp;
-#else
- }
- else if (lamp != LAMP_NONE) {
- sd->lamp = lamp;
-#endif
- }
-
- /* transform into world space */
- if (object_space) {
- object_position_transform_auto(kg, sd, &sd->P);
- object_normal_transform_auto(kg, sd, &sd->Ng);
- sd->N = sd->Ng;
- object_dir_transform_auto(kg, sd, &sd->I);
- }
-
- if (sd->type & PRIMITIVE_TRIANGLE) {
- /* smooth normal */
- if (sd->shader & SHADER_SMOOTH_NORMAL) {
- sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
-
- if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
- object_normal_transform_auto(kg, sd, &sd->N);
- }
- }
-
- /* dPdu/dPdv */
-#ifdef __DPDU__
- triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
-
- if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
- object_dir_transform_auto(kg, sd, &sd->dPdu);
- object_dir_transform_auto(kg, sd, &sd->dPdv);
- }
-#endif
- }
- else {
-#ifdef __DPDU__
- sd->dPdu = zero_float3();
- sd->dPdv = zero_float3();
-#endif
- }
-
- /* backfacing test */
- if (sd->prim != PRIM_NONE) {
- bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
-
- if (backfacing) {
- sd->flag |= SD_BACKFACING;
- sd->Ng = -sd->Ng;
- sd->N = -sd->N;
-#ifdef __DPDU__
- sd->dPdu = -sd->dPdu;
- sd->dPdv = -sd->dPdv;
-#endif
- }
- }
-
-#ifdef __RAY_DIFFERENTIALS__
- /* no ray differentials here yet */
- sd->dP = differential3_zero();
- sd->dI = differential3_zero();
- sd->du = differential_zero();
- sd->dv = differential_zero();
-#endif
-
- PROFILING_SHADER(sd->shader);
- PROFILING_OBJECT(sd->object);
-}
-
-/* ShaderData setup for displacement */
-
-ccl_device void shader_setup_from_displace(
- KernelGlobals *kg, ShaderData *sd, int object, int prim, float u, float v)
-{
- float3 P, Ng, I = zero_float3();
- int shader;
-
- triangle_point_normal(kg, object, prim, u, v, &P, &Ng, &shader);
-
- /* force smooth shading for displacement */
- shader |= SHADER_SMOOTH_NORMAL;
-
- shader_setup_from_sample(
- kg,
- sd,
- P,
- Ng,
- I,
- shader,
- object,
- prim,
- u,
- v,
- 0.0f,
- 0.5f,
- !(kernel_tex_fetch(__object_flag, object) & SD_OBJECT_TRANSFORM_APPLIED),
- LAMP_NONE);
-}
-
-/* ShaderData setup from ray into background */
-
-ccl_device_inline void shader_setup_from_background(KernelGlobals *kg,
- ShaderData *sd,
- const Ray *ray)
-{
- PROFILING_INIT(kg, PROFILING_SHADER_SETUP);
-
- /* vectors */
- sd->P = ray->D;
- sd->N = -ray->D;
- sd->Ng = -ray->D;
- sd->I = -ray->D;
- sd->shader = kernel_data.background.surface_shader;
- sd->flag = kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
- sd->object_flag = 0;
- sd->time = ray->time;
- sd->ray_length = 0.0f;
-
- sd->object = OBJECT_NONE;
- sd->lamp = LAMP_NONE;
- sd->prim = PRIM_NONE;
- sd->u = 0.0f;
- sd->v = 0.0f;
-
-#ifdef __DPDU__
- /* dPdu/dPdv */
- sd->dPdu = zero_float3();
- sd->dPdv = zero_float3();
-#endif
-
-#ifdef __RAY_DIFFERENTIALS__
- /* differentials */
- sd->dP = ray->dD;
- differential_incoming(&sd->dI, sd->dP);
- sd->du = differential_zero();
- sd->dv = differential_zero();
+#ifdef __OSL__
+# include "kernel/osl/osl_shader.h"
#endif
- /* for NDC coordinates */
- sd->ray_P = ray->P;
-
- PROFILING_SHADER(sd->shader);
- PROFILING_OBJECT(sd->object);
-}
-
-/* ShaderData setup from point inside volume */
-
-#ifdef __VOLUME__
-ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *sd, const Ray *ray)
-{
- PROFILING_INIT(kg, PROFILING_SHADER_SETUP);
-
- /* vectors */
- sd->P = ray->P;
- sd->N = -ray->D;
- sd->Ng = -ray->D;
- sd->I = -ray->D;
- sd->shader = SHADER_NONE;
- sd->flag = 0;
- sd->object_flag = 0;
- sd->time = ray->time;
- sd->ray_length = 0.0f; /* todo: can we set this to some useful value? */
-
- sd->object = OBJECT_NONE; /* todo: fill this for texture coordinates */
- sd->lamp = LAMP_NONE;
- sd->prim = PRIM_NONE;
- sd->type = PRIMITIVE_NONE;
-
- sd->u = 0.0f;
- sd->v = 0.0f;
-
-# ifdef __DPDU__
- /* dPdu/dPdv */
- sd->dPdu = zero_float3();
- sd->dPdv = zero_float3();
-# endif
-
-# ifdef __RAY_DIFFERENTIALS__
- /* differentials */
- sd->dP = ray->dD;
- differential_incoming(&sd->dI, sd->dP);
- sd->du = differential_zero();
- sd->dv = differential_zero();
-# endif
-
- /* for NDC coordinates */
- sd->ray_P = ray->P;
- sd->ray_dP = ray->dP;
-
- PROFILING_SHADER(sd->shader);
- PROFILING_OBJECT(sd->object);
-}
-#endif /* __VOLUME__ */
+CCL_NAMESPACE_BEGIN
/* Merging */
-#if defined(__BRANCHED_PATH__) || defined(__VOLUME__)
-ccl_device_inline void shader_merge_closures(ShaderData *sd)
+#if defined(__VOLUME__)
+ccl_device_inline void shader_merge_volume_closures(ShaderData *sd)
{
- /* merge identical closures, better when we sample a single closure at a time */
+ /* Merge identical closures to save closure space with stacked volumes. */
for (int i = 0; i < sd->num_closure; i++) {
ShaderClosure *sci = &sd->closure[i];
+ if (sci->type != CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) {
+ continue;
+ }
+
for (int j = i + 1; j < sd->num_closure; j++) {
ShaderClosure *scj = &sd->closure[j];
-
- if (sci->type != scj->type)
+ if (sci->type != scj->type) {
continue;
- if (!bsdf_merge(sci, scj))
+ }
+
+ const HenyeyGreensteinVolume *hgi = (const HenyeyGreensteinVolume *)sci;
+ const HenyeyGreensteinVolume *hgj = (const HenyeyGreensteinVolume *)scj;
+ if (!(hgi->g == hgj->g)) {
continue;
+ }
sci->weight += scj->weight;
sci->sample_weight += scj->sample_weight;
@@ -520,16 +75,40 @@ ccl_device_inline void shader_merge_closures(ShaderData *sd)
}
}
}
-#endif /* __BRANCHED_PATH__ || __VOLUME__ */
-/* Defensive sampling. */
+ccl_device_inline void shader_copy_volume_phases(ShaderVolumePhases *ccl_restrict phases,
+ const ShaderData *ccl_restrict sd)
+{
+ phases->num_closure = 0;
+
+ for (int i = 0; i < sd->num_closure; i++) {
+ const ShaderClosure *from_sc = &sd->closure[i];
+ const HenyeyGreensteinVolume *from_hg = (const HenyeyGreensteinVolume *)from_sc;
+
+ if (from_sc->type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) {
+ ShaderVolumeClosure *to_sc = &phases->closure[phases->num_closure];
+
+ to_sc->weight = from_sc->weight;
+ to_sc->sample_weight = from_sc->sample_weight;
+ to_sc->g = from_hg->g;
+ phases->num_closure++;
+ if (phases->num_closure >= MAX_VOLUME_CLOSURE) {
+ break;
+ }
+ }
+ }
+}
+#endif /* __VOLUME__ */
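The rewritten merge above only coalesces Henyey-Greenstein closures whose anisotropy g matches exactly, which keeps stacked volumes from overflowing the fixed-size closure array. A minimal standalone sketch of the same idea, using a local VolumeClosure struct rather than the Cycles types:

    #include <cstddef>
    #include <vector>

    struct VolumeClosure {
      float weight;         /* scattering weight (scalar here for brevity) */
      float sample_weight;  /* weight used when picking a closure to sample */
      float g;              /* Henyey-Greenstein anisotropy */
    };

    /* Merge closures with identical g so stacked volumes share a single slot;
     * the kernel compacts in place instead of erasing from a vector. */
    void merge_identical_phases(std::vector<VolumeClosure> &closures) {
      for (size_t i = 0; i < closures.size(); i++) {
        for (size_t j = i + 1; j < closures.size();) {
          if (closures[i].g == closures[j].g) {
            closures[i].weight += closures[j].weight;
            closures[i].sample_weight += closures[j].sample_weight;
            closures.erase(closures.begin() + j);
          }
          else {
            j++;
          }
        }
      }
    }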
-ccl_device_inline void shader_prepare_closures(ShaderData *sd, ccl_addr_space PathState *state)
+ccl_device_inline void shader_prepare_surface_closures(INTEGRATOR_STATE_CONST_ARGS, ShaderData *sd)
{
- /* We can likely also do defensive sampling at deeper bounces, particularly
+ /* Defensive sampling.
+ *
+ * We can likely also do defensive sampling at deeper bounces, particularly
* for cases like a perfect mirror but possibly also others. This will need
* a good heuristic. */
- if (state->bounce + state->transparent_bounce == 0 && sd->num_closure > 1) {
+ if (INTEGRATOR_STATE(path, bounce) + INTEGRATOR_STATE(path, transparent_bounce) == 0 &&
+ sd->num_closure > 1) {
float sum = 0.0f;
for (int i = 0; i < sd->num_closure; i++) {
@@ -546,98 +125,119 @@ ccl_device_inline void shader_prepare_closures(ShaderData *sd, ccl_addr_space Pa
}
}
}
+
+ /* Filter glossy.
+ *
+ * Blurring of bsdf after bounces, for rays that have a small likelihood
+ * of following this particular path (diffuse, rough glossy) */
+ if (kernel_data.integrator.filter_glossy != FLT_MAX) {
+ float blur_pdf = kernel_data.integrator.filter_glossy * INTEGRATOR_STATE(path, min_ray_pdf);
+
+ if (blur_pdf < 1.0f) {
+ float blur_roughness = sqrtf(1.0f - blur_pdf) * 0.5f;
+
+ for (int i = 0; i < sd->num_closure; i++) {
+ ShaderClosure *sc = &sd->closure[i];
+ if (CLOSURE_IS_BSDF(sc->type)) {
+ bsdf_blur(kg, sc, blur_roughness);
+ }
+ }
+ }
+ }
}
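The filter_glossy branch is a roughness clamp driven by the path's minimum pdf so far: the less likely the path prefix, the more the glossy closures get blurred. A standalone sketch of that mapping, assuming only the two inputs shown above:

    #include <cmath>

    /* Extra roughness applied to glossy closures on unlikely paths, mirroring
     * the blur_pdf / blur_roughness computation above (sketch only; this is
     * not the kernel's bsdf_blur). */
    float filter_glossy_blur_roughness(float filter_glossy, float min_ray_pdf) {
      const float blur_pdf = filter_glossy * min_ray_pdf;
      if (blur_pdf >= 1.0f) {
        return 0.0f; /* path is likely enough: leave the closures sharp */
      }
      return std::sqrt(1.0f - blur_pdf) * 0.5f;
    }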
/* BSDF */
-ccl_device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg,
- ShaderData *sd,
- const float3 omega_in,
- float *pdf,
- const ShaderClosure *skip_sc,
- BsdfEval *result_eval,
- float sum_pdf,
- float sum_sample_weight)
+ccl_device_inline bool shader_bsdf_is_transmission(const ShaderData *sd, const float3 omega_in)
+{
+ return dot(sd->N, omega_in) < 0.0f;
+}
+
+ccl_device_forceinline bool _shader_bsdf_exclude(ClosureType type, uint light_shader_flags)
+{
+ if (!(light_shader_flags & SHADER_EXCLUDE_ANY)) {
+ return false;
+ }
+ if (light_shader_flags & SHADER_EXCLUDE_DIFFUSE) {
+ if (CLOSURE_IS_BSDF_DIFFUSE(type) || CLOSURE_IS_BSDF_BSSRDF(type)) {
+ return true;
+ }
+ }
+ if (light_shader_flags & SHADER_EXCLUDE_GLOSSY) {
+ if (CLOSURE_IS_BSDF_GLOSSY(type)) {
+ return true;
+ }
+ }
+ if (light_shader_flags & SHADER_EXCLUDE_TRANSMIT) {
+ if (CLOSURE_IS_BSDF_TRANSMISSION(type)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+ccl_device_inline float _shader_bsdf_multi_eval(const KernelGlobals *kg,
+ ShaderData *sd,
+ const float3 omega_in,
+ const bool is_transmission,
+ const ShaderClosure *skip_sc,
+ BsdfEval *result_eval,
+ float sum_pdf,
+ float sum_sample_weight,
+ const uint light_shader_flags)
{
  /* This is the Veach one-sample model with balance heuristic; some pdf
* factors drop out when using balance heuristic weighting */
for (int i = 0; i < sd->num_closure; i++) {
const ShaderClosure *sc = &sd->closure[i];
- if (sc != skip_sc && CLOSURE_IS_BSDF(sc->type)) {
- float bsdf_pdf = 0.0f;
- float3 eval = bsdf_eval(kg, sd, sc, omega_in, &bsdf_pdf);
+ if (sc == skip_sc) {
+ continue;
+ }
+
+ if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+ if (CLOSURE_IS_BSDF(sc->type) && !_shader_bsdf_exclude(sc->type, light_shader_flags)) {
+ float bsdf_pdf = 0.0f;
+ float3 eval = bsdf_eval(kg, sd, sc, omega_in, is_transmission, &bsdf_pdf);
- if (bsdf_pdf != 0.0f) {
- bsdf_eval_accum(result_eval, sc->type, eval * sc->weight, 1.0f);
- sum_pdf += bsdf_pdf * sc->sample_weight;
+ if (bsdf_pdf != 0.0f) {
+ const bool is_diffuse = (CLOSURE_IS_BSDF_DIFFUSE(sc->type) ||
+ CLOSURE_IS_BSDF_BSSRDF(sc->type));
+ bsdf_eval_accum(result_eval, is_diffuse, eval * sc->weight, 1.0f);
+ sum_pdf += bsdf_pdf * sc->sample_weight;
+ }
}
sum_sample_weight += sc->sample_weight;
}
}
- *pdf = (sum_sample_weight > 0.0f) ? sum_pdf / sum_sample_weight : 0.0f;
-}
-
-#ifdef __BRANCHED_PATH__
-ccl_device_inline void _shader_bsdf_multi_eval_branched(KernelGlobals *kg,
- ShaderData *sd,
- const float3 omega_in,
- BsdfEval *result_eval,
- float light_pdf,
- bool use_mis)
-{
- for (int i = 0; i < sd->num_closure; i++) {
- const ShaderClosure *sc = &sd->closure[i];
- if (CLOSURE_IS_BSDF(sc->type)) {
- float bsdf_pdf = 0.0f;
- float3 eval = bsdf_eval(kg, sd, sc, omega_in, &bsdf_pdf);
- if (bsdf_pdf != 0.0f) {
- float mis_weight = use_mis ? power_heuristic(light_pdf, bsdf_pdf) : 1.0f;
- bsdf_eval_accum(result_eval, sc->type, eval * sc->weight, mis_weight);
- }
- }
- }
+ return (sum_sample_weight > 0.0f) ? sum_pdf / sum_sample_weight : 0.0f;
}
-#endif /* __BRANCHED_PATH__ */
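The value returned by _shader_bsdf_multi_eval is the Veach one-sample pdf: each closure's pdf is weighted by its sample_weight and normalized by the total weight, which is exactly what the balance heuristic needs when only one closure was actually sampled. A compact standalone restatement:

    #include <vector>

    struct ClosurePdf {
      float pdf;            /* pdf of the sampled direction under this closure */
      float sample_weight;  /* probability weight used to pick closures */
    };

    /* One-sample balance heuristic: sample-weight-averaged pdf over closures. */
    float combined_pdf(const std::vector<ClosurePdf> &closures) {
      float sum_pdf = 0.0f, sum_weight = 0.0f;
      for (const ClosurePdf &c : closures) {
        sum_pdf += c.pdf * c.sample_weight;
        sum_weight += c.sample_weight;
      }
      return (sum_weight > 0.0f) ? sum_pdf / sum_weight : 0.0f;
    }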
#ifndef __KERNEL_CUDA__
ccl_device
#else
ccl_device_inline
#endif
- void
- shader_bsdf_eval(KernelGlobals *kg,
+ float
+ shader_bsdf_eval(const KernelGlobals *kg,
ShaderData *sd,
const float3 omega_in,
- BsdfEval *eval,
- float light_pdf,
- bool use_mis)
+ const bool is_transmission,
+ BsdfEval *bsdf_eval,
+ const uint light_shader_flags)
{
- PROFILING_INIT(kg, PROFILING_CLOSURE_EVAL);
-
- bsdf_eval_init(eval, NBUILTIN_CLOSURES, zero_float3(), kernel_data.film.use_light_pass);
+ bsdf_eval_init(bsdf_eval, false, zero_float3());
-#ifdef __BRANCHED_PATH__
- if (kernel_data.integrator.branched)
- _shader_bsdf_multi_eval_branched(kg, sd, omega_in, eval, light_pdf, use_mis);
- else
-#endif
- {
- float pdf;
- _shader_bsdf_multi_eval(kg, sd, omega_in, &pdf, NULL, eval, 0.0f, 0.0f);
- if (use_mis) {
- float weight = power_heuristic(light_pdf, pdf);
- bsdf_eval_mis(eval, weight);
- }
- }
+ return _shader_bsdf_multi_eval(
+ kg, sd, omega_in, is_transmission, NULL, bsdf_eval, 0.0f, 0.0f, light_shader_flags);
}
-ccl_device_inline const ShaderClosure *shader_bsdf_pick(ShaderData *sd, float *randu)
+/* Randomly sample a BSSRDF or BSDF proportional to ShaderClosure.sample_weight. */
+ccl_device_inline const ShaderClosure *shader_bsdf_bssrdf_pick(const ShaderData *ccl_restrict sd,
+ float *randu)
{
- /* Note the sampling here must match shader_bssrdf_pick,
- * since we reuse the same random number. */
int sampled = 0;
if (sd->num_closure > 1) {
@@ -674,106 +274,33 @@ ccl_device_inline const ShaderClosure *shader_bsdf_pick(ShaderData *sd, float *r
}
}
- const ShaderClosure *sc = &sd->closure[sampled];
- return CLOSURE_IS_BSDF(sc->type) ? sc : NULL;
+ return &sd->closure[sampled];
}
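The pick loop elided by the hunk above selects a closure with probability proportional to its sample_weight and then rescales randu back into [0, 1) so the same random number can drive the direction sample without losing stratification. A standalone sketch of that pattern:

    #include <vector>

    /* Weighted discrete pick with random-number rescaling (sketch of the
     * elided loop; the kernel iterates sd->closure instead of a vector). */
    int pick_weighted(const std::vector<float> &sample_weight, float *randu) {
      float sum = 0.0f;
      for (const float w : sample_weight) {
        sum += w;
      }

      const float r = *randu * sum;
      float partial_sum = 0.0f;
      for (size_t i = 0; i < sample_weight.size(); i++) {
        const float next_sum = partial_sum + sample_weight[i];
        if (r < next_sum) {
          /* Rescale to reuse for the direction sample. */
          *randu = (r - partial_sum) / sample_weight[i];
          return (int)i;
        }
        partial_sum = next_sum;
      }
      return (int)sample_weight.size() - 1; /* numerical fallback */
    }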
-ccl_device_inline const ShaderClosure *shader_bssrdf_pick(ShaderData *sd,
- ccl_addr_space float3 *throughput,
- float *randu)
+/* Return weight for picked BSSRDF. */
+ccl_device_inline float3 shader_bssrdf_sample_weight(const ShaderData *ccl_restrict sd,
+ const ShaderClosure *ccl_restrict bssrdf_sc)
{
- /* Note the sampling here must match shader_bsdf_pick,
- * since we reuse the same random number. */
- int sampled = 0;
+ float3 weight = bssrdf_sc->weight;
if (sd->num_closure > 1) {
- /* Pick a BSDF or BSSRDF or based on sample weights. */
- float sum_bsdf = 0.0f;
- float sum_bssrdf = 0.0f;
-
- for (int i = 0; i < sd->num_closure; i++) {
- const ShaderClosure *sc = &sd->closure[i];
-
- if (CLOSURE_IS_BSDF(sc->type)) {
- sum_bsdf += sc->sample_weight;
- }
- else if (CLOSURE_IS_BSSRDF(sc->type)) {
- sum_bssrdf += sc->sample_weight;
- }
- }
-
- float r = (*randu) * (sum_bsdf + sum_bssrdf);
- float partial_sum = 0.0f;
-
+ float sum = 0.0f;
for (int i = 0; i < sd->num_closure; i++) {
const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
- float next_sum = partial_sum + sc->sample_weight;
-
- if (r < next_sum) {
- if (CLOSURE_IS_BSDF(sc->type)) {
- *throughput *= (sum_bsdf + sum_bssrdf) / sum_bsdf;
- return NULL;
- }
- else {
- *throughput *= (sum_bsdf + sum_bssrdf) / sum_bssrdf;
- sampled = i;
-
- /* Rescale to reuse for direction sample, to better preserve stratification. */
- *randu = (r - partial_sum) / sc->sample_weight;
- break;
- }
- }
-
- partial_sum = next_sum;
+ sum += sc->sample_weight;
}
}
+ weight *= sum / bssrdf_sc->sample_weight;
}
- const ShaderClosure *sc = &sd->closure[sampled];
- return CLOSURE_IS_BSSRDF(sc->type) ? sc : NULL;
-}
-
-ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg,
- ShaderData *sd,
- float randu,
- float randv,
- BsdfEval *bsdf_eval,
- float3 *omega_in,
- differential3 *domega_in,
- float *pdf)
-{
- PROFILING_INIT(kg, PROFILING_CLOSURE_SAMPLE);
-
- const ShaderClosure *sc = shader_bsdf_pick(sd, &randu);
- if (sc == NULL) {
- *pdf = 0.0f;
- return LABEL_NONE;
- }
-
- /* BSSRDF should already have been handled elsewhere. */
- kernel_assert(CLOSURE_IS_BSDF(sc->type));
-
- int label;
- float3 eval = zero_float3();
-
- *pdf = 0.0f;
- label = bsdf_sample(kg, sd, sc, randu, randv, &eval, omega_in, domega_in, pdf);
-
- if (*pdf != 0.0f) {
- bsdf_eval_init(bsdf_eval, sc->type, eval * sc->weight, kernel_data.film.use_light_pass);
-
- if (sd->num_closure > 1) {
- float sweight = sc->sample_weight;
- _shader_bsdf_multi_eval(kg, sd, *omega_in, pdf, sc, bsdf_eval, *pdf * sweight, sweight);
- }
- }
-
- return label;
+ return weight;
}
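Multiplying the picked BSSRDF's weight by sum / sample_weight compensates for having chosen it with probability sample_weight / sum, so the estimator stays unbiased: with sample weights 0.75 and 0.25, the smaller closure is picked one path in four, but its contribution is scaled by 4 on those paths. A standalone check of that identity:

    /* Expected contribution when picking closure i with probability w_i / W and
     * scaling its weight by W / w_i equals simply summing both weights. */
    float expected_contribution(float w0, float w1, float weight0, float weight1) {
      const float W = w0 + w1;
      const float p0 = w0 / W;
      const float p1 = w1 / W;
      return p0 * (weight0 * W / w0) + p1 * (weight1 * W / w1); /* == weight0 + weight1 */
    }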
-ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg,
+/* Sample direction for picked BSDF, and return evaluation and pdf for all
+ * BSDFs combined using MIS. */
+ccl_device int shader_bsdf_sample_closure(const KernelGlobals *kg,
ShaderData *sd,
const ShaderClosure *sc,
float randu,
@@ -783,7 +310,8 @@ ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg,
differential3 *domega_in,
float *pdf)
{
- PROFILING_INIT(kg, PROFILING_CLOSURE_SAMPLE);
+ /* BSSRDF should already have been handled elsewhere. */
+ kernel_assert(CLOSURE_IS_BSDF(sc->type));
int label;
float3 eval = zero_float3();
@@ -791,19 +319,29 @@ ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg,
*pdf = 0.0f;
label = bsdf_sample(kg, sd, sc, randu, randv, &eval, omega_in, domega_in, pdf);
- if (*pdf != 0.0f)
- bsdf_eval_init(bsdf_eval, sc->type, eval * sc->weight, kernel_data.film.use_light_pass);
+ if (*pdf != 0.0f) {
+ const bool is_diffuse = (CLOSURE_IS_BSDF_DIFFUSE(sc->type) ||
+ CLOSURE_IS_BSDF_BSSRDF(sc->type));
+ bsdf_eval_init(bsdf_eval, is_diffuse, eval * sc->weight);
+
+ if (sd->num_closure > 1) {
+ const bool is_transmission = shader_bsdf_is_transmission(sd, *omega_in);
+ float sweight = sc->sample_weight;
+ *pdf = _shader_bsdf_multi_eval(
+ kg, sd, *omega_in, is_transmission, sc, bsdf_eval, *pdf * sweight, sweight, 0);
+ }
+ }
return label;
}
-ccl_device float shader_bsdf_average_roughness(ShaderData *sd)
+ccl_device float shader_bsdf_average_roughness(const ShaderData *sd)
{
float roughness = 0.0f;
float sum_weight = 0.0f;
for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_BSDF(sc->type)) {
/* sqrt once to undo the squaring from multiplying roughness on the
@@ -817,17 +355,7 @@ ccl_device float shader_bsdf_average_roughness(ShaderData *sd)
return (sum_weight > 0.0f) ? roughness / sum_weight : 0.0f;
}
-ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughness)
-{
- for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
-
- if (CLOSURE_IS_BSDF(sc->type))
- bsdf_blur(kg, sc, roughness);
- }
-}
-
-ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, const ShaderData *sd)
+ccl_device float3 shader_bsdf_transparency(const KernelGlobals *kg, const ShaderData *sd)
{
if (sd->flag & SD_HAS_ONLY_VOLUME) {
return one_float3();
@@ -840,7 +368,7 @@ ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, const ShaderData *
}
}
-ccl_device void shader_bsdf_disable_transparency(KernelGlobals *kg, ShaderData *sd)
+ccl_device void shader_bsdf_disable_transparency(const KernelGlobals *kg, ShaderData *sd)
{
if (sd->flag & SD_TRANSPARENT) {
for (int i = 0; i < sd->num_closure; i++) {
@@ -856,7 +384,7 @@ ccl_device void shader_bsdf_disable_transparency(KernelGlobals *kg, ShaderData *
}
}
-ccl_device float3 shader_bsdf_alpha(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 shader_bsdf_alpha(const KernelGlobals *kg, const ShaderData *sd)
{
float3 alpha = one_float3() - shader_bsdf_transparency(kg, sd);
@@ -866,12 +394,12 @@ ccl_device float3 shader_bsdf_alpha(KernelGlobals *kg, ShaderData *sd)
return alpha;
}
-ccl_device float3 shader_bsdf_diffuse(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 shader_bsdf_diffuse(const KernelGlobals *kg, const ShaderData *sd)
{
float3 eval = zero_float3();
for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_BSDF_DIFFUSE(sc->type) || CLOSURE_IS_BSSRDF(sc->type) ||
CLOSURE_IS_BSDF_BSSRDF(sc->type))
@@ -881,12 +409,12 @@ ccl_device float3 shader_bsdf_diffuse(KernelGlobals *kg, ShaderData *sd)
return eval;
}
-ccl_device float3 shader_bsdf_glossy(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 shader_bsdf_glossy(const KernelGlobals *kg, const ShaderData *sd)
{
float3 eval = zero_float3();
for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_BSDF_GLOSSY(sc->type))
eval += sc->weight;
@@ -895,12 +423,12 @@ ccl_device float3 shader_bsdf_glossy(KernelGlobals *kg, ShaderData *sd)
return eval;
}
-ccl_device float3 shader_bsdf_transmission(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 shader_bsdf_transmission(const KernelGlobals *kg, const ShaderData *sd)
{
float3 eval = zero_float3();
for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_BSDF_TRANSMISSION(sc->type))
eval += sc->weight;
@@ -909,12 +437,12 @@ ccl_device float3 shader_bsdf_transmission(KernelGlobals *kg, ShaderData *sd)
return eval;
}
-ccl_device float3 shader_bsdf_average_normal(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 shader_bsdf_average_normal(const KernelGlobals *kg, const ShaderData *sd)
{
float3 N = zero_float3();
for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type))
N += sc->N * fabsf(average(sc->weight));
}
@@ -922,59 +450,44 @@ ccl_device float3 shader_bsdf_average_normal(KernelGlobals *kg, ShaderData *sd)
return (is_zero(N)) ? sd->N : normalize(N);
}
-ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_factor, float3 *N_)
+ccl_device float3 shader_bsdf_ao_normal(const KernelGlobals *kg, const ShaderData *sd)
{
- float3 eval = zero_float3();
float3 N = zero_float3();
for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
-
+ const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_BSDF_DIFFUSE(sc->type)) {
const DiffuseBsdf *bsdf = (const DiffuseBsdf *)sc;
- eval += sc->weight * ao_factor;
N += bsdf->N * fabsf(average(sc->weight));
}
}
- *N_ = (is_zero(N)) ? sd->N : normalize(N);
- return eval;
+ return (is_zero(N)) ? sd->N : normalize(N);
}
#ifdef __SUBSURFACE__
-ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_blur_)
+ccl_device float3 shader_bssrdf_normal(const ShaderData *sd)
{
- float3 eval = zero_float3();
float3 N = zero_float3();
- float texture_blur = 0.0f, weight_sum = 0.0f;
for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_BSSRDF(sc->type)) {
const Bssrdf *bssrdf = (const Bssrdf *)sc;
float avg_weight = fabsf(average(sc->weight));
N += bssrdf->N * avg_weight;
- eval += sc->weight;
- texture_blur += bssrdf->texture_blur * avg_weight;
- weight_sum += avg_weight;
}
}
- if (N_)
- *N_ = (is_zero(N)) ? sd->N : normalize(N);
-
- if (texture_blur_)
- *texture_blur_ = safe_divide(texture_blur, weight_sum);
-
- return eval;
+ return (is_zero(N)) ? sd->N : normalize(N);
}
#endif /* __SUBSURFACE__ */
/* Constant emission optimization */
-ccl_device bool shader_constant_emission_eval(KernelGlobals *kg, int shader, float3 *eval)
+ccl_device bool shader_constant_emission_eval(const KernelGlobals *kg, int shader, float3 *eval)
{
int shader_index = shader & SHADER_MASK;
int shader_flag = kernel_tex_fetch(__shaders, shader_index).flags;
@@ -992,7 +505,7 @@ ccl_device bool shader_constant_emission_eval(KernelGlobals *kg, int shader, flo
/* Background */
-ccl_device float3 shader_background_eval(ShaderData *sd)
+ccl_device float3 shader_background_eval(const ShaderData *sd)
{
if (sd->flag & SD_EMISSION) {
return sd->closure_emission_background;
@@ -1004,7 +517,7 @@ ccl_device float3 shader_background_eval(ShaderData *sd)
/* Emission */
-ccl_device float3 shader_emissive_eval(ShaderData *sd)
+ccl_device float3 shader_emissive_eval(const ShaderData *sd)
{
if (sd->flag & SD_EMISSION) {
return emissive_simple_eval(sd->Ng, sd->I) * sd->closure_emission_background;
@@ -1016,7 +529,7 @@ ccl_device float3 shader_emissive_eval(ShaderData *sd)
/* Holdout */
-ccl_device float3 shader_holdout_apply(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 shader_holdout_apply(const KernelGlobals *kg, ShaderData *sd)
{
float3 weight = zero_float3();
@@ -1041,7 +554,7 @@ ccl_device float3 shader_holdout_apply(KernelGlobals *kg, ShaderData *sd)
}
else {
for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_HOLDOUT(sc->type)) {
weight += sc->weight;
}
@@ -1053,14 +566,12 @@ ccl_device float3 shader_holdout_apply(KernelGlobals *kg, ShaderData *sd)
/* Surface Evaluation */
-ccl_device void shader_eval_surface(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- ccl_global float *buffer,
+template<uint node_feature_mask>
+ccl_device void shader_eval_surface(INTEGRATOR_STATE_CONST_ARGS,
+ ShaderData *ccl_restrict sd,
+ ccl_global float *ccl_restrict buffer,
int path_flag)
{
- PROFILING_INIT(kg, PROFILING_SHADER_EVAL);
-
/* If path is being terminated, we are tracing a shadow ray or evaluating
* emission, then we don't need to store closures. The emission and shadow
* shader data also do not have a closure array to save GPU memory. */
@@ -1069,7 +580,7 @@ ccl_device void shader_eval_surface(KernelGlobals *kg,
max_closures = 0;
}
else {
- max_closures = kernel_data.integrator.max_closures;
+ max_closures = kernel_data.max_closures;
}
sd->num_closure = 0;
@@ -1078,17 +589,18 @@ ccl_device void shader_eval_surface(KernelGlobals *kg,
#ifdef __OSL__
if (kg->osl) {
if (sd->object == OBJECT_NONE && sd->lamp == LAMP_NONE) {
- OSLShader::eval_background(kg, sd, state, path_flag);
+ OSLShader::eval_background(INTEGRATOR_STATE_PASS, sd, path_flag);
}
else {
- OSLShader::eval_surface(kg, sd, state, path_flag);
+ OSLShader::eval_surface(INTEGRATOR_STATE_PASS, sd, path_flag);
}
}
else
#endif
{
#ifdef __SVM__
- svm_eval_nodes(kg, sd, state, buffer, SHADER_TYPE_SURFACE, path_flag);
+ svm_eval_nodes<node_feature_mask, SHADER_TYPE_SURFACE>(
+ INTEGRATOR_STATE_PASS, sd, buffer, path_flag);
#else
if (sd->object == OBJECT_NONE) {
sd->closure_emission_background = make_float3(0.8f, 0.8f, 0.8f);
@@ -1105,8 +617,11 @@ ccl_device void shader_eval_surface(KernelGlobals *kg,
#endif
}
- if (sd->flag & SD_BSDF_NEEDS_LCG) {
- sd->lcg_state = lcg_state_init_addrspace(state, 0xb4bc3953);
+ if (KERNEL_NODES_FEATURE(BSDF) && (sd->flag & SD_BSDF_NEEDS_LCG)) {
+ sd->lcg_state = lcg_state_init(INTEGRATOR_STATE(path, rng_hash),
+ INTEGRATOR_STATE(path, rng_offset),
+ INTEGRATOR_STATE(path, sample),
+ 0xb4bc3953);
}
}
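shader_eval_surface is now templated on a node feature mask, so per-kernel instantiations can compile out SVM node groups (and, via KERNEL_NODES_FEATURE(BSDF), the LCG setup) they never use. The constants below are local to the sketch, not the kernel's feature flags; it only illustrates the compile-time gating pattern:

    #include <cstdint>
    #include <cstdio>

    enum : uint32_t { FEATURE_BSDF = 1u << 0, FEATURE_EMISSION = 1u << 1 };

    /* Feature branches are resolved at compile time per instantiation. */
    template<uint32_t node_feature_mask> void eval_surface_sketch() {
      if constexpr (node_feature_mask & FEATURE_BSDF) {
        std::printf("BSDF closures compiled in\n");
      }
      if constexpr (node_feature_mask & FEATURE_EMISSION) {
        std::printf("emission compiled in\n");
      }
    }

    int main() {
      eval_surface_sketch<FEATURE_EMISSION>();                /* e.g. a light-eval kernel */
      eval_surface_sketch<FEATURE_BSDF | FEATURE_EMISSION>(); /* full surface kernel */
    }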
@@ -1114,48 +629,47 @@ ccl_device void shader_eval_surface(KernelGlobals *kg,
#ifdef __VOLUME__
-ccl_device_inline void _shader_volume_phase_multi_eval(const ShaderData *sd,
- const float3 omega_in,
- float *pdf,
- int skip_phase,
- BsdfEval *result_eval,
- float sum_pdf,
- float sum_sample_weight)
+ccl_device_inline float _shader_volume_phase_multi_eval(const ShaderData *sd,
+ const ShaderVolumePhases *phases,
+ const float3 omega_in,
+ int skip_phase,
+ BsdfEval *result_eval,
+ float sum_pdf,
+ float sum_sample_weight)
{
- for (int i = 0; i < sd->num_closure; i++) {
+ for (int i = 0; i < phases->num_closure; i++) {
if (i == skip_phase)
continue;
- const ShaderClosure *sc = &sd->closure[i];
-
- if (CLOSURE_IS_PHASE(sc->type)) {
- float phase_pdf = 0.0f;
- float3 eval = volume_phase_eval(sd, sc, omega_in, &phase_pdf);
+ const ShaderVolumeClosure *svc = &phases->closure[i];
+ float phase_pdf = 0.0f;
+ float3 eval = volume_phase_eval(sd, svc, omega_in, &phase_pdf);
- if (phase_pdf != 0.0f) {
- bsdf_eval_accum(result_eval, sc->type, eval, 1.0f);
- sum_pdf += phase_pdf * sc->sample_weight;
- }
-
- sum_sample_weight += sc->sample_weight;
+ if (phase_pdf != 0.0f) {
+ bsdf_eval_accum(result_eval, false, eval, 1.0f);
+ sum_pdf += phase_pdf * svc->sample_weight;
}
+
+ sum_sample_weight += svc->sample_weight;
}
- *pdf = (sum_sample_weight > 0.0f) ? sum_pdf / sum_sample_weight : 0.0f;
+ return (sum_sample_weight > 0.0f) ? sum_pdf / sum_sample_weight : 0.0f;
}
-ccl_device void shader_volume_phase_eval(
- KernelGlobals *kg, const ShaderData *sd, const float3 omega_in, BsdfEval *eval, float *pdf)
+ccl_device float shader_volume_phase_eval(const KernelGlobals *kg,
+ const ShaderData *sd,
+ const ShaderVolumePhases *phases,
+ const float3 omega_in,
+ BsdfEval *phase_eval)
{
- PROFILING_INIT(kg, PROFILING_CLOSURE_VOLUME_EVAL);
+ bsdf_eval_init(phase_eval, false, zero_float3());
- bsdf_eval_init(eval, NBUILTIN_CLOSURES, zero_float3(), kernel_data.film.use_light_pass);
-
- _shader_volume_phase_multi_eval(sd, omega_in, pdf, -1, eval, 0.0f, 0.0f);
+ return _shader_volume_phase_multi_eval(sd, phases, omega_in, -1, phase_eval, 0.0f, 0.0f);
}
-ccl_device int shader_volume_phase_sample(KernelGlobals *kg,
+ccl_device int shader_volume_phase_sample(const KernelGlobals *kg,
const ShaderData *sd,
+ const ShaderVolumePhases *phases,
float randu,
float randv,
BsdfEval *phase_eval,
@@ -1163,41 +677,34 @@ ccl_device int shader_volume_phase_sample(KernelGlobals *kg,
differential3 *domega_in,
float *pdf)
{
- PROFILING_INIT(kg, PROFILING_CLOSURE_VOLUME_SAMPLE);
-
int sampled = 0;
- if (sd->num_closure > 1) {
+ if (phases->num_closure > 1) {
/* pick a phase closure based on sample weights */
float sum = 0.0f;
- for (sampled = 0; sampled < sd->num_closure; sampled++) {
- const ShaderClosure *sc = &sd->closure[sampled];
-
- if (CLOSURE_IS_PHASE(sc->type))
- sum += sc->sample_weight;
+ for (sampled = 0; sampled < phases->num_closure; sampled++) {
+ const ShaderVolumeClosure *svc = &phases->closure[sampled];
+ sum += svc->sample_weight;
}
float r = randu * sum;
float partial_sum = 0.0f;
- for (sampled = 0; sampled < sd->num_closure; sampled++) {
- const ShaderClosure *sc = &sd->closure[sampled];
+ for (sampled = 0; sampled < phases->num_closure; sampled++) {
+ const ShaderVolumeClosure *svc = &phases->closure[sampled];
+ float next_sum = partial_sum + svc->sample_weight;
- if (CLOSURE_IS_PHASE(sc->type)) {
- float next_sum = partial_sum + sc->sample_weight;
-
- if (r <= next_sum) {
- /* Rescale to reuse for BSDF direction sample. */
- randu = (r - partial_sum) / sc->sample_weight;
- break;
- }
-
- partial_sum = next_sum;
+ if (r <= next_sum) {
+ /* Rescale to reuse for BSDF direction sample. */
+ randu = (r - partial_sum) / svc->sample_weight;
+ break;
}
+
+ partial_sum = next_sum;
}
- if (sampled == sd->num_closure) {
+ if (sampled == phases->num_closure) {
*pdf = 0.0f;
return LABEL_NONE;
}
@@ -1205,23 +712,23 @@ ccl_device int shader_volume_phase_sample(KernelGlobals *kg,
/* todo: this isn't quite correct, we don't weight anisotropy properly
* depending on color channels, even if this is perhaps not a common case */
- const ShaderClosure *sc = &sd->closure[sampled];
+ const ShaderVolumeClosure *svc = &phases->closure[sampled];
int label;
float3 eval = zero_float3();
*pdf = 0.0f;
- label = volume_phase_sample(sd, sc, randu, randv, &eval, omega_in, domega_in, pdf);
+ label = volume_phase_sample(sd, svc, randu, randv, &eval, omega_in, domega_in, pdf);
if (*pdf != 0.0f) {
- bsdf_eval_init(phase_eval, sc->type, eval, kernel_data.film.use_light_pass);
+ bsdf_eval_init(phase_eval, false, eval);
}
return label;
}
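The ShaderVolumeClosure entries carry only a weight and the single anisotropy parameter g because Cycles volumes use the Henyey-Greenstein phase function. For reference, its standard form as a standalone function (the sign convention of cos_theta may differ from the kernel's):

    #include <cmath>

    /* Henyey-Greenstein phase function: g in (-1, 1), cos_theta is the cosine
     * of the angle between the two directions; integrates to 1 over the sphere. */
    float phase_henyey_greenstein(float cos_theta, float g) {
      const float pi = 3.14159265358979323846f;
      const float k = 1.0f + g * g - 2.0f * g * cos_theta;
      return (1.0f - g * g) / (4.0f * pi * k * std::sqrt(k));
    }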
-ccl_device int shader_phase_sample_closure(KernelGlobals *kg,
+ccl_device int shader_phase_sample_closure(const KernelGlobals *kg,
const ShaderData *sd,
- const ShaderClosure *sc,
+ const ShaderVolumeClosure *sc,
float randu,
float randv,
BsdfEval *phase_eval,
@@ -1229,8 +736,6 @@ ccl_device int shader_phase_sample_closure(KernelGlobals *kg,
differential3 *domega_in,
float *pdf)
{
- PROFILING_INIT(kg, PROFILING_CLOSURE_VOLUME_SAMPLE);
-
int label;
float3 eval = zero_float3();
@@ -1238,18 +743,18 @@ ccl_device int shader_phase_sample_closure(KernelGlobals *kg,
label = volume_phase_sample(sd, sc, randu, randv, &eval, omega_in, domega_in, pdf);
if (*pdf != 0.0f)
- bsdf_eval_init(phase_eval, sc->type, eval, kernel_data.film.use_light_pass);
+ bsdf_eval_init(phase_eval, false, eval);
return label;
}
/* Volume Evaluation */
-ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- ccl_addr_space VolumeStack *stack,
- int path_flag)
+template<typename StackReadOp>
+ccl_device_inline void shader_eval_volume(INTEGRATOR_STATE_CONST_ARGS,
+ ShaderData *ccl_restrict sd,
+ const int path_flag,
+ StackReadOp stack_read)
{
/* If path is being terminated, we are tracing a shadow ray or evaluating
* emission, then we don't need to store closures. The emission and shadow
@@ -1259,7 +764,7 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
max_closures = 0;
}
else {
- max_closures = kernel_data.integrator.max_closures;
+ max_closures = kernel_data.max_closures;
}
/* reset closures once at the start, we will be accumulating the closures
@@ -1268,14 +773,18 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
sd->num_closure_left = max_closures;
sd->flag = 0;
sd->object_flag = 0;
- sd->type = PRIMITIVE_VOLUME;
- for (int i = 0; stack[i].shader != SHADER_NONE; i++) {
+ for (int i = 0;; i++) {
+ const VolumeStack entry = stack_read(i);
+ if (entry.shader == SHADER_NONE) {
+ break;
+ }
+
/* setup shaderdata from stack. it's mostly setup already in
* shader_setup_from_volume, this switching should be quick */
- sd->object = stack[i].object;
+ sd->object = entry.object;
sd->lamp = LAMP_NONE;
- sd->shader = stack[i].shader;
+ sd->shader = entry.shader;
sd->flag &= ~SD_SHADER_FLAGS;
sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
@@ -1295,18 +804,19 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
# ifdef __SVM__
# ifdef __OSL__
if (kg->osl) {
- OSLShader::eval_volume(kg, sd, state, path_flag);
+ OSLShader::eval_volume(INTEGRATOR_STATE_PASS, sd, path_flag);
}
else
# endif
{
- svm_eval_nodes(kg, sd, state, NULL, SHADER_TYPE_VOLUME, path_flag);
+ svm_eval_nodes<KERNEL_FEATURE_NODE_MASK_VOLUME, SHADER_TYPE_VOLUME>(
+ INTEGRATOR_STATE_PASS, sd, NULL, path_flag);
}
# endif
- /* merge closures to avoid exceeding number of closures limit */
+ /* Merge closures to avoid exceeding number of closures limit. */
if (i > 0)
- shader_merge_closures(sd);
+ shader_merge_volume_closures(sd);
}
}
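Instead of taking a raw VolumeStack pointer, shader_eval_volume now walks entries returned by a caller-supplied StackReadOp until it sees SHADER_NONE, which lets the integrator keep the stack wherever its state lives. A standalone sketch of the callable pattern (the types and sentinel below are local to the sketch):

    #include <cstdio>
    #include <vector>

    struct VolumeStackEntry {
      int object;
      int shader;
    };
    constexpr int SHADER_NONE_SENTINEL = -1;

    /* Walk entries handed out by the callable until the sentinel is reached. */
    template<typename StackReadOp> void eval_volume_stack(StackReadOp stack_read) {
      for (int i = 0;; i++) {
        const VolumeStackEntry entry = stack_read(i);
        if (entry.shader == SHADER_NONE_SENTINEL) {
          break;
        }
        std::printf("evaluate volume shader %d on object %d\n", entry.shader, entry.object);
      }
    }

    int main() {
      const std::vector<VolumeStackEntry> stack = {{0, 3}, {2, 7}, {0, SHADER_NONE_SENTINEL}};
      eval_volume_stack([&](const int i) { return stack[i]; });
    }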
@@ -1314,9 +824,7 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
/* Displacement Evaluation */
-ccl_device void shader_eval_displacement(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state)
+ccl_device void shader_eval_displacement(INTEGRATOR_STATE_CONST_ARGS, ShaderData *sd)
{
sd->num_closure = 0;
sd->num_closure_left = 0;
@@ -1325,11 +833,12 @@ ccl_device void shader_eval_displacement(KernelGlobals *kg,
#ifdef __SVM__
# ifdef __OSL__
if (kg->osl)
- OSLShader::eval_displacement(kg, sd, state);
+ OSLShader::eval_displacement(INTEGRATOR_STATE_PASS, sd);
else
# endif
{
- svm_eval_nodes(kg, sd, state, NULL, SHADER_TYPE_DISPLACEMENT, 0);
+ svm_eval_nodes<KERNEL_FEATURE_NODE_MASK_DISPLACEMENT, SHADER_TYPE_DISPLACEMENT>(
+ INTEGRATOR_STATE_PASS, sd, NULL, 0);
}
#endif
}
@@ -1337,29 +846,13 @@ ccl_device void shader_eval_displacement(KernelGlobals *kg,
/* Transparent Shadows */
#ifdef __TRANSPARENT_SHADOWS__
-ccl_device bool shader_transparent_shadow(KernelGlobals *kg, Intersection *isect)
+ccl_device bool shader_transparent_shadow(const KernelGlobals *kg, Intersection *isect)
{
- int prim = kernel_tex_fetch(__prim_index, isect->prim);
- int shader = 0;
-
-# ifdef __HAIR__
- if (isect->type & PRIMITIVE_ALL_TRIANGLE) {
-# endif
- shader = kernel_tex_fetch(__tri_shader, prim);
-# ifdef __HAIR__
- }
- else {
- float4 str = kernel_tex_fetch(__curves, prim);
- shader = __float_as_int(str.z);
- }
-# endif
- int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
-
- return (flag & SD_HAS_TRANSPARENT_SHADOW) != 0;
+ return (intersection_get_shader_flags(kg, isect) & SD_HAS_TRANSPARENT_SHADOW) != 0;
}
#endif /* __TRANSPARENT_SHADOWS__ */
-ccl_device float shader_cryptomatte_id(KernelGlobals *kg, int shader)
+ccl_device float shader_cryptomatte_id(const KernelGlobals *kg, int shader)
{
return kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).cryptomatte_id;
}
diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h
deleted file mode 100644
index 3b124122fba..00000000000
--- a/intern/cycles/kernel/kernel_shadow.h
+++ /dev/null
@@ -1,466 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#ifdef __VOLUME__
-/* Get PathState ready for use for volume stack evaluation. */
-# ifdef __SPLIT_KERNEL__
-ccl_addr_space
-# endif
- ccl_device_inline PathState *
- shadow_blocked_volume_path_state(KernelGlobals *kg,
- VolumeState *volume_state,
- ccl_addr_space PathState *state,
- ShaderData *sd,
- Ray *ray)
-{
-# ifdef __SPLIT_KERNEL__
- ccl_addr_space PathState *ps =
- &kernel_split_state.state_shadow[ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0)];
-# else
- PathState *ps = &volume_state->ps;
-# endif
- *ps = *state;
- /* We are checking for shadow on the "other" side of the surface, so need
- * to discard the volume we are currently in.
- */
- if (dot(sd->Ng, ray->D) < 0.0f) {
- kernel_volume_stack_enter_exit(kg, sd, ps->volume_stack);
- }
- return ps;
-}
-#endif /* __VOLUME__ */
-
-/* Attenuate throughput accordingly to the given intersection event.
- * Returns true if the throughput is zero and traversal can be aborted.
- */
-ccl_device_forceinline bool shadow_handle_transparent_isect(KernelGlobals *kg,
- ShaderData *shadow_sd,
- ccl_addr_space PathState *state,
-#ifdef __VOLUME__
- ccl_addr_space PathState *volume_state,
-#endif
- Intersection *isect,
- Ray *ray,
- float3 *throughput)
-{
-#ifdef __VOLUME__
- /* Attenuation between last surface and next surface. */
- if (volume_state->volume_stack[0].shader != SHADER_NONE) {
- Ray segment_ray = *ray;
- segment_ray.t = isect->t;
- kernel_volume_shadow(kg, shadow_sd, volume_state, &segment_ray, throughput);
- }
-#endif
- /* Setup shader data at surface. */
- shader_setup_from_ray(kg, shadow_sd, isect, ray);
- /* Attenuation from transparent surface. */
- if (!(shadow_sd->flag & SD_HAS_ONLY_VOLUME)) {
- path_state_modify_bounce(state, true);
- shader_eval_surface(kg, shadow_sd, state, NULL, PATH_RAY_SHADOW);
- path_state_modify_bounce(state, false);
- *throughput *= shader_bsdf_transparency(kg, shadow_sd);
- }
- /* Stop if all light is blocked. */
- if (is_zero(*throughput)) {
- return true;
- }
-#ifdef __VOLUME__
- /* Exit/enter volume. */
- kernel_volume_stack_enter_exit(kg, shadow_sd, volume_state->volume_stack);
-#endif
- return false;
-}
-
-/* Special version which only handles opaque shadows. */
-ccl_device bool shadow_blocked_opaque(KernelGlobals *kg,
- ShaderData *shadow_sd,
- ccl_addr_space PathState *state,
- const uint visibility,
- Ray *ray,
- Intersection *isect,
- float3 *shadow)
-{
- const bool blocked = scene_intersect(kg, ray, visibility & PATH_RAY_SHADOW_OPAQUE, isect);
-#ifdef __VOLUME__
- if (!blocked && state->volume_stack[0].shader != SHADER_NONE) {
- /* Apply attenuation from current volume shader. */
- kernel_volume_shadow(kg, shadow_sd, state, ray, shadow);
- }
-#endif
- return blocked;
-}
-
-#ifdef __TRANSPARENT_SHADOWS__
-# ifdef __SHADOW_RECORD_ALL__
-/* Shadow function to compute how much light is blocked,
- *
- * We trace a single ray. If it hits any opaque surface, or more than a given
- * number of transparent surfaces is hit, then we consider the geometry to be
- * entirely blocked. If not, all transparent surfaces will be recorded and we
- * will shade them one by one to determine how much light is blocked. This all
- * happens in one scene intersection function.
- *
- * Recording all hits works well in some cases but may be slower in others. If
- * we have many semi-transparent hairs, one intersection may be faster because
- * you'd be reintersecting the same hairs a lot with each step otherwise. If
- * however there is mostly binary transparency then we may be recording many
- * unnecessary intersections when one of the first surfaces blocks all light.
- *
- * From tests in real scenes it seems the performance loss is either minimal,
- * or there is a performance increase anyway due to avoiding the need to send
- * two rays with transparent shadows.
- *
- * On CPU it'll handle all transparent bounces (by allocating storage for
- * intersections when they don't fit into the stack storage).
- *
- * On GPU it'll only handle SHADOW_STACK_MAX_HITS-1 intersections, so this
- * is something to be kept an eye on.
- */
-
-# define SHADOW_STACK_MAX_HITS 64
-
-/* Actual logic with traversal loop implementation which is free from device
- * specific tweaks.
- *
- * Note that hits array should be as big as max_hits+1.
- */
-ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *shadow_sd,
- ccl_addr_space PathState *state,
- const uint visibility,
- Ray *ray,
- Intersection *hits,
- uint max_hits,
- float3 *shadow)
-{
- /* Intersect to find an opaque surface, or record all transparent
- * surface hits.
- */
- uint num_hits;
- const bool blocked = scene_intersect_shadow_all(kg, ray, hits, visibility, max_hits, &num_hits);
-# ifdef __VOLUME__
-# ifdef __KERNEL_OPTIX__
- VolumeState &volume_state = kg->volume_state;
-# else
- VolumeState volume_state;
-# endif
-# endif
- /* If no opaque surface found but we did find transparent hits,
- * shade them.
- */
- if (!blocked && num_hits > 0) {
- float3 throughput = one_float3();
- float3 Pend = ray->P + ray->D * ray->t;
- float last_t = 0.0f;
- int bounce = state->transparent_bounce;
- Intersection *isect = hits;
-# ifdef __VOLUME__
-# ifdef __SPLIT_KERNEL__
- ccl_addr_space
-# endif
- PathState *ps = shadow_blocked_volume_path_state(kg, &volume_state, state, sd, ray);
-# endif
- sort_intersections(hits, num_hits);
- for (int hit = 0; hit < num_hits; hit++, isect++) {
- /* Adjust intersection distance for moving ray forward. */
- float new_t = isect->t;
- isect->t -= last_t;
-      /* Skip hit if we did not move forward; step-by-step ray tracing
-       * would have skipped it then as well.
- */
- if (last_t == new_t) {
- continue;
- }
- last_t = new_t;
- /* Attenuate the throughput. */
- if (shadow_handle_transparent_isect(kg,
- shadow_sd,
- state,
-# ifdef __VOLUME__
- ps,
-# endif
- isect,
- ray,
- &throughput)) {
- return true;
- }
- /* Move ray forward. */
- ray->P = shadow_sd->P;
- if (ray->t != FLT_MAX) {
- ray->D = normalize_len(Pend - ray->P, &ray->t);
- }
- bounce++;
- }
-# ifdef __VOLUME__
- /* Attenuation for last line segment towards light. */
- if (ps->volume_stack[0].shader != SHADER_NONE) {
- kernel_volume_shadow(kg, shadow_sd, ps, ray, &throughput);
- }
-# endif
- *shadow = throughput;
- return is_zero(throughput);
- }
-# ifdef __VOLUME__
- if (!blocked && state->volume_stack[0].shader != SHADER_NONE) {
- /* Apply attenuation from current volume shader. */
-# ifdef __SPLIT_KERNEL__
- ccl_addr_space
-# endif
- PathState *ps = shadow_blocked_volume_path_state(kg, &volume_state, state, sd, ray);
- kernel_volume_shadow(kg, shadow_sd, ps, ray, shadow);
- }
-# endif
- return blocked;
-}
-
-/* Here we do all device specific trickery before invoking actual traversal
- * loop to help readability of the actual logic.
- */
-ccl_device bool shadow_blocked_transparent_all(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *shadow_sd,
- ccl_addr_space PathState *state,
- const uint visibility,
- Ray *ray,
- uint max_hits,
- float3 *shadow)
-{
-# ifdef __SPLIT_KERNEL__
- Intersection hits_[SHADOW_STACK_MAX_HITS];
- Intersection *hits = &hits_[0];
-# elif defined(__KERNEL_CUDA__)
- Intersection *hits = kg->hits_stack;
-# else
- Intersection hits_stack[SHADOW_STACK_MAX_HITS];
- Intersection *hits = hits_stack;
-# endif
-# ifndef __KERNEL_GPU__
-  /* Prefer to use the stack, but fall back to dynamic allocation if max hits is too deep;
- * we need max_hits + 1 storage space due to the logic in
- * scene_intersect_shadow_all which will first store and then check if
- * the limit is exceeded.
- *
- * Ignore this on GPU because of slow/unavailable malloc().
- */
- if (max_hits + 1 > SHADOW_STACK_MAX_HITS) {
- if (kg->transparent_shadow_intersections == NULL) {
- const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
- kg->transparent_shadow_intersections = (Intersection *)malloc(sizeof(Intersection) *
- (transparent_max_bounce + 1));
- }
- hits = kg->transparent_shadow_intersections;
- }
-# endif /* __KERNEL_GPU__ */
- /* Invoke actual traversal. */
- return shadow_blocked_transparent_all_loop(
- kg, sd, shadow_sd, state, visibility, ray, hits, max_hits, shadow);
-}
-# endif /* __SHADOW_RECORD_ALL__ */
-
-# if defined(__KERNEL_GPU__) || !defined(__SHADOW_RECORD_ALL__)
-/* Shadow function to compute how much light is blocked,
- *
- * Here we raytrace from one transparent surface to the next step by step.
- * To minimize overhead in cases where we don't need transparent shadows, we
- * first trace a regular shadow ray. We check if the hit primitive was
- * potentially transparent, and only in that case start marching. This gives
- * one extra ray cast for the cases where we do want transparency.
- */
-
-/* This function is only implementing device-independent traversal logic
- * which requires some precalculation done.
- */
-ccl_device bool shadow_blocked_transparent_stepped_loop(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *shadow_sd,
- ccl_addr_space PathState *state,
- const uint visibility,
- Ray *ray,
- Intersection *isect,
- const bool blocked,
- const bool is_transparent_isect,
- float3 *shadow)
-{
-# ifdef __VOLUME__
-# ifdef __KERNEL_OPTIX__
- VolumeState &volume_state = kg->volume_state;
-# else
- VolumeState volume_state;
-# endif
-# endif
- if (blocked && is_transparent_isect) {
- float3 throughput = one_float3();
- float3 Pend = ray->P + ray->D * ray->t;
- int bounce = state->transparent_bounce;
-# ifdef __VOLUME__
-# ifdef __SPLIT_KERNEL__
- ccl_addr_space
-# endif
- PathState *ps = shadow_blocked_volume_path_state(kg, &volume_state, state, sd, ray);
-# endif
- for (;;) {
- if (bounce >= kernel_data.integrator.transparent_max_bounce) {
- return true;
- }
- if (!scene_intersect(kg, ray, visibility & PATH_RAY_SHADOW_TRANSPARENT, isect)) {
- break;
- }
- if (!shader_transparent_shadow(kg, isect)) {
- return true;
- }
- /* Attenuate the throughput. */
- if (shadow_handle_transparent_isect(kg,
- shadow_sd,
- state,
-# ifdef __VOLUME__
- ps,
-# endif
- isect,
- ray,
- &throughput)) {
- return true;
- }
- /* Move ray forward. */
- ray->P = ray_offset(shadow_sd->P, -shadow_sd->Ng);
- if (ray->t != FLT_MAX) {
- ray->D = normalize_len(Pend - ray->P, &ray->t);
- }
- bounce++;
- }
-# ifdef __VOLUME__
- /* Attenuation for last line segment towards light. */
- if (ps->volume_stack[0].shader != SHADER_NONE) {
- kernel_volume_shadow(kg, shadow_sd, ps, ray, &throughput);
- }
-# endif
- *shadow *= throughput;
- return is_zero(throughput);
- }
-# ifdef __VOLUME__
- if (!blocked && state->volume_stack[0].shader != SHADER_NONE) {
- /* Apply attenuation from current volume shader. */
-# ifdef __SPLIT_KERNEL__
- ccl_addr_space
-# endif
- PathState *ps = shadow_blocked_volume_path_state(kg, &volume_state, state, sd, ray);
- kernel_volume_shadow(kg, shadow_sd, ps, ray, shadow);
- }
-# endif
- return blocked;
-}
-
-ccl_device bool shadow_blocked_transparent_stepped(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *shadow_sd,
- ccl_addr_space PathState *state,
- const uint visibility,
- Ray *ray,
- Intersection *isect,
- float3 *shadow)
-{
- bool blocked = scene_intersect(kg, ray, visibility & PATH_RAY_SHADOW_OPAQUE, isect);
- bool is_transparent_isect = blocked ? shader_transparent_shadow(kg, isect) : false;
- return shadow_blocked_transparent_stepped_loop(
- kg, sd, shadow_sd, state, visibility, ray, isect, blocked, is_transparent_isect, shadow);
-}
-
-# endif /* __KERNEL_GPU__ || !__SHADOW_RECORD_ALL__ */
-#endif /* __TRANSPARENT_SHADOWS__ */
-
-ccl_device_inline bool shadow_blocked(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *shadow_sd,
- ccl_addr_space PathState *state,
- Ray *ray,
- float3 *shadow)
-{
- *shadow = one_float3();
-#if !defined(__KERNEL_OPTIX__)
- /* Some common early checks.
- * Avoid conditional trace call in OptiX though, since those hurt performance there.
- */
- if (ray->t == 0.0f) {
- return false;
- }
-#endif
-#ifdef __SHADOW_TRICKS__
- const uint visibility = (state->flag & PATH_RAY_SHADOW_CATCHER) ? PATH_RAY_SHADOW_NON_CATCHER :
- PATH_RAY_SHADOW;
-#else
- const uint visibility = PATH_RAY_SHADOW;
-#endif
- /* Do actual shadow shading.
-   * First of all, we check if the integrator requires transparent shadows.
-   * If not, we use the simplest and fastest way to calculate occlusion.
- * Do not do this in OptiX to avoid the additional trace call.
- */
-#if !defined(__KERNEL_OPTIX__) || !defined(__TRANSPARENT_SHADOWS__)
- Intersection isect;
-# ifdef __TRANSPARENT_SHADOWS__
- if (!kernel_data.integrator.transparent_shadows)
-# endif
- {
- return shadow_blocked_opaque(kg, shadow_sd, state, visibility, ray, &isect, shadow);
- }
-#endif
-#ifdef __TRANSPARENT_SHADOWS__
-# ifdef __SHADOW_RECORD_ALL__
- /* For the transparent shadows we try to use record-all logic on the
- * devices which supports this.
- */
- const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
- /* Check transparent bounces here, for volume scatter which can do
- * lighting before surface path termination is checked.
- */
- if (state->transparent_bounce >= transparent_max_bounce) {
- return true;
- }
- uint max_hits = transparent_max_bounce - state->transparent_bounce - 1;
-# if defined(__KERNEL_OPTIX__)
- /* Always use record-all behavior in OptiX, but ensure there are no out of bounds
- * accesses to the hit stack.
- */
- max_hits = min(max_hits, SHADOW_STACK_MAX_HITS - 1);
-# elif defined(__KERNEL_GPU__)
-  /* On GPU we do a trick with tracing an opaque ray first; this avoids speed
- * regressions in some files.
- *
- * TODO(sergey): Check why using record-all behavior causes slowdown in such
- * cases. Could that be caused by a higher spill pressure?
- */
- const bool blocked = scene_intersect(kg, ray, visibility & PATH_RAY_SHADOW_OPAQUE, &isect);
- const bool is_transparent_isect = blocked ? shader_transparent_shadow(kg, &isect) : false;
- if (!blocked || !is_transparent_isect || max_hits + 1 >= SHADOW_STACK_MAX_HITS) {
- return shadow_blocked_transparent_stepped_loop(
- kg, sd, shadow_sd, state, visibility, ray, &isect, blocked, is_transparent_isect, shadow);
- }
-# endif /* __KERNEL_GPU__ */
- return shadow_blocked_transparent_all(
- kg, sd, shadow_sd, state, visibility, ray, max_hits, shadow);
-# else /* __SHADOW_RECORD_ALL__ */
- /* Fallback to a slowest version which works on all devices. */
- return shadow_blocked_transparent_stepped(
- kg, sd, shadow_sd, state, visibility, ray, &isect, shadow);
-# endif /* __SHADOW_RECORD_ALL__ */
-#endif /* __TRANSPARENT_SHADOWS__ */
-}
-
-#undef SHADOW_STACK_MAX_HITS
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_shadow_catcher.h b/intern/cycles/kernel/kernel_shadow_catcher.h
new file mode 100644
index 00000000000..824749818a4
--- /dev/null
+++ b/intern/cycles/kernel/kernel_shadow_catcher.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/integrator/integrator_state_util.h"
+#include "kernel/kernel_path_state.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Check whether current surface bounce is where path is to be split for the shadow catcher. */
+ccl_device_inline bool kernel_shadow_catcher_is_path_split_bounce(INTEGRATOR_STATE_ARGS,
+ const int object_flag)
+{
+#ifdef __SHADOW_CATCHER__
+ if (!kernel_data.integrator.has_shadow_catcher) {
+ return false;
+ }
+
+  /* Check the flag first, avoiding fetches from global memory. */
+ if ((object_flag & SD_OBJECT_SHADOW_CATCHER) == 0) {
+ return false;
+ }
+ if (object_flag & SD_OBJECT_HOLDOUT_MASK) {
+ return false;
+ }
+
+ const int path_flag = INTEGRATOR_STATE(path, flag);
+
+ if ((path_flag & PATH_RAY_TRANSPARENT_BACKGROUND) == 0) {
+    /* Split only on primary rays; on secondary bounces the shadow catcher is treated as a
+     * regular object. */
+ return false;
+ }
+
+ if (path_flag & PATH_RAY_SHADOW_CATCHER_PASS) {
+ return false;
+ }
+
+ return true;
+#else
+ (void)object_flag;
+ return false;
+#endif
+}
+
+/* Check whether the current path can still split. */
+ccl_device_inline bool kernel_shadow_catcher_path_can_split(INTEGRATOR_STATE_CONST_ARGS)
+{
+ if (INTEGRATOR_PATH_IS_TERMINATED && INTEGRATOR_SHADOW_PATH_IS_TERMINATED) {
+ return false;
+ }
+
+ const int path_flag = INTEGRATOR_STATE(path, flag);
+
+ if (path_flag & PATH_RAY_SHADOW_CATCHER_HIT) {
+ /* Shadow catcher was already hit and the state was split. No further split is allowed. */
+ return false;
+ }
+
+ return (path_flag & PATH_RAY_TRANSPARENT_BACKGROUND) != 0;
+}
+
+/* NOTE: Leaves kernel scheduling information untouched. Use INIT semantic for one of the paths
+ * after this function. */
+ccl_device_inline bool kernel_shadow_catcher_split(INTEGRATOR_STATE_ARGS, const int object_flags)
+{
+#ifdef __SHADOW_CATCHER__
+
+ if (!kernel_shadow_catcher_is_path_split_bounce(INTEGRATOR_STATE_PASS, object_flags)) {
+ return false;
+ }
+
+ /* The split is to be done. Mark the current state as such, so that it stops contributing to the
+ * shadow catcher matte pass, but keeps contributing to the combined pass. */
+ INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_SHADOW_CATCHER_HIT;
+
+ /* Split new state from the current one. This new state will only track contribution of shadow
+ * catcher objects ignoring non-catcher objects. */
+ integrator_state_shadow_catcher_split(INTEGRATOR_STATE_PASS);
+
+ return true;
+#else
+ (void)object_flags;
+ return false;
+#endif
+}
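A hedged sketch of the intended call site for the split helper; the surrounding integrator code is not part of this hunk, so every name outside kernel_shadow_catcher_* below is an assumption:

    /* At a surface hit, before shading (hypothetical surrounding code): */
    const int object_flags = /* object flags fetched for the hit object */ 0;
    if (kernel_shadow_catcher_split(INTEGRATOR_STATE_PASS, object_flags)) {
      /* A copy of this state was created to render only the catcher object;
       * the current state keeps PATH_RAY_SHADOW_CATCHER_HIT set and keeps
       * contributing to the combined pass but not to the matte pass. */
    }
    if (kernel_shadow_catcher_is_matte_path(INTEGRATOR_STATE_PASS)) {
      /* accumulate this path's contribution into the shadow catcher matte pass */
    }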
+
+#ifdef __SHADOW_CATCHER__
+
+ccl_device_forceinline bool kernel_shadow_catcher_is_matte_path(INTEGRATOR_STATE_CONST_ARGS)
+{
+ return (INTEGRATOR_STATE(path, flag) & PATH_RAY_SHADOW_CATCHER_HIT) == 0;
+}
+
+ccl_device_forceinline bool kernel_shadow_catcher_is_object_pass(INTEGRATOR_STATE_CONST_ARGS)
+{
+ return INTEGRATOR_STATE(path, flag) & PATH_RAY_SHADOW_CATCHER_PASS;
+}
+
+#endif /* __SHADOW_CATCHER__ */
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h
deleted file mode 100644
index 677504a4045..00000000000
--- a/intern/cycles/kernel/kernel_subsurface.h
+++ /dev/null
@@ -1,724 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* BSSRDF using disk based importance sampling.
- *
- * BSSRDF Importance Sampling, SIGGRAPH 2013
- * http://library.imageworks.com/pdfs/imageworks-library-BSSRDF-sampling.pdf
- */
-
-ccl_device_inline float3
-subsurface_scatter_eval(ShaderData *sd, const ShaderClosure *sc, float disk_r, float r, bool all)
-{
- /* This is the Veach one-sample model with balance heuristic, some pdf
- * factors drop out when using balance heuristic weighting. For branched
-   * path tracing (all) we sample all closures and don't use MIS. */
- float3 eval_sum = zero_float3();
- float pdf_sum = 0.0f;
- float sample_weight_inv = 0.0f;
-
- if (!all) {
- float sample_weight_sum = 0.0f;
-
- for (int i = 0; i < sd->num_closure; i++) {
- sc = &sd->closure[i];
-
- if (CLOSURE_IS_DISK_BSSRDF(sc->type)) {
- sample_weight_sum += sc->sample_weight;
- }
- }
-
- sample_weight_inv = 1.0f / sample_weight_sum;
- }
-
- for (int i = 0; i < sd->num_closure; i++) {
- sc = &sd->closure[i];
-
- if (CLOSURE_IS_DISK_BSSRDF(sc->type)) {
-      /* In case of branched path integration we sample all BSSRDFs once;
-       * for path tracing we pick one, so adjust the pdf for that. */
- float sample_weight = (all) ? 1.0f : sc->sample_weight * sample_weight_inv;
-
- /* compute pdf */
- float3 eval = bssrdf_eval(sc, r);
- float pdf = bssrdf_pdf(sc, disk_r);
-
- eval_sum += sc->weight * eval;
- pdf_sum += sample_weight * pdf;
- }
- }
-
- return (pdf_sum > 0.0f) ? eval_sum / pdf_sum : zero_float3();
-}
-
-ccl_device_inline float3 subsurface_scatter_walk_eval(ShaderData *sd,
- const ShaderClosure *sc,
- float3 throughput,
- bool all)
-{
- /* This is the Veach one-sample model with balance heuristic, some pdf
- * factors drop out when using balance heuristic weighting. For branched
-   * path tracing (all) we sample all closures and don't use MIS. */
- if (!all) {
- float bssrdf_weight = 0.0f;
- float weight = sc->sample_weight;
-
- for (int i = 0; i < sd->num_closure; i++) {
- sc = &sd->closure[i];
-
- if (CLOSURE_IS_BSSRDF(sc->type)) {
- bssrdf_weight += sc->sample_weight;
- }
- }
- throughput *= bssrdf_weight / weight;
- }
- return throughput;
-}
-
-/* replace closures with a single diffuse bsdf closure after scatter step */
-ccl_device void subsurface_scatter_setup_diffuse_bsdf(
- KernelGlobals *kg, ShaderData *sd, ClosureType type, float roughness, float3 weight, float3 N)
-{
- sd->flag &= ~SD_CLOSURE_FLAGS;
- sd->num_closure = 0;
- sd->num_closure_left = kernel_data.integrator.max_closures;
-
-#ifdef __PRINCIPLED__
- if (type == CLOSURE_BSSRDF_PRINCIPLED_ID || type == CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID) {
- PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf *)bsdf_alloc(
- sd, sizeof(PrincipledDiffuseBsdf), weight);
-
- if (bsdf) {
- bsdf->N = N;
- bsdf->roughness = roughness;
- sd->flag |= bsdf_principled_diffuse_setup(bsdf);
-
- /* replace CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID with this special ID so render passes
- * can recognize it as not being a regular Disney principled diffuse closure */
- bsdf->type = CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID;
- }
- }
- else if (CLOSURE_IS_BSDF_BSSRDF(type) || CLOSURE_IS_BSSRDF(type))
-#endif /* __PRINCIPLED__ */
- {
- DiffuseBsdf *bsdf = (DiffuseBsdf *)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight);
-
- if (bsdf) {
- bsdf->N = N;
- sd->flag |= bsdf_diffuse_setup(bsdf);
-
- /* replace CLOSURE_BSDF_DIFFUSE_ID with this special ID so render passes
- * can recognize it as not being a regular diffuse closure */
- bsdf->type = CLOSURE_BSDF_BSSRDF_ID;
- }
- }
-}
-
-/* optionally do blurring of color and/or bump mapping, at the cost of a shader evaluation */
-ccl_device float3 subsurface_color_pow(float3 color, float exponent)
-{
- color = max(color, zero_float3());
-
- if (exponent == 1.0f) {
- /* nothing to do */
- }
- else if (exponent == 0.5f) {
- color.x = sqrtf(color.x);
- color.y = sqrtf(color.y);
- color.z = sqrtf(color.z);
- }
- else {
- color.x = powf(color.x, exponent);
- color.y = powf(color.y, exponent);
- color.z = powf(color.z, exponent);
- }
-
- return color;
-}
-
-ccl_device void subsurface_color_bump_blur(
- KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, float3 *eval, float3 *N)
-{
- /* average color and texture blur at outgoing point */
- float texture_blur;
- float3 out_color = shader_bssrdf_sum(sd, NULL, &texture_blur);
-
- /* do we have bump mapping? */
- bool bump = (sd->flag & SD_HAS_BSSRDF_BUMP) != 0;
-
- if (bump || texture_blur > 0.0f) {
- /* average color and normal at incoming point */
- shader_eval_surface(kg, sd, state, NULL, state->flag);
- float3 in_color = shader_bssrdf_sum(sd, (bump) ? N : NULL, NULL);
-
- /* we simply divide out the average color and multiply with the average
- * of the other one. we could try to do this per closure but it's quite
- * tricky to match closures between shader evaluations, their number and
- * order may change, this is simpler */
- if (texture_blur > 0.0f) {
- out_color = subsurface_color_pow(out_color, texture_blur);
- in_color = subsurface_color_pow(in_color, texture_blur);
-
- *eval *= safe_divide_color(in_color, out_color);
- }
- }
-}
-
-/* Subsurface scattering step, from a point on the surface to other
- * nearby points on the same object.
- */
-ccl_device_inline int subsurface_scatter_disk(KernelGlobals *kg,
- LocalIntersection *ss_isect,
- ShaderData *sd,
- const ShaderClosure *sc,
- uint *lcg_state,
- float disk_u,
- float disk_v,
- bool all)
-{
- /* pick random axis in local frame and point on disk */
- float3 disk_N, disk_T, disk_B;
- float pick_pdf_N, pick_pdf_T, pick_pdf_B;
-
- disk_N = sd->Ng;
- make_orthonormals(disk_N, &disk_T, &disk_B);
-
- if (disk_v < 0.5f) {
- pick_pdf_N = 0.5f;
- pick_pdf_T = 0.25f;
- pick_pdf_B = 0.25f;
- disk_v *= 2.0f;
- }
- else if (disk_v < 0.75f) {
- float3 tmp = disk_N;
- disk_N = disk_T;
- disk_T = tmp;
- pick_pdf_N = 0.25f;
- pick_pdf_T = 0.5f;
- pick_pdf_B = 0.25f;
- disk_v = (disk_v - 0.5f) * 4.0f;
- }
- else {
- float3 tmp = disk_N;
- disk_N = disk_B;
- disk_B = tmp;
- pick_pdf_N = 0.25f;
- pick_pdf_T = 0.25f;
- pick_pdf_B = 0.5f;
- disk_v = (disk_v - 0.75f) * 4.0f;
- }
-
- /* sample point on disk */
- float phi = M_2PI_F * disk_v;
- float disk_height, disk_r;
-
- bssrdf_sample(sc, disk_u, &disk_r, &disk_height);
-
- float3 disk_P = (disk_r * cosf(phi)) * disk_T + (disk_r * sinf(phi)) * disk_B;
-
- /* create ray */
-#ifdef __SPLIT_KERNEL__
- Ray ray_object = ss_isect->ray;
- Ray *ray = &ray_object;
-#else
- Ray *ray = &ss_isect->ray;
-#endif
- ray->P = sd->P + disk_N * disk_height + disk_P;
- ray->D = -disk_N;
- ray->t = 2.0f * disk_height;
- ray->dP = sd->dP;
- ray->dD = differential3_zero();
- ray->time = sd->time;
-
- /* Intersect with the same object. If multiple intersections are found,
- * at most BSSRDF_MAX_HITS of them are used (a random subset of all hits). */
- scene_intersect_local(kg, ray, ss_isect, sd->object, lcg_state, BSSRDF_MAX_HITS);
- int num_eval_hits = min(ss_isect->num_hits, BSSRDF_MAX_HITS);
-
- for (int hit = 0; hit < num_eval_hits; hit++) {
- /* Quickly retrieve P and Ng without setting up ShaderData. */
- float3 hit_P;
- if (sd->type & PRIMITIVE_TRIANGLE) {
- hit_P = triangle_refine_local(kg, sd, &ss_isect->hits[hit], ray);
- }
-#ifdef __OBJECT_MOTION__
- else if (sd->type & PRIMITIVE_MOTION_TRIANGLE) {
- float3 verts[3];
- motion_triangle_vertices(kg,
- sd->object,
- kernel_tex_fetch(__prim_index, ss_isect->hits[hit].prim),
- sd->time,
- verts);
- hit_P = motion_triangle_refine_local(kg, sd, &ss_isect->hits[hit], ray, verts);
- }
-#endif /* __OBJECT_MOTION__ */
- else {
- ss_isect->weight[hit] = zero_float3();
- continue;
- }
-
- float3 hit_Ng = ss_isect->Ng[hit];
- if (ss_isect->hits[hit].object != OBJECT_NONE) {
- object_normal_transform(kg, sd, &hit_Ng);
- }
-
- /* Probability densities for local frame axes. */
- float pdf_N = pick_pdf_N * fabsf(dot(disk_N, hit_Ng));
- float pdf_T = pick_pdf_T * fabsf(dot(disk_T, hit_Ng));
- float pdf_B = pick_pdf_B * fabsf(dot(disk_B, hit_Ng));
-
- /* Multiple importance sample between the 3 axes; the power heuristic was
- * found to be slightly better than the balance heuristic. pdf_N in the
- * MIS weight and in the denominator cancel out. */
- float w = pdf_N / (sqr(pdf_N) + sqr(pdf_T) + sqr(pdf_B));
- if (ss_isect->num_hits > BSSRDF_MAX_HITS) {
- w *= ss_isect->num_hits / (float)BSSRDF_MAX_HITS;
- }
-
- /* Real distance to sampled point. */
- float r = len(hit_P - sd->P);
-
- /* Evaluate profiles. */
- float3 eval = subsurface_scatter_eval(sd, sc, disk_r, r, all) * w;
-
- ss_isect->weight[hit] = eval;
- }
-
-#ifdef __SPLIT_KERNEL__
- ss_isect->ray = *ray;
-#endif
-
- return num_eval_hits;
-}
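The weight `w` above folds the power-heuristic MIS weight and the division by the picked-axis PDF into a single expression. A minimal sketch of that algebra (hypothetical standalone helper, not part of the patch), assuming the estimator is weight / pdf_N with the power heuristic over the three axis PDFs:

/* Illustrative only: (pdf_N^2 / (pdf_N^2 + pdf_T^2 + pdf_B^2)) / pdf_N reduces
 * to the expression used in the hit loop above. */
static float axis_mis_weight(float pdf_N, float pdf_T, float pdf_B)
{
  return pdf_N / (pdf_N * pdf_N + pdf_T * pdf_T + pdf_B * pdf_B);
}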
-
-#if defined(__KERNEL_OPTIX__) && defined(__SHADER_RAYTRACE__)
-ccl_device_inline void subsurface_scatter_multi_setup(KernelGlobals *kg,
- LocalIntersection *ss_isect,
- int hit,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- ClosureType type,
- float roughness)
-{
- optixDirectCall<void>(2, kg, ss_isect, hit, sd, state, type, roughness);
-}
-extern "C" __device__ void __direct_callable__subsurface_scatter_multi_setup(
-#else
-ccl_device_noinline void subsurface_scatter_multi_setup(
-#endif
- KernelGlobals *kg,
- LocalIntersection *ss_isect,
- int hit,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- ClosureType type,
- float roughness)
-{
-#ifdef __SPLIT_KERNEL__
- Ray ray_object = ss_isect->ray;
- Ray *ray = &ray_object;
-#else
- Ray *ray = &ss_isect->ray;
-#endif
-
- /* Workaround for AMD GPU OpenCL compiler. Most probably cache bypass issue. */
-#if defined(__SPLIT_KERNEL__) && defined(__KERNEL_OPENCL_AMD__) && defined(__KERNEL_GPU__)
- kernel_split_params.dummy_sd_flag = sd->flag;
-#endif
-
- /* Setup new shading point. */
- shader_setup_from_subsurface(kg, sd, &ss_isect->hits[hit], ray);
-
- /* Optionally blur colors and bump mapping. */
- float3 weight = ss_isect->weight[hit];
- float3 N = sd->N;
- subsurface_color_bump_blur(kg, sd, state, &weight, &N);
-
- /* Setup diffuse BSDF. */
- subsurface_scatter_setup_diffuse_bsdf(kg, sd, type, roughness, weight, N);
-}
-
-/* Random walk subsurface scattering.
- *
- * "Practical and Controllable Subsurface Scattering for Production Path
- * Tracing". Matt Jen-Yuan Chiang, Peter Kutz, Brent Burley. SIGGRAPH 2016. */
-
-ccl_device void subsurface_random_walk_remap(const float A,
- const float d,
- float *sigma_t,
- float *alpha)
-{
- /* Compute attenuation and scattering coefficients from albedo. */
- *alpha = 1.0f - expf(A * (-5.09406f + A * (2.61188f - A * 4.31805f)));
- const float s = 1.9f - A + 3.5f * sqr(A - 0.8f);
-
- *sigma_t = 1.0f / fmaxf(d * s, 1e-16f);
-}
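As a hedged, host-side sanity check of the remapping above (standalone sketch, not part of the patch; the helper name and example values are hypothetical), the remapped quantities relate per channel as sigma_s = alpha * sigma_t and sigma_a = (1 - alpha) * sigma_t:

#include <math.h>
#include <stdio.h>

/* Same per-channel remap as subsurface_random_walk_remap() above. */
static void remap_channel(float A, float d, float *sigma_t, float *alpha)
{
  *alpha = 1.0f - expf(A * (-5.09406f + A * (2.61188f - A * 4.31805f)));
  const float s = 1.9f - A + 3.5f * (A - 0.8f) * (A - 0.8f);
  *sigma_t = 1.0f / fmaxf(d * s, 1e-16f);
}

int main(void)
{
  float sigma_t, alpha;
  remap_channel(0.8f, 1.0f, &sigma_t, &alpha);
  /* sigma_s = alpha * sigma_t, sigma_a = (1 - alpha) * sigma_t. */
  printf("alpha = %f, sigma_t = %f\n", alpha, sigma_t); /* Roughly 0.990 and 0.909. */
  return 0;
}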
-
-ccl_device void subsurface_random_walk_coefficients(const ShaderClosure *sc,
- float3 *sigma_t,
- float3 *alpha,
- float3 *weight)
-{
- const Bssrdf *bssrdf = (const Bssrdf *)sc;
- const float3 A = bssrdf->albedo;
- const float3 d = bssrdf->radius;
- float sigma_t_x, sigma_t_y, sigma_t_z;
- float alpha_x, alpha_y, alpha_z;
-
- subsurface_random_walk_remap(A.x, d.x, &sigma_t_x, &alpha_x);
- subsurface_random_walk_remap(A.y, d.y, &sigma_t_y, &alpha_y);
- subsurface_random_walk_remap(A.z, d.z, &sigma_t_z, &alpha_z);
-
- *sigma_t = make_float3(sigma_t_x, sigma_t_y, sigma_t_z);
- *alpha = make_float3(alpha_x, alpha_y, alpha_z);
-
- /* Closure mixing and Fresnel weights separate from albedo. */
- *weight = safe_divide_color(bssrdf->weight, A);
-}
-
-/* References for Dwivedi sampling:
- *
- * [1] "A Zero-variance-based Sampling Scheme for Monte Carlo Subsurface Scattering"
- * by Jaroslav Křivánek and Eugene d'Eon (SIGGRAPH 2014)
- * https://cgg.mff.cuni.cz/~jaroslav/papers/2014-zerovar/
- *
- * [2] "Improving the Dwivedi Sampling Scheme"
- * by Johannes Meng, Johannes Hanika, and Carsten Dachsbacher (EGSR 2016)
- * https://cg.ivd.kit.edu/1951.php
- *
- * [3] "Zero-Variance Theory for Efficient Subsurface Scattering"
- * by Eugene d'Eon and Jaroslav Křivánek (SIGGRAPH 2020)
- * https://iliyan.com/publications/RenderingCourse2020
- */
-
-ccl_device_forceinline float eval_phase_dwivedi(float v, float phase_log, float cos_theta)
-{
- /* Eq. 9 from [2] using precomputed log((v + 1) / (v - 1)) */
- return 1.0f / ((v - cos_theta) * phase_log);
-}
-
-ccl_device_forceinline float sample_phase_dwivedi(float v, float phase_log, float rand)
-{
- /* Based on Eq. 10 from [2]: `v - (v + 1) * pow((v - 1) / (v + 1), rand)`
- * Since we're already pre-computing `phase_log = log((v + 1) / (v - 1))` for the evaluation,
- * we can implement the power function like this. */
- return v - (v + 1) * expf(-rand * phase_log);
-}
-
-ccl_device_forceinline float diffusion_length_dwivedi(float alpha)
-{
- /* Eq. 67 from [3] */
- return 1.0f / sqrtf(1.0f - powf(alpha, 2.44294f - 0.0215813f * alpha + 0.578637f / alpha));
-}
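A standalone numerical check (illustrative sketch, not part of the patch) that the density evaluated by eval_phase_dwivedi() integrates to one over cos_theta in [-1, 1], and that the closed-form expression in sample_phase_dwivedi() is its inverse CDF:

#include <math.h>
#include <stdio.h>

int main(void)
{
  const float v = 1.5f; /* Example diffusion length. */
  const float phase_log = logf((v + 1.0f) / (v - 1.0f));

  /* Midpoint-rule integral of 1 / ((v - cos_theta) * phase_log); expect ~1. */
  double integral = 0.0;
  const int n = 100000;
  for (int i = 0; i < n; i++) {
    const float c = -1.0f + 2.0f * (i + 0.5f) / n;
    integral += (2.0 / n) / ((v - c) * phase_log);
  }
  printf("integral = %f\n", integral);

  /* Inverting the CDF F(c) = (log(v + 1) - log(v - c)) / phase_log at u gives
   * exactly c = v - (v + 1) * exp(-u * phase_log), the sampling formula. */
  const float u = 0.5f;
  const float cos_theta = v - (v + 1.0f) * expf(-u * phase_log);
  printf("cos_theta = %f\n", cos_theta); /* ~0.38, inside (-1, 1). */
  return 0;
}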
-
-ccl_device_forceinline float3 direction_from_cosine(float3 D, float cos_theta, float randv)
-{
- float sin_theta = safe_sqrtf(1.0f - cos_theta * cos_theta);
- float phi = M_2PI_F * randv;
- float3 dir = make_float3(sin_theta * cosf(phi), sin_theta * sinf(phi), cos_theta);
-
- float3 T, B;
- make_orthonormals(D, &T, &B);
- return dir.x * T + dir.y * B + dir.z * D;
-}
-
-ccl_device_forceinline float3 subsurface_random_walk_pdf(float3 sigma_t,
- float t,
- bool hit,
- float3 *transmittance)
-{
- float3 T = volume_color_transmittance(sigma_t, t);
- if (transmittance) {
- *transmittance = T;
- }
- return hit ? T : sigma_t * T;
-}
-
-#ifdef __KERNEL_OPTIX__
-ccl_device_inline /* inline trace calls */
-#else
-ccl_device_noinline
-#endif
- bool
- subsurface_random_walk(KernelGlobals *kg,
- LocalIntersection *ss_isect,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- const ShaderClosure *sc,
- const float bssrdf_u,
- const float bssrdf_v,
- bool all)
-{
- /* Sample diffuse surface scatter into the object. */
- float3 D;
- float pdf;
- sample_cos_hemisphere(-sd->N, bssrdf_u, bssrdf_v, &D, &pdf);
- if (dot(-sd->Ng, D) <= 0.0f) {
- return 0;
- }
-
- /* Convert subsurface to volume coefficients.
- * The single-scattering albedo is named alpha to avoid confusion with the surface albedo. */
- float3 sigma_t, alpha;
- float3 throughput = one_float3();
- subsurface_random_walk_coefficients(sc, &sigma_t, &alpha, &throughput);
- float3 sigma_s = sigma_t * alpha;
-
- /* Theoretically it should be better to use the exact alpha for the channel we're sampling at
- * each bounce, but in practice there doesn't seem to be a noticeable difference in exchange
- * for making the code significantly more complex and slower (if direction sampling depends on
- * the sampled channel, we need to compute its PDF per-channel and consider it for MIS later on).
- *
- * Since the strength of the guided sampling increases as alpha gets lower, using a value that
- * is too low results in fireflies while one that's too high just gives a bit more noise.
- * Therefore, the code here uses the highest of the three albedos to be safe. */
- float diffusion_length = diffusion_length_dwivedi(max3(alpha));
- /* Precompute term for phase sampling. */
- float phase_log = logf((diffusion_length + 1) / (diffusion_length - 1));
-
- /* Setup ray. */
-#ifdef __SPLIT_KERNEL__
- Ray ray_object = ss_isect->ray;
- Ray *ray = &ray_object;
-#else
- Ray *ray = &ss_isect->ray;
-#endif
- ray->P = ray_offset(sd->P, -sd->Ng);
- ray->D = D;
- ray->t = FLT_MAX;
- ray->time = sd->time;
-
- /* Modify state for RNGs, decorrelated from other paths. */
- uint prev_rng_offset = state->rng_offset;
- uint prev_rng_hash = state->rng_hash;
- state->rng_hash = cmj_hash(state->rng_hash + state->rng_offset, 0xdeadbeef);
-
- /* Random walk until we hit the surface again. */
- bool hit = false;
- bool have_opposite_interface = false;
- float opposite_distance = 0.0f;
-
- /* TODO: Disable for alpha > 0.999 or so? */
- const float guided_fraction = 0.75f;
-
- for (int bounce = 0; bounce < BSSRDF_MAX_BOUNCES; bounce++) {
- /* Advance random number offset. */
- state->rng_offset += PRNG_BOUNCE_NUM;
-
- /* Sample color channel, use MIS with balance heuristic. */
- float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
- float3 channel_pdf;
- int channel = kernel_volume_sample_channel(alpha, throughput, rphase, &channel_pdf);
- float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel);
- float randt = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
-
- /* We need the result of the raycast to compute the full guided PDF, so just remember the
- * relevant terms to avoid recomputing them later. */
- float backward_fraction = 0.0f;
- float forward_pdf_factor = 0.0f;
- float forward_stretching = 1.0f;
- float backward_pdf_factor = 0.0f;
- float backward_stretching = 1.0f;
-
- /* For the initial ray, we already know the direction, so just do classic distance sampling. */
- if (bounce > 0) {
- /* Decide whether we should use guided or classic sampling. */
- bool guided = (path_state_rng_1D(kg, state, PRNG_LIGHT_TERMINATE) < guided_fraction);
-
- /* Determine if we want to sample away from the incoming interface.
- * This only happens if we found a nearby opposite interface, and the probability for it
- * depends on how close we are to it already.
- * This probability term comes from the recorded presentation of [3]. */
- bool guide_backward = false;
- if (have_opposite_interface) {
- /* Compute distance of the random walk between the tangent plane at the starting point
- * and the assumed opposite interface (the parallel plane that contains the point we
- * found in our ray query for the opposite side). */
- float x = clamp(dot(ray->P - sd->P, -sd->N), 0.0f, opposite_distance);
- backward_fraction = 1.0f / (1.0f + expf((opposite_distance - 2 * x) / diffusion_length));
- guide_backward = path_state_rng_1D(kg, state, PRNG_TERMINATE) < backward_fraction;
- }
-
- /* Sample scattering direction. */
- float scatter_u, scatter_v;
- path_state_rng_2D(kg, state, PRNG_BSDF_U, &scatter_u, &scatter_v);
- float cos_theta;
- if (guided) {
- cos_theta = sample_phase_dwivedi(diffusion_length, phase_log, scatter_u);
- /* The backwards guiding distribution is just mirrored along sd->N, so swapping the
- * sign here is enough to sample from that instead. */
- if (guide_backward) {
- cos_theta = -cos_theta;
- }
- }
- else {
- cos_theta = 2.0f * scatter_u - 1.0f;
- }
- ray->D = direction_from_cosine(sd->N, cos_theta, scatter_v);
-
- /* Compute PDF factor caused by phase sampling (as the ratio of guided / classic).
- * Since phase sampling is channel-independent, we can get away with applying a factor
- * to the guided PDF, which implicitly means pulling out the classic PDF term and letting
- * it cancel with an equivalent term in the numerator of the full estimator.
- * For the backward PDF, we again reuse the same probability distribution with a sign swap.
- */
- forward_pdf_factor = 2.0f * eval_phase_dwivedi(diffusion_length, phase_log, cos_theta);
- backward_pdf_factor = 2.0f * eval_phase_dwivedi(diffusion_length, phase_log, -cos_theta);
-
- /* Prepare distance sampling.
- * For the backwards case, this also needs the sign swapped since now directions against
- * sd->N (and therefore with negative cos_theta) are preferred. */
- forward_stretching = (1.0f - cos_theta / diffusion_length);
- backward_stretching = (1.0f + cos_theta / diffusion_length);
- if (guided) {
- sample_sigma_t *= guide_backward ? backward_stretching : forward_stretching;
- }
- }
-
- /* Sample distance along the ray. */
- float t = -logf(1.0f - randt) / sample_sigma_t;
-
- /* On the first bounce, we use the raycast to check if the opposite side is nearby.
- * If yes, we will later use backwards guided sampling in order to have a decent
- * chance of connecting to it.
- * TODO: Maybe use less than 10 times the mean free path? */
- ray->t = (bounce == 0) ? max(t, 10.0f / (min3(sigma_t))) : t;
- scene_intersect_local(kg, ray, ss_isect, sd->object, NULL, 1);
- hit = (ss_isect->num_hits > 0);
-
- if (hit) {
-#ifdef __KERNEL_OPTIX__
- /* t is always in world space with OptiX. */
- ray->t = ss_isect->hits[0].t;
-#else
- /* Compute world space distance to surface hit. */
- float3 D = ray->D;
- object_inverse_dir_transform(kg, sd, &D);
- D = normalize(D) * ss_isect->hits[0].t;
- object_dir_transform(kg, sd, &D);
- ray->t = len(D);
-#endif
- }
-
- if (bounce == 0) {
- /* Check if we hit the opposite side. */
- if (hit) {
- have_opposite_interface = true;
- opposite_distance = dot(ray->P + ray->t * ray->D - sd->P, -sd->N);
- }
- /* Apart from the opposite side check, we were supposed to only trace up to distance t,
- * so check if there would have been a hit in that case. */
- hit = ray->t < t;
- }
-
- /* Use the distance to the exit point for the throughput update if we found one. */
- if (hit) {
- t = ray->t;
- }
- else if (bounce == 0) {
- /* Restore original position if nothing was hit after the first bounce,
- * without the ray_offset() that was added to avoid self-intersection.
- * Otherwise if that offset is relatively large compared to the scattering
- * radius, we never go back up high enough to exit the surface. */
- ray->P = sd->P;
- }
-
- /* Advance to new scatter location. */
- ray->P += t * ray->D;
-
- float3 transmittance;
- float3 pdf = subsurface_random_walk_pdf(sigma_t, t, hit, &transmittance);
- if (bounce > 0) {
- /* Compute PDF just like we do for classic sampling, but with the stretched sigma_t. */
- float3 guided_pdf = subsurface_random_walk_pdf(forward_stretching * sigma_t, t, hit, NULL);
-
- if (have_opposite_interface) {
- /* First step of MIS: Depending on geometry we might have two methods for guided
- * sampling, so perform MIS between them. */
- float3 back_pdf = subsurface_random_walk_pdf(backward_stretching * sigma_t, t, hit, NULL);
- guided_pdf = mix(
- guided_pdf * forward_pdf_factor, back_pdf * backward_pdf_factor, backward_fraction);
- }
- else {
- /* Just include phase sampling factor otherwise. */
- guided_pdf *= forward_pdf_factor;
- }
-
- /* Now we apply the MIS balance heuristic between the classic and guided sampling. */
- pdf = mix(pdf, guided_pdf, guided_fraction);
- }
-
- /* Finally, we're applying MIS again to combine the three color channels.
- * Altogether, the MIS computation combines up to nine different estimators:
- * {classic, guided, backward_guided} x {r, g, b} */
- throughput *= (hit ? transmittance : sigma_s * transmittance) / dot(channel_pdf, pdf);
-
- if (hit) {
- /* If we hit the surface, we are done. */
- break;
- }
- else if (throughput.x < VOLUME_THROUGHPUT_EPSILON &&
- throughput.y < VOLUME_THROUGHPUT_EPSILON &&
- throughput.z < VOLUME_THROUGHPUT_EPSILON) {
- /* Avoid unnecessary work and precision issue when throughput gets really small. */
- break;
- }
- }
-
- kernel_assert(isfinite_safe(throughput.x) && isfinite_safe(throughput.y) &&
- isfinite_safe(throughput.z));
-
- state->rng_offset = prev_rng_offset;
- state->rng_hash = prev_rng_hash;
-
- /* Return number of hits in ss_isect. */
- if (!hit) {
- return 0;
- }
-
- /* TODO: gain back performance lost from merging with disk BSSRDF. We
- * only need to return on hit so this indirect ray push/pop overhead
- * is not actually needed, but it does keep the code simpler. */
- ss_isect->weight[0] = subsurface_scatter_walk_eval(sd, sc, throughput, all);
-#ifdef __SPLIT_KERNEL__
- ss_isect->ray = *ray;
-#endif
-
- return 1;
-}
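The per-step throughput update above divides by dot(channel_pdf, pdf), which is a balance heuristic over both the distance-sampling technique (classic vs. guided, optionally with the backward-guided mixture) and the three color channels. A minimal scalar sketch of that combination (hypothetical helper, with the per-channel PDFs abstracted away from the kernel state):

/* Illustrative only: balance heuristic over technique and color channel. */
static float effective_pdf(const float classic_pdf[3],
                           const float guided_pdf[3],
                           const float channel_pdf[3],
                           float guided_fraction)
{
  float pdf = 0.0f;
  for (int c = 0; c < 3; c++) {
    const float technique_mix = (1.0f - guided_fraction) * classic_pdf[c] +
                                guided_fraction * guided_pdf[c];
    pdf += channel_pdf[c] * technique_mix; /* Equivalent of dot(channel_pdf, pdf). */
  }
  return pdf;
}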
-
-ccl_device_inline int subsurface_scatter_multi_intersect(KernelGlobals *kg,
- LocalIntersection *ss_isect,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- const ShaderClosure *sc,
- uint *lcg_state,
- float bssrdf_u,
- float bssrdf_v,
- bool all)
-{
- if (CLOSURE_IS_DISK_BSSRDF(sc->type)) {
- return subsurface_scatter_disk(kg, ss_isect, sd, sc, lcg_state, bssrdf_u, bssrdf_v, all);
- }
- else {
- return subsurface_random_walk(kg, ss_isect, sd, state, sc, bssrdf_u, bssrdf_v, all);
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h
index c8e01677d09..bf9b94c1753 100644
--- a/intern/cycles/kernel/kernel_textures.h
+++ b/intern/cycles/kernel/kernel_textures.h
@@ -78,7 +78,7 @@ KERNEL_TEX(KernelShader, __shaders)
KERNEL_TEX(float, __lookup_table)
/* sobol */
-KERNEL_TEX(uint, __sample_pattern_lut)
+KERNEL_TEX(float, __sample_pattern_lut)
/* image textures */
KERNEL_TEX(TextureInfo, __texture_info)
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 7cbe18acf28..66b7310ab65 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __KERNEL_TYPES_H__
-#define __KERNEL_TYPES_H__
+#pragma once
#if !defined(__KERNEL_GPU__) && defined(WITH_EMBREE)
# include <embree3/rtcore.h>
@@ -60,27 +59,9 @@ CCL_NAMESPACE_BEGIN
#define PRIM_NONE (~0)
#define LAMP_NONE (~0)
#define ID_NONE (0.0f)
+#define PASS_UNUSED (~0)
-#define VOLUME_STACK_SIZE 32
-
-/* Split kernel constants */
-#define WORK_POOL_SIZE_GPU 64
-#define WORK_POOL_SIZE_CPU 1
-#ifdef __KERNEL_GPU__
-# define WORK_POOL_SIZE WORK_POOL_SIZE_GPU
-#else
-# define WORK_POOL_SIZE WORK_POOL_SIZE_CPU
-#endif
-
-#define SHADER_SORT_BLOCK_SIZE 2048
-
-#ifdef __KERNEL_OPENCL__
-# define SHADER_SORT_LOCAL_SIZE 64
-#elif defined(__KERNEL_CUDA__)
-# define SHADER_SORT_LOCAL_SIZE 32
-#else
-# define SHADER_SORT_LOCAL_SIZE 1
-#endif
+#define VOLUME_STACK_SIZE 4
/* Kernel features */
#define __SOBOL__
@@ -93,7 +74,7 @@ CCL_NAMESPACE_BEGIN
#define __INTERSECTION_REFINE__
#define __CLAMP_SAMPLE__
#define __PATCH_EVAL__
-#define __SHADOW_TRICKS__
+#define __SHADOW_CATCHER__
#define __DENOISING_FEATURES__
#define __SHADER_RAYTRACE__
#define __AO__
@@ -102,7 +83,6 @@ CCL_NAMESPACE_BEGIN
#define __SVM__
#define __EMISSION__
#define __HOLDOUT__
-#define __MULTI_CLOSURE__
#define __TRANSPARENT_SHADOWS__
#define __BACKGROUND_MIS__
#define __LAMP_MIS__
@@ -112,7 +92,6 @@ CCL_NAMESPACE_BEGIN
#define __PRINCIPLED__
#define __SUBSURFACE__
#define __VOLUME__
-#define __VOLUME_SCATTER__
#define __CMJ__
#define __SHADOW_RECORD_ALL__
#define __BRANCHED_PATH__
@@ -122,106 +101,60 @@ CCL_NAMESPACE_BEGIN
# ifdef WITH_OSL
# define __OSL__
# endif
-# define __VOLUME_DECOUPLED__
# define __VOLUME_RECORD_ALL__
#endif /* __KERNEL_CPU__ */
-#ifdef __KERNEL_CUDA__
-# ifdef __SPLIT_KERNEL__
-# undef __BRANCHED_PATH__
-# endif
-#endif /* __KERNEL_CUDA__ */
-
#ifdef __KERNEL_OPTIX__
# undef __BAKING__
-# undef __BRANCHED_PATH__
#endif /* __KERNEL_OPTIX__ */
-#ifdef __KERNEL_OPENCL__
-#endif /* __KERNEL_OPENCL__ */
-
/* Scene-based selective features compilation. */
-#ifdef __NO_CAMERA_MOTION__
-# undef __CAMERA_MOTION__
-#endif
-#ifdef __NO_OBJECT_MOTION__
-# undef __OBJECT_MOTION__
-#endif
-#ifdef __NO_HAIR__
-# undef __HAIR__
-#endif
-#ifdef __NO_VOLUME__
-# undef __VOLUME__
-# undef __VOLUME_SCATTER__
-#endif
-#ifdef __NO_SUBSURFACE__
-# undef __SUBSURFACE__
-#endif
-#ifdef __NO_BAKING__
-# undef __BAKING__
-#endif
-#ifdef __NO_BRANCHED_PATH__
-# undef __BRANCHED_PATH__
-#endif
-#ifdef __NO_PATCH_EVAL__
-# undef __PATCH_EVAL__
-#endif
-#ifdef __NO_TRANSPARENT__
-# undef __TRANSPARENT_SHADOWS__
-#endif
-#ifdef __NO_SHADOW_TRICKS__
-# undef __SHADOW_TRICKS__
-#endif
-#ifdef __NO_PRINCIPLED__
-# undef __PRINCIPLED__
-#endif
-#ifdef __NO_DENOISING__
-# undef __DENOISING_FEATURES__
-#endif
-#ifdef __NO_SHADER_RAYTRACE__
-# undef __SHADER_RAYTRACE__
+#ifdef __KERNEL_FEATURES__
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_CAMERA_MOTION)
+# undef __CAMERA_MOTION__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_OBJECT_MOTION)
+# undef __OBJECT_MOTION__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_HAIR)
+# undef __HAIR__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_VOLUME)
+# undef __VOLUME__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_SUBSURFACE)
+# undef __SUBSURFACE__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_BAKING)
+# undef __BAKING__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_PATCH_EVALUATION)
+# undef __PATCH_EVAL__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_TRANSPARENT)
+# undef __TRANSPARENT_SHADOWS__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_SHADOW_CATCHER)
+# undef __SHADOW_CATCHER__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_PRINCIPLED)
+# undef __PRINCIPLED__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_DENOISING)
+# undef __DENOISING_FEATURES__
+# endif
#endif
#ifdef WITH_CYCLES_DEBUG_NAN
# define __KERNEL_DEBUG_NAN__
#endif
+/* Features that enable others */
+
#if defined(__SUBSURFACE__) || defined(__SHADER_RAYTRACE__)
# define __BVH_LOCAL__
#endif
-/* Shader Evaluation */
-
-typedef enum ShaderEvalType {
- SHADER_EVAL_DISPLACE,
- SHADER_EVAL_BACKGROUND,
- /* bake types */
- SHADER_EVAL_BAKE, /* no real shade, it's used in the code to
- * differentiate the type of shader eval from the above
- */
- /* data passes */
- SHADER_EVAL_NORMAL,
- SHADER_EVAL_UV,
- SHADER_EVAL_ROUGHNESS,
- SHADER_EVAL_DIFFUSE_COLOR,
- SHADER_EVAL_GLOSSY_COLOR,
- SHADER_EVAL_TRANSMISSION_COLOR,
- SHADER_EVAL_EMISSION,
- SHADER_EVAL_AOV_COLOR,
- SHADER_EVAL_AOV_VALUE,
-
- /* light passes */
- SHADER_EVAL_AO,
- SHADER_EVAL_COMBINED,
- SHADER_EVAL_SHADOW,
- SHADER_EVAL_DIFFUSE,
- SHADER_EVAL_GLOSSY,
- SHADER_EVAL_TRANSMISSION,
-
- /* extra */
- SHADER_EVAL_ENVIRONMENT,
-} ShaderEvalType;
-
/* Path Tracing
* note we need to keep the u/v pairs at even values */
@@ -252,8 +185,7 @@ enum PathTraceDimension {
enum SamplingPattern {
SAMPLING_PATTERN_SOBOL = 0,
- SAMPLING_PATTERN_CMJ = 1,
- SAMPLING_PATTERN_PMJ = 2,
+ SAMPLING_PATTERN_PMJ = 1,
SAMPLING_NUM_PATTERNS,
};
@@ -261,7 +193,12 @@ enum SamplingPattern {
/* these flags values correspond to raytypes in osl.cpp, so keep them in sync! */
enum PathRayFlag {
- /* Ray visibility. */
+ /* --------------------------------------------------------------------
+ * Ray visibility.
+ *
+ * NOTE: Recalculated after a surface bounce.
+ */
+
PATH_RAY_CAMERA = (1 << 0),
PATH_RAY_REFLECT = (1 << 1),
PATH_RAY_TRANSMIT = (1 << 2),
@@ -269,57 +206,106 @@ enum PathRayFlag {
PATH_RAY_GLOSSY = (1 << 4),
PATH_RAY_SINGULAR = (1 << 5),
PATH_RAY_TRANSPARENT = (1 << 6),
+ PATH_RAY_VOLUME_SCATTER = (1 << 7),
/* Shadow ray visibility. */
- PATH_RAY_SHADOW_OPAQUE_NON_CATCHER = (1 << 7),
- PATH_RAY_SHADOW_OPAQUE_CATCHER = (1 << 8),
- PATH_RAY_SHADOW_OPAQUE = (PATH_RAY_SHADOW_OPAQUE_NON_CATCHER | PATH_RAY_SHADOW_OPAQUE_CATCHER),
- PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER = (1 << 9),
- PATH_RAY_SHADOW_TRANSPARENT_CATCHER = (1 << 10),
- PATH_RAY_SHADOW_TRANSPARENT = (PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER |
- PATH_RAY_SHADOW_TRANSPARENT_CATCHER),
- PATH_RAY_SHADOW_NON_CATCHER = (PATH_RAY_SHADOW_OPAQUE_NON_CATCHER |
- PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER),
+ PATH_RAY_SHADOW_OPAQUE = (1 << 8),
+ PATH_RAY_SHADOW_TRANSPARENT = (1 << 9),
PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE | PATH_RAY_SHADOW_TRANSPARENT),
- /* Unused, free to reuse. */
- PATH_RAY_UNUSED = (1 << 11),
+ /* Special flag to tag unaligned BVH nodes.
+ * Only set and used in BVH nodes to distinguish how to interpret bounding box information stored
+ * in the node (either it should be intersected as AABB or as OBB). */
+ PATH_RAY_NODE_UNALIGNED = (1 << 10),
- /* Ray visibility for volume scattering. */
- PATH_RAY_VOLUME_SCATTER = (1 << 12),
-
- /* Special flag to tag unaligned BVH nodes. */
- PATH_RAY_NODE_UNALIGNED = (1 << 13),
+ /* Subset of flags used for ray visibility for intersection.
+ *
+ * NOTE: SHADOW_CATCHER macros below assume there are no more than
+ * 16 visibility bits. */
+ PATH_RAY_ALL_VISIBILITY = ((1 << 11) - 1),
- PATH_RAY_ALL_VISIBILITY = ((1 << 14) - 1),
+ /* --------------------------------------------------------------------
+ * Path flags.
+ */
/* Don't apply multiple importance sampling weights to emission from
* lamp or surface hits, because they were not direct light sampled. */
- PATH_RAY_MIS_SKIP = (1 << 14),
+ PATH_RAY_MIS_SKIP = (1 << 11),
+
/* Diffuse bounce earlier in the path, skip SSS to improve performance
* and avoid branching twice with disk sampling SSS. */
- PATH_RAY_DIFFUSE_ANCESTOR = (1 << 15),
+ PATH_RAY_DIFFUSE_ANCESTOR = (1 << 12),
+
/* Single pass has been written. */
- PATH_RAY_SINGLE_PASS_DONE = (1 << 16),
- /* Ray is behind a shadow catcher. */
- PATH_RAY_SHADOW_CATCHER = (1 << 17),
- /* Store shadow data for shadow catcher or denoising. */
- PATH_RAY_STORE_SHADOW_INFO = (1 << 18),
+ PATH_RAY_SINGLE_PASS_DONE = (1 << 13),
+
/* Zero background alpha, for camera or transparent glass rays. */
- PATH_RAY_TRANSPARENT_BACKGROUND = (1 << 19),
+ PATH_RAY_TRANSPARENT_BACKGROUND = (1 << 14),
+
/* Terminate ray immediately at next bounce. */
- PATH_RAY_TERMINATE_IMMEDIATE = (1 << 20),
+ PATH_RAY_TERMINATE_ON_NEXT_SURFACE = (1 << 15),
+ PATH_RAY_TERMINATE_IN_NEXT_VOLUME = (1 << 16),
+
/* Ray is to be terminated, but continue with transparent bounces and
* emission as long as we encounter them. This is required to make the
* MIS between direct and indirect light rays match, as shadow rays go
* through transparent surfaces to reach emission too. */
- PATH_RAY_TERMINATE_AFTER_TRANSPARENT = (1 << 21),
+ PATH_RAY_TERMINATE_AFTER_TRANSPARENT = (1 << 17),
+
+ /* Terminate ray immediately after volume shading. */
+ PATH_RAY_TERMINATE_AFTER_VOLUME = (1 << 18),
+
/* Ray is to be terminated. */
- PATH_RAY_TERMINATE = (PATH_RAY_TERMINATE_IMMEDIATE | PATH_RAY_TERMINATE_AFTER_TRANSPARENT),
+ PATH_RAY_TERMINATE = (PATH_RAY_TERMINATE_ON_NEXT_SURFACE | PATH_RAY_TERMINATE_IN_NEXT_VOLUME |
+ PATH_RAY_TERMINATE_AFTER_TRANSPARENT | PATH_RAY_TERMINATE_AFTER_VOLUME),
+
/* Path and shader is being evaluated for direct lighting emission. */
- PATH_RAY_EMISSION = (1 << 22)
+ PATH_RAY_EMISSION = (1 << 19),
+
+ /* Perform subsurface scattering. */
+ PATH_RAY_SUBSURFACE = (1 << 20),
+
+ /* Contribute to denoising features. */
+ PATH_RAY_DENOISING_FEATURES = (1 << 21),
+
+ /* Render pass categories. */
+ PATH_RAY_REFLECT_PASS = (1 << 22),
+ PATH_RAY_TRANSMISSION_PASS = (1 << 23),
+ PATH_RAY_VOLUME_PASS = (1 << 24),
+ PATH_RAY_ANY_PASS = (PATH_RAY_REFLECT_PASS | PATH_RAY_TRANSMISSION_PASS | PATH_RAY_VOLUME_PASS),
+
+ /* Shadow ray is for a light or surface. */
+ PATH_RAY_SHADOW_FOR_LIGHT = (1 << 25),
+
+ /* A shadow catcher object was hit and the path was split into two. */
+ PATH_RAY_SHADOW_CATCHER_HIT = (1 << 26),
+
+ /* A shadow catcher object was hit and this path traces only shadow catchers, writing them into
+ * their dedicated pass for later division.
+ *
+ * NOTE: Is not covered with `PATH_RAY_ANY_PASS` because shadow catcher does special handling
+ * which is separate from the light passes. */
+ PATH_RAY_SHADOW_CATCHER_PASS = (1 << 27),
+
+ /* Path is evaluating background for an approximate shadow catcher with non-transparent film. */
+ PATH_RAY_SHADOW_CATCHER_BACKGROUND = (1 << 28),
};
+/* Configure ray visibility bits for rays and objects respectively,
+ * to make shadow catchers work.
+ *
+ * On shadow catcher paths we want to ignore any intersections with non-catchers,
+ * whereas on regular paths we want to intersect all objects. */
+
+#define SHADOW_CATCHER_VISIBILITY_SHIFT(visibility) ((visibility) << 16)
+
+#define SHADOW_CATCHER_PATH_VISIBILITY(path_flag, visibility) \
+ (((path_flag)&PATH_RAY_SHADOW_CATCHER_PASS) ? SHADOW_CATCHER_VISIBILITY_SHIFT(visibility) : \
+ (visibility))
+
+#define SHADOW_CATCHER_OBJECT_VISIBILITY(is_shadow_catcher, visibility) \
+ (((is_shadow_catcher) ? SHADOW_CATCHER_VISIBILITY_SHIFT(visibility) : 0) | (visibility))
+
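A small standalone example (not part of the patch) of how the two macros above interact during intersection, where a hit requires (ray_visibility & object_visibility) != 0: regular rays still see every object, while rays on a shadow-catcher pass only see catcher objects, because only catchers replicate their visibility bits into the upper 16 bits.

#include <stdio.h>

#define SHIFT(v) ((v) << 16) /* Same as SHADOW_CATCHER_VISIBILITY_SHIFT. */

int main(void)
{
  const unsigned int vis = 1u; /* Some visibility bit, e.g. camera rays. */

  const unsigned int catcher_object = SHIFT(vis) | vis; /* Both halves set. */
  const unsigned int regular_object = vis;              /* Lower half only. */

  const unsigned int regular_ray = vis;             /* Regular path. */
  const unsigned int catcher_pass_ray = SHIFT(vis); /* PATH_RAY_SHADOW_CATCHER_PASS path. */

  printf("%d %d\n", (regular_ray & catcher_object) != 0,       /* 1 */
                    (regular_ray & regular_object) != 0);      /* 1 */
  printf("%d %d\n", (catcher_pass_ray & catcher_object) != 0,  /* 1 */
                    (catcher_pass_ray & regular_object) != 0); /* 0 */
  return 0;
}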
/* Closure Label */
typedef enum ClosureLabel {
@@ -332,6 +318,7 @@ typedef enum ClosureLabel {
LABEL_TRANSPARENT = 32,
LABEL_VOLUME_SCATTER = 64,
LABEL_TRANSMIT_TRANSPARENT = 128,
+ LABEL_SUBSURFACE_SCATTER = 256,
} ClosureLabel;
/* Render Passes */
@@ -339,17 +326,35 @@ typedef enum ClosureLabel {
#define PASS_NAME_JOIN(a, b) a##_##b
#define PASSMASK(pass) (1 << ((PASS_NAME_JOIN(PASS, pass)) % 32))
-#define PASSMASK_COMPONENT(comp) \
- (PASSMASK(PASS_NAME_JOIN(comp, DIRECT)) | PASSMASK(PASS_NAME_JOIN(comp, INDIRECT)) | \
- PASSMASK(PASS_NAME_JOIN(comp, COLOR)))
-
+// NOTE: Keep in sync with `Pass::get_type_enum()`.
typedef enum PassType {
PASS_NONE = 0,
- /* Main passes */
+ /* Light Passes */
PASS_COMBINED = 1,
- PASS_DEPTH,
+ PASS_EMISSION,
+ PASS_BACKGROUND,
+ PASS_AO,
+ PASS_SHADOW,
+ PASS_DIFFUSE,
+ PASS_DIFFUSE_DIRECT,
+ PASS_DIFFUSE_INDIRECT,
+ PASS_GLOSSY,
+ PASS_GLOSSY_DIRECT,
+ PASS_GLOSSY_INDIRECT,
+ PASS_TRANSMISSION,
+ PASS_TRANSMISSION_DIRECT,
+ PASS_TRANSMISSION_INDIRECT,
+ PASS_VOLUME,
+ PASS_VOLUME_DIRECT,
+ PASS_VOLUME_INDIRECT,
+ PASS_CATEGORY_LIGHT_END = 31,
+
+ /* Data passes */
+ PASS_DEPTH = 32,
+ PASS_POSITION,
PASS_NORMAL,
+ PASS_ROUGHNESS,
PASS_UV,
PASS_OBJECT_ID,
PASS_MATERIAL_ID,
@@ -361,31 +366,35 @@ typedef enum PassType {
PASS_AOV_VALUE,
PASS_ADAPTIVE_AUX_BUFFER,
PASS_SAMPLE_COUNT,
- PASS_CATEGORY_MAIN_END = 31,
-
- PASS_MIST = 32,
- PASS_EMISSION,
- PASS_BACKGROUND,
- PASS_AO,
- PASS_SHADOW,
- PASS_LIGHT, /* no real pass, used to force use_light_pass */
- PASS_DIFFUSE_DIRECT,
- PASS_DIFFUSE_INDIRECT,
PASS_DIFFUSE_COLOR,
- PASS_GLOSSY_DIRECT,
- PASS_GLOSSY_INDIRECT,
PASS_GLOSSY_COLOR,
- PASS_TRANSMISSION_DIRECT,
- PASS_TRANSMISSION_INDIRECT,
PASS_TRANSMISSION_COLOR,
- PASS_VOLUME_DIRECT = 50,
- PASS_VOLUME_INDIRECT,
/* No Scatter color since it's tricky to define what it would even mean. */
- PASS_CATEGORY_LIGHT_END = 63,
+ PASS_MIST,
+ PASS_DENOISING_NORMAL,
+ PASS_DENOISING_ALBEDO,
+
+ /* PASS_SHADOW_CATCHER accumulates the contribution of the shadow catcher object that is not
+ * affected by any other object. The pass accessor will divide the combined pass by the shadow
+ * catcher; the result of this division is then to be multiplied with the backdrop. The alpha
+ * channel of this pass contains the number of samples which contributed to the color components
+ * of the pass.
+ *
+ * PASS_SHADOW_CATCHER_SAMPLE_COUNT contains the number of samples for which the path split
+ * happened.
+ *
+ * PASS_SHADOW_CATCHER_MATTE contains the non-catcher objects. This pass is to be alpha-overed
+ * onto the backdrop (after multiplication). */
+ PASS_SHADOW_CATCHER,
+ PASS_SHADOW_CATCHER_SAMPLE_COUNT,
+ PASS_SHADOW_CATCHER_MATTE,
+
+ PASS_CATEGORY_DATA_END = 63,
PASS_BAKE_PRIMITIVE,
PASS_BAKE_DIFFERENTIAL,
- PASS_CATEGORY_BAKE_END = 95
+ PASS_CATEGORY_BAKE_END = 95,
+
+ PASS_NUM,
} PassType;
#define PASS_ANY (~0)
@@ -398,158 +407,9 @@ typedef enum CryptomatteType {
CRYPT_ACCURATE = (1 << 3),
} CryptomatteType;
-typedef enum DenoisingPassOffsets {
- DENOISING_PASS_NORMAL = 0,
- DENOISING_PASS_NORMAL_VAR = 3,
- DENOISING_PASS_ALBEDO = 6,
- DENOISING_PASS_ALBEDO_VAR = 9,
- DENOISING_PASS_DEPTH = 12,
- DENOISING_PASS_DEPTH_VAR = 13,
- DENOISING_PASS_SHADOW_A = 14,
- DENOISING_PASS_SHADOW_B = 17,
- DENOISING_PASS_COLOR = 20,
- DENOISING_PASS_COLOR_VAR = 23,
- DENOISING_PASS_CLEAN = 26,
-
- DENOISING_PASS_PREFILTERED_DEPTH = 0,
- DENOISING_PASS_PREFILTERED_NORMAL = 1,
- DENOISING_PASS_PREFILTERED_SHADOWING = 4,
- DENOISING_PASS_PREFILTERED_ALBEDO = 5,
- DENOISING_PASS_PREFILTERED_COLOR = 8,
- DENOISING_PASS_PREFILTERED_VARIANCE = 11,
- DENOISING_PASS_PREFILTERED_INTENSITY = 14,
-
- DENOISING_PASS_SIZE_BASE = 26,
- DENOISING_PASS_SIZE_CLEAN = 3,
- DENOISING_PASS_SIZE_PREFILTERED = 15,
-} DenoisingPassOffsets;
-
-typedef enum eBakePassFilter {
- BAKE_FILTER_NONE = 0,
- BAKE_FILTER_DIRECT = (1 << 0),
- BAKE_FILTER_INDIRECT = (1 << 1),
- BAKE_FILTER_COLOR = (1 << 2),
- BAKE_FILTER_DIFFUSE = (1 << 3),
- BAKE_FILTER_GLOSSY = (1 << 4),
- BAKE_FILTER_TRANSMISSION = (1 << 5),
- BAKE_FILTER_EMISSION = (1 << 6),
- BAKE_FILTER_AO = (1 << 7),
-} eBakePassFilter;
-
-typedef enum BakePassFilterCombos {
- BAKE_FILTER_COMBINED = (BAKE_FILTER_DIRECT | BAKE_FILTER_INDIRECT | BAKE_FILTER_DIFFUSE |
- BAKE_FILTER_GLOSSY | BAKE_FILTER_TRANSMISSION | BAKE_FILTER_EMISSION |
- BAKE_FILTER_AO),
- BAKE_FILTER_DIFFUSE_DIRECT = (BAKE_FILTER_DIRECT | BAKE_FILTER_DIFFUSE),
- BAKE_FILTER_GLOSSY_DIRECT = (BAKE_FILTER_DIRECT | BAKE_FILTER_GLOSSY),
- BAKE_FILTER_TRANSMISSION_DIRECT = (BAKE_FILTER_DIRECT | BAKE_FILTER_TRANSMISSION),
- BAKE_FILTER_DIFFUSE_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_DIFFUSE),
- BAKE_FILTER_GLOSSY_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_GLOSSY),
- BAKE_FILTER_TRANSMISSION_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_TRANSMISSION),
-} BakePassFilterCombos;
-
-typedef enum DenoiseFlag {
- DENOISING_CLEAN_DIFFUSE_DIR = (1 << 0),
- DENOISING_CLEAN_DIFFUSE_IND = (1 << 1),
- DENOISING_CLEAN_GLOSSY_DIR = (1 << 2),
- DENOISING_CLEAN_GLOSSY_IND = (1 << 3),
- DENOISING_CLEAN_TRANSMISSION_DIR = (1 << 4),
- DENOISING_CLEAN_TRANSMISSION_IND = (1 << 5),
- DENOISING_CLEAN_ALL_PASSES = (1 << 6) - 1,
-} DenoiseFlag;
-
-typedef ccl_addr_space struct PathRadianceState {
-#ifdef __PASSES__
- float3 diffuse;
- float3 glossy;
- float3 transmission;
- float3 volume;
-
- float3 direct;
-#endif
-} PathRadianceState;
-
-typedef ccl_addr_space struct PathRadiance {
-#ifdef __PASSES__
- int use_light_pass;
-#endif
-
- float transparent;
- float3 emission;
-#ifdef __PASSES__
- float3 background;
- float3 ao;
-
- float3 indirect;
- float3 direct_emission;
-
- float3 color_diffuse;
- float3 color_glossy;
- float3 color_transmission;
-
- float3 direct_diffuse;
- float3 direct_glossy;
- float3 direct_transmission;
- float3 direct_volume;
-
- float3 indirect_diffuse;
- float3 indirect_glossy;
- float3 indirect_transmission;
- float3 indirect_volume;
-
- float3 shadow;
- float mist;
-#endif
-
- struct PathRadianceState state;
-
-#ifdef __SHADOW_TRICKS__
- /* Total light reachable across the path, ignoring shadow blocked queries. */
- float3 path_total;
- /* Total light reachable across the path with shadow blocked queries
- * applied here.
- *
- * Dividing this figure by path_total will give estimate of shadow pass.
- */
- float3 path_total_shaded;
-
- /* Color of the background on which shadow is alpha-overed. */
- float3 shadow_background_color;
-
- /* Path radiance sum and throughput at the moment when ray hits shadow
- * catcher object.
- */
- float shadow_throughput;
-
- /* Accumulated transparency along the path after shadow catcher bounce. */
- float shadow_transparency;
-
- /* Indicate if any shadow catcher data is set. */
- int has_shadow_catcher;
-#endif
-
-#ifdef __DENOISING_FEATURES__
- float3 denoising_normal;
- float3 denoising_albedo;
- float denoising_depth;
-#endif /* __DENOISING_FEATURES__ */
-} PathRadiance;
-
typedef struct BsdfEval {
-#ifdef __PASSES__
- int use_light_pass;
-#endif
-
float3 diffuse;
-#ifdef __PASSES__
float3 glossy;
- float3 transmission;
- float3 transparent;
- float3 volume;
-#endif
-#ifdef __SHADOW_TRICKS__
- float3 sum_no_mis;
-#endif
} BsdfEval;
/* Shader Flag */
@@ -564,8 +424,10 @@ typedef enum ShaderFlag {
SHADER_EXCLUDE_TRANSMIT = (1 << 25),
SHADER_EXCLUDE_CAMERA = (1 << 24),
SHADER_EXCLUDE_SCATTER = (1 << 23),
+ SHADER_EXCLUDE_SHADOW_CATCHER = (1 << 22),
SHADER_EXCLUDE_ANY = (SHADER_EXCLUDE_DIFFUSE | SHADER_EXCLUDE_GLOSSY | SHADER_EXCLUDE_TRANSMIT |
- SHADER_EXCLUDE_CAMERA | SHADER_EXCLUDE_SCATTER),
+ SHADER_EXCLUDE_CAMERA | SHADER_EXCLUDE_SCATTER |
+ SHADER_EXCLUDE_SHADOW_CATCHER),
SHADER_MASK = ~(SHADER_SMOOTH_NORMAL | SHADER_CAST_SHADOW | SHADER_AREA_LIGHT | SHADER_USE_MIS |
SHADER_EXCLUDE_ANY)
@@ -612,29 +474,14 @@ typedef struct differential {
/* Ray */
typedef struct Ray {
-/* TODO(sergey): This is only needed because current AMD
- * compiler has hard time building the kernel with this
- * reshuffle. And at the same time reshuffle will cause
- * less optimal CPU code in certain places.
- *
- * We'll get rid of this nasty exception once AMD compiler
- * is fixed.
- */
-#ifndef __KERNEL_OPENCL_AMD__
float3 P; /* origin */
float3 D; /* direction */
float t; /* length of the ray */
float time; /* time (for motion blur) */
-#else
- float t; /* length of the ray */
- float time; /* time (for motion blur) */
- float3 P; /* origin */
- float3 D; /* direction */
-#endif
#ifdef __RAY_DIFFERENTIALS__
- differential3 dP;
- differential3 dD;
+ float dP;
+ float dD;
#endif
} Ray;
@@ -661,9 +508,6 @@ typedef enum PrimitiveType {
PRIMITIVE_CURVE_RIBBON = (1 << 4),
PRIMITIVE_MOTION_CURVE_RIBBON = (1 << 5),
PRIMITIVE_VOLUME = (1 << 6),
- /* Lamp primitive is not included below on purpose,
- * since it is no real traceable primitive.
- */
PRIMITIVE_LAMP = (1 << 7),
PRIMITIVE_ALL_TRIANGLE = (PRIMITIVE_TRIANGLE | PRIMITIVE_MOTION_TRIANGLE),
@@ -672,16 +516,14 @@ typedef enum PrimitiveType {
PRIMITIVE_ALL_VOLUME = (PRIMITIVE_VOLUME),
PRIMITIVE_ALL_MOTION = (PRIMITIVE_MOTION_TRIANGLE | PRIMITIVE_MOTION_CURVE_THICK |
PRIMITIVE_MOTION_CURVE_RIBBON),
- PRIMITIVE_ALL = (PRIMITIVE_ALL_TRIANGLE | PRIMITIVE_ALL_CURVE | PRIMITIVE_ALL_VOLUME),
+ PRIMITIVE_ALL = (PRIMITIVE_ALL_TRIANGLE | PRIMITIVE_ALL_CURVE | PRIMITIVE_ALL_VOLUME |
+ PRIMITIVE_LAMP),
- /* Total number of different traceable primitives.
- * NOTE: This is an actual value, not a bitflag.
- */
- PRIMITIVE_NUM_TOTAL = 7,
+ PRIMITIVE_NUM = 8,
} PrimitiveType;
-#define PRIMITIVE_PACK_SEGMENT(type, segment) ((segment << PRIMITIVE_NUM_TOTAL) | (type))
-#define PRIMITIVE_UNPACK_SEGMENT(type) (type >> PRIMITIVE_NUM_TOTAL)
+#define PRIMITIVE_PACK_SEGMENT(type, segment) ((segment << PRIMITIVE_NUM) | (type))
+#define PRIMITIVE_UNPACK_SEGMENT(type) (type >> PRIMITIVE_NUM)
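For illustration (hypothetical usage, not part of the patch): with PRIMITIVE_NUM = 8 the curve segment index lives in the bits above the eight primitive-type flag bits, so packing and unpacking round-trips as expected.

const int packed = PRIMITIVE_PACK_SEGMENT(PRIMITIVE_CURVE_RIBBON, 5); /* (5 << 8) | (1 << 4) */
const int segment = PRIMITIVE_UNPACK_SEGMENT(packed);                 /* 5 */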
typedef enum CurveShapeType {
CURVE_RIBBON = 0,
@@ -760,20 +602,14 @@ typedef struct AttributeDescriptor {
/* Closure data */
-#ifdef __MULTI_CLOSURE__
-# ifdef __SPLIT_KERNEL__
-# define MAX_CLOSURE 1
-# else
-# ifndef __MAX_CLOSURE__
-# define MAX_CLOSURE 64
-# else
-# define MAX_CLOSURE __MAX_CLOSURE__
-# endif
-# endif
+#ifndef __MAX_CLOSURE__
+# define MAX_CLOSURE 64
#else
-# define MAX_CLOSURE 1
+# define MAX_CLOSURE __MAX_CLOSURE__
#endif
+#define MAX_VOLUME_CLOSURE 8
+
/* This struct is the base class for all closures. The common members are
* duplicated in all derived classes since we don't have C++ in the kernel
* yet, and because it lets us lay out the members to minimize padding. The
@@ -866,11 +702,14 @@ enum ShaderDataFlag {
SD_NEED_VOLUME_ATTRIBUTES = (1 << 28),
/* Shader has emission */
SD_HAS_EMISSION = (1 << 29),
+ /* Shader has raytracing */
+ SD_HAS_RAYTRACE = (1 << 30),
SD_SHADER_FLAGS = (SD_USE_MIS | SD_HAS_TRANSPARENT_SHADOW | SD_HAS_VOLUME | SD_HAS_ONLY_VOLUME |
SD_HETEROGENEOUS_VOLUME | SD_HAS_BSSRDF_BUMP | SD_VOLUME_EQUIANGULAR |
SD_VOLUME_MIS | SD_VOLUME_CUBIC | SD_HAS_BUMP | SD_HAS_DISPLACEMENT |
- SD_HAS_CONSTANT_EMISSION | SD_NEED_VOLUME_ATTRIBUTES)
+ SD_HAS_CONSTANT_EMISSION | SD_NEED_VOLUME_ATTRIBUTES | SD_HAS_EMISSION |
+ SD_HAS_RAYTRACE)
};
/* Object flags. */
@@ -955,19 +794,19 @@ typedef ccl_addr_space struct ccl_align(16) ShaderData
#endif
#ifdef __OBJECT_MOTION__
- /* object <-> world space transformations, cached to avoid
- * re-interpolating them constantly for shading */
- Transform ob_tfm;
- Transform ob_itfm;
+ /* Object <-> world space transformations for motion blur, cached to avoid
+ * re-interpolating them constantly for shading. */
+ Transform ob_tfm_motion;
+ Transform ob_itfm_motion;
#endif
/* ray start position, only set for backgrounds */
float3 ray_P;
- differential3 ray_dP;
+ float ray_dP;
#ifdef __OSL__
- struct KernelGlobals *osl_globals;
- struct PathState *osl_path_state;
+ const struct KernelGlobals *osl_globals;
+ const struct IntegratorStateCPU *osl_path_state;
#endif
/* LCG state for closures that require additional random numbers. */
@@ -976,7 +815,6 @@ typedef ccl_addr_space struct ccl_align(16) ShaderData
/* Closure data, we store a fixed array of closures */
int num_closure;
int num_closure_left;
- float randb_closure;
float3 svm_closure_weight;
/* Closure weights summed directly, so we can evaluate
@@ -998,7 +836,22 @@ typedef ccl_addr_space struct ccl_align(16) ShaderDataTinyStorage
ShaderDataTinyStorage;
#define AS_SHADER_DATA(shader_data_tiny_storage) ((ShaderData *)shader_data_tiny_storage)
-/* Path State */
+/* Compact volume closures storage.
+ *
+ * Used for decoupled direct/indirect light closure storage. */
+
+ccl_addr_space struct ShaderVolumeClosure {
+ float3 weight;
+ float sample_weight;
+ float g;
+};
+
+ccl_addr_space struct ShaderVolumePhases {
+ ShaderVolumeClosure closure[MAX_VOLUME_CLOSURE];
+ int num_closure;
+};
+
+/* Volume Stack */
#ifdef __VOLUME__
typedef struct VolumeStack {
@@ -1007,53 +860,6 @@ typedef struct VolumeStack {
} VolumeStack;
#endif
-typedef struct PathState {
- /* see enum PathRayFlag */
- int flag;
-
- /* random number generator state */
- uint rng_hash; /* per pixel hash */
- int rng_offset; /* dimension offset */
- int sample; /* path sample number */
- int num_samples; /* total number of times this path will be sampled */
- float branch_factor; /* number of branches in indirect paths */
-
- /* bounce counting */
- int bounce;
- int diffuse_bounce;
- int glossy_bounce;
- int transmission_bounce;
- int transparent_bounce;
-
-#ifdef __DENOISING_FEATURES__
- float denoising_feature_weight;
- float3 denoising_feature_throughput;
-#endif /* __DENOISING_FEATURES__ */
-
- /* multiple importance sampling */
- float min_ray_pdf; /* smallest bounce pdf over entire path up to now */
- float ray_pdf; /* last bounce pdf */
-#ifdef __LAMP_MIS__
- float ray_t; /* accumulated distance through transparent surfaces */
-#endif
-
- /* volume rendering */
-#ifdef __VOLUME__
- int volume_bounce;
- int volume_bounds_bounce;
- VolumeStack volume_stack[VOLUME_STACK_SIZE];
-#endif
-} PathState;
-
-#ifdef __VOLUME__
-typedef struct VolumeState {
-# ifdef __SPLIT_KERNEL__
-# else
- PathState ps;
-# endif
-} VolumeState;
-#endif
-
/* Struct to gather multiple nearby intersections. */
typedef struct LocalIntersection {
Ray ray;
@@ -1064,20 +870,6 @@ typedef struct LocalIntersection {
float3 Ng[LOCAL_MAX_HITS];
} LocalIntersection;
-/* Subsurface */
-
-/* Struct to gather SSS indirect rays and delay tracing them. */
-typedef struct SubsurfaceIndirectRays {
- PathState state[BSSRDF_MAX_HITS];
-
- int num_rays;
-
- struct Ray rays[BSSRDF_MAX_HITS];
- float3 throughputs[BSSRDF_MAX_HITS];
- struct PathRadianceState L_state[BSSRDF_MAX_HITS];
-} SubsurfaceIndirectRays;
-static_assert(BSSRDF_MAX_HITS <= LOCAL_MAX_HITS, "BSSRDF hits too high.");
-
/* Constant Kernel Data
*
* These structs are passed from CPU to various devices, and the struct layout
@@ -1128,7 +920,7 @@ typedef struct KernelCamera {
/* render size */
float width, height;
- int resolution;
+ int pad1;
/* anamorphic lens bokeh */
float inv_aperture_ratio;
@@ -1169,11 +961,12 @@ typedef struct KernelFilm {
int light_pass_flag;
int pass_stride;
- int use_light_pass;
int pass_combined;
int pass_depth;
+ int pass_position;
int pass_normal;
+ int pass_roughness;
int pass_motion;
int pass_motion_weight;
@@ -1202,7 +995,13 @@ typedef struct KernelFilm {
int pass_shadow;
float pass_shadow_scale;
+
+ int pass_shadow_catcher;
+ int pass_shadow_catcher_sample_count;
+ int pass_shadow_catcher_matte;
+
int filter_table_offset;
+
int cryptomatte_passes;
int cryptomatte_depth;
int pass_cryptomatte;
@@ -1215,15 +1014,11 @@ typedef struct KernelFilm {
float mist_inv_depth;
float mist_falloff;
- int pass_denoising_data;
- int pass_denoising_clean;
- int denoising_flags;
+ int pass_denoising_normal;
+ int pass_denoising_albedo;
int pass_aov_color;
int pass_aov_value;
- int pass_aov_color_num;
- int pass_aov_value_num;
- int pad1, pad2, pad3;
/* XYZ to rendering color space transform. float4 instead of float3 to
* ensure consistent padding/alignment across devices. */
@@ -1234,19 +1029,54 @@ typedef struct KernelFilm {
int pass_bake_primitive;
int pass_bake_differential;
- int pad;
- /* viewport rendering options */
- int display_pass_stride;
- int display_pass_components;
- int display_divide_pass_stride;
- int use_display_exposure;
- int use_display_pass_alpha;
+ int use_approximate_shadow_catcher;
- int pad4, pad5, pad6;
+ int pad1, pad2, pad3;
} KernelFilm;
static_assert_align(KernelFilm, 16);
+typedef struct KernelFilmConvert {
+ int pass_offset;
+ int pass_stride;
+
+ int pass_use_exposure;
+ int pass_use_filter;
+
+ int pass_divide;
+ int pass_indirect;
+
+ int pass_combined;
+ int pass_sample_count;
+ int pass_adaptive_aux_buffer;
+ int pass_motion_weight;
+ int pass_shadow_catcher;
+ int pass_shadow_catcher_sample_count;
+ int pass_shadow_catcher_matte;
+ int pass_background;
+
+ float scale;
+ float exposure;
+ float scale_exposure;
+
+ int use_approximate_shadow_catcher;
+ int use_approximate_shadow_catcher_background;
+ int show_active_pixels;
+
+ /* Number of components to write to. */
+ int num_components;
+
+ /* Number of floats per pixel. When zero, it is the same as `num_components`.
+ * NOTE: Is ignored for the half4 destination. */
+ int pixel_stride;
+
+ int is_denoised;
+
+ /* Padding. */
+ int pad1;
+} KernelFilmConvert;
+static_assert_align(KernelFilmConvert, 16);
+
typedef struct KernelBackground {
/* only shader index */
int surface_shader;
@@ -1255,11 +1085,6 @@ typedef struct KernelBackground {
int transparent;
float transparent_roughness_squared_threshold;
- /* ambient occlusion */
- float ao_factor;
- float ao_distance;
- float ao_bounces_factor;
-
/* portal sampling */
float portal_weight;
int num_portals;
@@ -1277,13 +1102,15 @@ typedef struct KernelBackground {
int map_res_y;
int use_mis;
+
+ /* Padding */
+ int pad1, pad2, pad3;
} KernelBackground;
static_assert_align(KernelBackground, 16);
typedef struct KernelIntegrator {
/* emission */
int use_direct_light;
- int use_ambient_occlusion;
int num_distribution;
int num_all_lights;
float pdf_triangles;
@@ -1299,7 +1126,10 @@ typedef struct KernelIntegrator {
int max_transmission_bounce;
int max_volume_bounce;
+ /* AO bounces */
int ao_bounces;
+ float ao_bounces_distance;
+ float ao_bounces_factor;
/* transparent */
int transparent_min_bounce;
@@ -1318,39 +1148,20 @@ typedef struct KernelIntegrator {
float sample_clamp_direct;
float sample_clamp_indirect;
- /* branched path */
- int branched;
- int volume_decoupled;
- int diffuse_samples;
- int glossy_samples;
- int transmission_samples;
- int ao_samples;
- int mesh_light_samples;
- int subsurface_samples;
- int sample_all_lights_direct;
- int sample_all_lights_indirect;
-
/* mis */
int use_lamp_mis;
/* sampler */
int sampling_pattern;
- int aa_samples;
- int adaptive_min_samples;
- int adaptive_step;
- int adaptive_stop_per_sample;
- float adaptive_threshold;
/* volume render */
int use_volumes;
int volume_max_steps;
float volume_step_rate;
- int volume_samples;
-
- int start_sample;
- int max_closures;
+ int has_shadow_catcher;
+ /* padding */
int pad1, pad2;
} KernelIntegrator;
static_assert_align(KernelIntegrator, 16);
@@ -1401,14 +1212,19 @@ typedef struct KernelTables {
static_assert_align(KernelTables, 16);
typedef struct KernelBake {
+ int use;
int object_index;
int tri_offset;
- int type;
- int pass_filter;
+ int pad1;
} KernelBake;
static_assert_align(KernelBake, 16);
typedef struct KernelData {
+ uint kernel_features;
+ uint max_closures;
+ uint max_shaders;
+ uint pad;
+
KernelCamera cam;
KernelFilm film;
KernelBackground background;
@@ -1485,11 +1301,10 @@ typedef struct KernelLight {
int type;
float co[3];
int shader_id;
- int samples;
float max_bounces;
float random;
float strength[3];
- float pad1;
+ float pad1, pad2;
Transform tfm;
Transform itfm;
union {
@@ -1539,110 +1354,6 @@ typedef struct KernelShader {
} KernelShader;
static_assert_align(KernelShader, 16);
-/* Declarations required for split kernel */
-
-/* Macro for queues */
-/* Value marking queue's empty slot */
-#define QUEUE_EMPTY_SLOT -1
-
-/*
- * Queue 1 - Active rays
- * Queue 2 - Background queue
- * Queue 3 - Shadow ray cast kernel - AO
- * Queue 4 - Shadow ray cast kernel - direct lighting
- */
-
-/* Queue names */
-enum QueueNumber {
- /* All active rays and regenerated rays are enqueued here. */
- QUEUE_ACTIVE_AND_REGENERATED_RAYS = 0,
-
- /* All
- * 1. Background-hit rays,
- * 2. Rays that has exited path-iteration but needs to update output buffer
- * 3. Rays to be regenerated
- * are enqueued here.
- */
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
-
- /* All rays for which a shadow ray should be cast to determine radiance
- * contribution for AO are enqueued here.
- */
- QUEUE_SHADOW_RAY_CAST_AO_RAYS,
-
- /* All rays for which a shadow ray should be cast to determine radiance
- * contributing for direct lighting are enqueued here.
- */
- QUEUE_SHADOW_RAY_CAST_DL_RAYS,
-
- /* Rays sorted according to shader->id */
- QUEUE_SHADER_SORTED_RAYS,
-
-#ifdef __BRANCHED_PATH__
- /* All rays moving to next iteration of the indirect loop for light */
- QUEUE_LIGHT_INDIRECT_ITER,
- /* Queue of all inactive rays. These are candidates for sharing work of indirect loops */
- QUEUE_INACTIVE_RAYS,
-# ifdef __VOLUME__
- /* All rays moving to next iteration of the indirect loop for volumes */
- QUEUE_VOLUME_INDIRECT_ITER,
-# endif
-# ifdef __SUBSURFACE__
- /* All rays moving to next iteration of the indirect loop for subsurface */
- QUEUE_SUBSURFACE_INDIRECT_ITER,
-# endif
-#endif /* __BRANCHED_PATH__ */
-
- NUM_QUEUES
-};
-
-/* We use RAY_STATE_MASK to get ray_state */
-#define RAY_STATE_MASK 0x0F
-#define RAY_FLAG_MASK 0xF0
-enum RayState {
- RAY_INVALID = 0,
- /* Denotes ray is actively involved in path-iteration. */
- RAY_ACTIVE,
- /* Denotes ray has completed processing all samples and is inactive. */
- RAY_INACTIVE,
- /* Denotes ray has exited path-iteration and needs to update output buffer. */
- RAY_UPDATE_BUFFER,
- /* Denotes ray needs to skip most surface shader work. */
- RAY_HAS_ONLY_VOLUME,
- /* Denotes ray has hit background */
- RAY_HIT_BACKGROUND,
- /* Denotes ray has to be regenerated */
- RAY_TO_REGENERATE,
- /* Denotes ray has been regenerated */
- RAY_REGENERATED,
- /* Denotes ray is moving to next iteration of the branched indirect loop */
- RAY_LIGHT_INDIRECT_NEXT_ITER,
- RAY_VOLUME_INDIRECT_NEXT_ITER,
- RAY_SUBSURFACE_INDIRECT_NEXT_ITER,
-
- /* Ray flags */
-
- /* Flags to denote that the ray is currently evaluating the branched indirect loop */
- RAY_BRANCHED_LIGHT_INDIRECT = (1 << 4),
- RAY_BRANCHED_VOLUME_INDIRECT = (1 << 5),
- RAY_BRANCHED_SUBSURFACE_INDIRECT = (1 << 6),
- RAY_BRANCHED_INDIRECT = (RAY_BRANCHED_LIGHT_INDIRECT | RAY_BRANCHED_VOLUME_INDIRECT |
- RAY_BRANCHED_SUBSURFACE_INDIRECT),
-
- /* Ray is evaluating an iteration of an indirect loop for another thread */
- RAY_BRANCHED_INDIRECT_SHARED = (1 << 7),
-};
-
-#define ASSIGN_RAY_STATE(ray_state, ray_index, state) \
- (ray_state[ray_index] = ((ray_state[ray_index] & RAY_FLAG_MASK) | state))
-#define IS_STATE(ray_state, ray_index, state) \
- ((ray_index) != QUEUE_EMPTY_SLOT && ((ray_state)[(ray_index)] & RAY_STATE_MASK) == (state))
-#define ADD_RAY_FLAG(ray_state, ray_index, flag) \
- (ray_state[ray_index] = (ray_state[ray_index] | flag))
-#define REMOVE_RAY_FLAG(ray_state, ray_index, flag) \
- (ray_state[ray_index] = (ray_state[ray_index] & (~flag)))
-#define IS_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] & flag)
-
/* Patches */
#define PATCH_MAX_CONTROL_VERTS 16
@@ -1655,7 +1366,7 @@ enum RayState {
/* Work Tiles */
-typedef struct WorkTile {
+typedef struct KernelWorkTile {
uint x, y, w, h;
uint start_sample;
@@ -1664,13 +1375,172 @@ typedef struct WorkTile {
int offset;
uint stride;
- ccl_global float *buffer;
-} WorkTile;
+ /* Precalculated parameters used by init_from_camera kernel on GPU. */
+ int path_index_offset;
+ int work_size;
+} KernelWorkTile;
+
+/* Shader Evaluation.
+ *
+ * Position on a primitive on an object at which we want to evaluate the
+ * shader for e.g. mesh displacement or light importance map. */
+
+typedef struct KernelShaderEvalInput {
+ int object;
+ int prim;
+ float u, v;
+} KernelShaderEvalInput;
+static_assert_align(KernelShaderEvalInput, 16);
/* Pre-computed sample table sizes for PMJ02 sampler. */
-#define NUM_PMJ_SAMPLES (64 * 64)
-#define NUM_PMJ_PATTERNS 48
+#define NUM_PMJ_DIVISIONS 32
+#define NUM_PMJ_SAMPLES ((NUM_PMJ_DIVISIONS) * (NUM_PMJ_DIVISIONS))
+#define NUM_PMJ_PATTERNS 1
-CCL_NAMESPACE_END
+/* Device kernels.
+ *
+ * Identifier for kernels that can be executed in device queues.
+ *
+ * Some implementation details.
+ *
+ * If the kernel uses shared CUDA memory, `CUDADeviceQueue::enqueue` is to be modified.
+ * The path iteration kernels are handled in `PathTraceWorkGPU::enqueue_path_iteration`. */
+
+typedef enum DeviceKernel {
+ DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA = 0,
+ DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE,
+ DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST,
+ DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW,
+ DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE,
+ DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW,
+ DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL,
+
+ DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY,
+ DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY,
+ DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY,
+ DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY,
+ DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY,
+ DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY,
+ DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES,
+ DEVICE_KERNEL_INTEGRATOR_RESET,
+ DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS,
+
+ DEVICE_KERNEL_SHADER_EVAL_DISPLACE,
+ DEVICE_KERNEL_SHADER_EVAL_BACKGROUND,
+
+#define DECLARE_FILM_CONVERT_KERNEL(variant) \
+ DEVICE_KERNEL_FILM_CONVERT_##variant, DEVICE_KERNEL_FILM_CONVERT_##variant##_HALF_RGBA
+
+ DECLARE_FILM_CONVERT_KERNEL(DEPTH),
+ DECLARE_FILM_CONVERT_KERNEL(MIST),
+ DECLARE_FILM_CONVERT_KERNEL(SAMPLE_COUNT),
+ DECLARE_FILM_CONVERT_KERNEL(FLOAT),
+ DECLARE_FILM_CONVERT_KERNEL(LIGHT_PATH),
+ DECLARE_FILM_CONVERT_KERNEL(FLOAT3),
+ DECLARE_FILM_CONVERT_KERNEL(MOTION),
+ DECLARE_FILM_CONVERT_KERNEL(CRYPTOMATTE),
+ DECLARE_FILM_CONVERT_KERNEL(SHADOW_CATCHER),
+ DECLARE_FILM_CONVERT_KERNEL(SHADOW_CATCHER_MATTE_WITH_SHADOW),
+ DECLARE_FILM_CONVERT_KERNEL(COMBINED),
+ DECLARE_FILM_CONVERT_KERNEL(FLOAT4),
+
+#undef DECLARE_FILM_CONVERT_KERNEL
+
+ DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK,
+ DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X,
+ DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y,
+
+ DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS,
+ DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO,
+ DEVICE_KERNEL_FILTER_COLOR_PREPROCESS,
+ DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS,
+
+ DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS,
+
+ DEVICE_KERNEL_PREFIX_SUM,
+
+ DEVICE_KERNEL_NUM,
+} DeviceKernel;
+
+enum {
+ DEVICE_KERNEL_INTEGRATOR_NUM = DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL + 1,
+};
+
+/* Kernel Features */
+
+enum KernelFeatureFlag : unsigned int {
+ /* Shader nodes. */
+ KERNEL_FEATURE_NODE_BSDF = (1U << 0U),
+ KERNEL_FEATURE_NODE_EMISSION = (1U << 1U),
+ KERNEL_FEATURE_NODE_VOLUME = (1U << 2U),
+ KERNEL_FEATURE_NODE_HAIR = (1U << 3U),
+ KERNEL_FEATURE_NODE_BUMP = (1U << 4U),
+ KERNEL_FEATURE_NODE_BUMP_STATE = (1U << 5U),
+ KERNEL_FEATURE_NODE_VORONOI_EXTRA = (1U << 6U),
+ KERNEL_FEATURE_NODE_RAYTRACE = (1U << 7U),
+
+ /* Use denoising kernels and output denoising passes. */
+ KERNEL_FEATURE_DENOISING = (1U << 8U),
+
+ /* Use path tracing kernels. */
+ KERNEL_FEATURE_PATH_TRACING = (1U << 9U),
-#endif /* __KERNEL_TYPES_H__ */
+ /* BVH/sampling kernel features. */
+ KERNEL_FEATURE_HAIR = (1U << 10U),
+ KERNEL_FEATURE_HAIR_THICK = (1U << 11U),
+ KERNEL_FEATURE_OBJECT_MOTION = (1U << 12U),
+ KERNEL_FEATURE_CAMERA_MOTION = (1U << 13U),
+
+ /* Denotes whether baking functionality is needed. */
+ KERNEL_FEATURE_BAKING = (1U << 14U),
+
+ /* Use subsurface scattering materials. */
+ KERNEL_FEATURE_SUBSURFACE = (1U << 15U),
+
+ /* Use volume materials. */
+ KERNEL_FEATURE_VOLUME = (1U << 16U),
+
+ /* Use OpenSubdiv patch evaluation */
+ KERNEL_FEATURE_PATCH_EVALUATION = (1U << 17U),
+
+ /* Use Transparent shadows */
+ KERNEL_FEATURE_TRANSPARENT = (1U << 18U),
+
+ /* Use shadow catcher. */
+ KERNEL_FEATURE_SHADOW_CATCHER = (1U << 19U),
+
+ /* Per-uber shader usage flags. */
+ KERNEL_FEATURE_PRINCIPLED = (1U << 20U),
+
+ /* Light render passes. */
+ KERNEL_FEATURE_LIGHT_PASSES = (1U << 21U),
+
+ /* Shadow render pass. */
+ KERNEL_FEATURE_SHADOW_PASS = (1U << 22U),
+};
+
+/* Shader node feature mask, to specialize shader evaluation for kernels. */
+
+#define KERNEL_FEATURE_NODE_MASK_SURFACE_LIGHT \
+ (KERNEL_FEATURE_NODE_EMISSION | KERNEL_FEATURE_NODE_VORONOI_EXTRA)
+#define KERNEL_FEATURE_NODE_MASK_SURFACE_SHADOW \
+ (KERNEL_FEATURE_NODE_BSDF | KERNEL_FEATURE_NODE_EMISSION | KERNEL_FEATURE_NODE_VOLUME | \
+ KERNEL_FEATURE_NODE_HAIR | KERNEL_FEATURE_NODE_BUMP | KERNEL_FEATURE_NODE_BUMP_STATE | \
+ KERNEL_FEATURE_NODE_VORONOI_EXTRA)
+#define KERNEL_FEATURE_NODE_MASK_SURFACE \
+ (KERNEL_FEATURE_NODE_MASK_SURFACE_SHADOW | KERNEL_FEATURE_NODE_RAYTRACE)
+#define KERNEL_FEATURE_NODE_MASK_VOLUME \
+ (KERNEL_FEATURE_NODE_EMISSION | KERNEL_FEATURE_NODE_VOLUME | KERNEL_FEATURE_NODE_VORONOI_EXTRA)
+#define KERNEL_FEATURE_NODE_MASK_DISPLACEMENT \
+ (KERNEL_FEATURE_NODE_VORONOI_EXTRA | KERNEL_FEATURE_NODE_BUMP | KERNEL_FEATURE_NODE_BUMP_STATE)
+#define KERNEL_FEATURE_NODE_MASK_BUMP KERNEL_FEATURE_NODE_MASK_DISPLACEMENT
+
+#define KERNEL_NODES_FEATURE(feature) ((node_feature_mask & (KERNEL_FEATURE_NODE_##feature)) != 0U)
+
+CCL_NAMESPACE_END
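
Editorial note: for readers unfamiliar with the X-macro pattern used in the new DeviceKernel enum above, the following standalone C++ sketch shows how DECLARE_FILM_CONVERT_KERNEL expands into two enum entries per variant and how a node-feature-mask test (in the spirit of KERNEL_NODES_FEATURE) can compile shader node groups out of a kernel variant. Only the macro/flag idea comes from the patch; the MINI_* names and the harness are hypothetical.

  // Sketch: film-convert X-macro expansion and a compile-time node feature test.
  // Illustrative only; not the Cycles headers.
  #include <cstdio>

  enum MiniKernel {
  #define DECLARE_FILM_CONVERT_KERNEL(variant) \
    MINI_FILM_CONVERT_##variant, MINI_FILM_CONVERT_##variant##_HALF_RGBA
    DECLARE_FILM_CONVERT_KERNEL(DEPTH),    /* expands to two enum entries */
    DECLARE_FILM_CONVERT_KERNEL(COMBINED), /* two more entries */
  #undef DECLARE_FILM_CONVERT_KERNEL
    MINI_KERNEL_NUM,
  };

  /* Per-kernel node feature mask, mirroring the KernelFeatureFlag idea above. */
  constexpr unsigned NODE_EMISSION = 1u << 1;
  constexpr unsigned NODE_VOLUME = 1u << 2;

  template<unsigned node_feature_mask> float eval_emission(float strength)
  {
    if constexpr ((node_feature_mask & NODE_EMISSION) != 0u) {
      return strength; /* Emission nodes compiled in for this kernel variant. */
    }
    return 0.0f;       /* Emission nodes compiled out. */
  }

  int main()
  {
    printf("%d kernels declared\n", int(MINI_KERNEL_NUM)); /* prints 4 */
    printf("%g\n", eval_emission<NODE_EMISSION | NODE_VOLUME>(2.0f));
  }
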
diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h
deleted file mode 100644
index f6b34be040e..00000000000
--- a/intern/cycles/kernel/kernel_volume.h
+++ /dev/null
@@ -1,1440 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* Ignore paths that have volume throughput below this value, to avoid unnecessary work
- * and precision issues.
- * todo: this value could be tweaked or turned into a probability to avoid unnecessary
- * work in volumes and subsurface scattering. */
-#define VOLUME_THROUGHPUT_EPSILON 1e-6f
-
-/* Events for probabilistic scattering */
-
-typedef enum VolumeIntegrateResult {
- VOLUME_PATH_SCATTERED = 0,
- VOLUME_PATH_ATTENUATED = 1,
- VOLUME_PATH_MISSED = 2
-} VolumeIntegrateResult;
-
-/* Volume shader properties
- *
- * extinction coefficient = absorption coefficient + scattering coefficient
- * sigma_t = sigma_a + sigma_s */
-
-typedef struct VolumeShaderCoefficients {
- float3 sigma_t;
- float3 sigma_s;
- float3 emission;
-} VolumeShaderCoefficients;
-
-#ifdef __VOLUME__
-
-/* evaluate shader to get extinction coefficient at P */
-ccl_device_inline bool volume_shader_extinction_sample(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- float3 P,
- float3 *extinction)
-{
- sd->P = P;
- shader_eval_volume(kg, sd, state, state->volume_stack, PATH_RAY_SHADOW);
-
- if (sd->flag & SD_EXTINCTION) {
- const float density = object_volume_density(kg, sd->object);
- *extinction = sd->closure_transparent_extinction * density;
- return true;
- }
- else {
- return false;
- }
-}
-
-/* evaluate shader to get absorption, scattering and emission at P */
-ccl_device_inline bool volume_shader_sample(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- float3 P,
- VolumeShaderCoefficients *coeff)
-{
- sd->P = P;
- shader_eval_volume(kg, sd, state, state->volume_stack, state->flag);
-
- if (!(sd->flag & (SD_EXTINCTION | SD_SCATTER | SD_EMISSION)))
- return false;
-
- coeff->sigma_s = zero_float3();
- coeff->sigma_t = (sd->flag & SD_EXTINCTION) ? sd->closure_transparent_extinction : zero_float3();
- coeff->emission = (sd->flag & SD_EMISSION) ? sd->closure_emission_background : zero_float3();
-
- if (sd->flag & SD_SCATTER) {
- for (int i = 0; i < sd->num_closure; i++) {
- const ShaderClosure *sc = &sd->closure[i];
-
- if (CLOSURE_IS_VOLUME(sc->type))
- coeff->sigma_s += sc->weight;
- }
- }
-
- const float density = object_volume_density(kg, sd->object);
- coeff->sigma_s *= density;
- coeff->sigma_t *= density;
- coeff->emission *= density;
-
- return true;
-}
-
-#endif /* __VOLUME__ */
-
-ccl_device float3 volume_color_transmittance(float3 sigma, float t)
-{
- return exp3(-sigma * t);
-}
-
-ccl_device float kernel_volume_channel_get(float3 value, int channel)
-{
- return (channel == 0) ? value.x : ((channel == 1) ? value.y : value.z);
-}
-
-#ifdef __VOLUME__
-
-ccl_device float volume_stack_step_size(KernelGlobals *kg, ccl_addr_space VolumeStack *stack)
-{
- float step_size = FLT_MAX;
-
- for (int i = 0; stack[i].shader != SHADER_NONE; i++) {
- int shader_flag = kernel_tex_fetch(__shaders, (stack[i].shader & SHADER_MASK)).flags;
-
- bool heterogeneous = false;
-
- if (shader_flag & SD_HETEROGENEOUS_VOLUME) {
- heterogeneous = true;
- }
- else if (shader_flag & SD_NEED_VOLUME_ATTRIBUTES) {
- /* We want to render world or objects without any volume grids
- * as homogeneous, but can only verify this at run-time since other
- * heterogeneous volume objects may be using the same shader. */
- int object = stack[i].object;
- if (object != OBJECT_NONE) {
- int object_flag = kernel_tex_fetch(__object_flag, object);
- if (object_flag & SD_OBJECT_HAS_VOLUME_ATTRIBUTES) {
- heterogeneous = true;
- }
- }
- }
-
- if (heterogeneous) {
- float object_step_size = object_volume_step_size(kg, stack[i].object);
- object_step_size *= kernel_data.integrator.volume_step_rate;
- step_size = fminf(object_step_size, step_size);
- }
- }
-
- return step_size;
-}
-
-ccl_device int volume_stack_sampling_method(KernelGlobals *kg, VolumeStack *stack)
-{
- if (kernel_data.integrator.num_all_lights == 0)
- return 0;
-
- int method = -1;
-
- for (int i = 0; stack[i].shader != SHADER_NONE; i++) {
- int shader_flag = kernel_tex_fetch(__shaders, (stack[i].shader & SHADER_MASK)).flags;
-
- if (shader_flag & SD_VOLUME_MIS) {
- return SD_VOLUME_MIS;
- }
- else if (shader_flag & SD_VOLUME_EQUIANGULAR) {
- if (method == 0)
- return SD_VOLUME_MIS;
-
- method = SD_VOLUME_EQUIANGULAR;
- }
- else {
- if (method == SD_VOLUME_EQUIANGULAR)
- return SD_VOLUME_MIS;
-
- method = 0;
- }
- }
-
- return method;
-}
-
-ccl_device_inline void kernel_volume_step_init(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- const float object_step_size,
- float t,
- float *step_size,
- float *step_shade_offset,
- float *steps_offset)
-{
- const int max_steps = kernel_data.integrator.volume_max_steps;
- float step = min(object_step_size, t);
-
- /* compute exact steps in advance for malloc */
- if (t > max_steps * step) {
- step = t / (float)max_steps;
- }
-
- *step_size = step;
-
- /* Perform shading at this offset within a step, to integrate over
-   * the entire step segment. */
- *step_shade_offset = path_state_rng_1D_hash(kg, state, 0x1e31d8a4);
-
-  /* Shift the starting point of all segments by this random amount to avoid
- * banding artifacts from the volume bounding shape. */
- *steps_offset = path_state_rng_1D_hash(kg, state, 0x3d22c7b3);
-}
-
-/* Volume Shadows
- *
- * These functions are used to attenuate shadow rays to lights. Both absorption
- * and scattering will block light, represented by the extinction coefficient. */
-
-/* homogeneous volume: assume shader evaluation at the start gives
- * the extinction coefficient for the entire line segment */
-ccl_device void kernel_volume_shadow_homogeneous(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- Ray *ray,
- ShaderData *sd,
- float3 *throughput)
-{
- float3 sigma_t = zero_float3();
-
- if (volume_shader_extinction_sample(kg, sd, state, ray->P, &sigma_t))
- *throughput *= volume_color_transmittance(sigma_t, ray->t);
-}
-
-/* heterogeneous volume: integrate stepping through the volume until we
- * reach the end, get absorbed entirely, or run out of iterations */
-ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- Ray *ray,
- ShaderData *sd,
- float3 *throughput,
- const float object_step_size)
-{
- float3 tp = *throughput;
-
- /* Prepare for stepping.
- * For shadows we do not offset all segments, since the starting point is
- * already a random distance inside the volume. It also appears to create
- * banding artifacts for unknown reasons. */
- int max_steps = kernel_data.integrator.volume_max_steps;
- float step_size, step_shade_offset, unused;
- kernel_volume_step_init(
- kg, state, object_step_size, ray->t, &step_size, &step_shade_offset, &unused);
- const float steps_offset = 1.0f;
-
- /* compute extinction at the start */
- float t = 0.0f;
-
- float3 sum = zero_float3();
-
- for (int i = 0; i < max_steps; i++) {
- /* advance to new position */
- float new_t = min(ray->t, (i + steps_offset) * step_size);
- float dt = new_t - t;
-
- float3 new_P = ray->P + ray->D * (t + dt * step_shade_offset);
- float3 sigma_t = zero_float3();
-
- /* compute attenuation over segment */
- if (volume_shader_extinction_sample(kg, sd, state, new_P, &sigma_t)) {
- /* Compute expf() only for every Nth step, to save some calculations
- * because exp(a)*exp(b) = exp(a+b), also do a quick VOLUME_THROUGHPUT_EPSILON
- * check then. */
- sum += (-sigma_t * dt);
- if ((i & 0x07) == 0) { /* ToDo: Other interval? */
- tp = *throughput * exp3(sum);
-
- /* stop if nearly all light is blocked */
- if (tp.x < VOLUME_THROUGHPUT_EPSILON && tp.y < VOLUME_THROUGHPUT_EPSILON &&
- tp.z < VOLUME_THROUGHPUT_EPSILON)
- break;
- }
- }
-
- /* stop if at the end of the volume */
- t = new_t;
- if (t == ray->t) {
- /* Update throughput in case we haven't done it above */
- tp = *throughput * exp3(sum);
- break;
- }
- }
-
- *throughput = tp;
-}
-
-/* get the volume attenuation over line segment defined by ray, with the
- * assumption that there are no surfaces blocking light between the endpoints */
-# if defined(__KERNEL_OPTIX__) && defined(__SHADER_RAYTRACE__)
-ccl_device_inline void kernel_volume_shadow(KernelGlobals *kg,
- ShaderData *shadow_sd,
- ccl_addr_space PathState *state,
- Ray *ray,
- float3 *throughput)
-{
- optixDirectCall<void>(1, kg, shadow_sd, state, ray, throughput);
-}
-extern "C" __device__ void __direct_callable__kernel_volume_shadow(
-# else
-ccl_device_noinline void kernel_volume_shadow(
-# endif
- KernelGlobals *kg,
- ShaderData *shadow_sd,
- ccl_addr_space PathState *state,
- Ray *ray,
- float3 *throughput)
-{
- shader_setup_from_volume(kg, shadow_sd, ray);
-
- float step_size = volume_stack_step_size(kg, state->volume_stack);
- if (step_size != FLT_MAX)
- kernel_volume_shadow_heterogeneous(kg, state, ray, shadow_sd, throughput, step_size);
- else
- kernel_volume_shadow_homogeneous(kg, state, ray, shadow_sd, throughput);
-}
-
-#endif /* __VOLUME__ */
-
-/* Equi-angular sampling as in:
- * "Importance Sampling Techniques for Path Tracing in Participating Media" */
-
-ccl_device float kernel_volume_equiangular_sample(Ray *ray, float3 light_P, float xi, float *pdf)
-{
- float t = ray->t;
-
- float delta = dot((light_P - ray->P), ray->D);
- float D = safe_sqrtf(len_squared(light_P - ray->P) - delta * delta);
- if (UNLIKELY(D == 0.0f)) {
- *pdf = 0.0f;
- return 0.0f;
- }
- float theta_a = -atan2f(delta, D);
- float theta_b = atan2f(t - delta, D);
- float t_ = D * tanf((xi * theta_b) + (1 - xi) * theta_a);
- if (UNLIKELY(theta_b == theta_a)) {
- *pdf = 0.0f;
- return 0.0f;
- }
- *pdf = D / ((theta_b - theta_a) * (D * D + t_ * t_));
-
- return min(t, delta + t_); /* min is only for float precision errors */
-}
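
Editorial note: as a standalone illustration of the equi-angular sampling deleted above, the distance along the ray is drawn as t' = D * tan((1 - xi) * theta_a + xi * theta_b) with pdf D / ((theta_b - theta_a) * (D^2 + t'^2)). A minimal sketch of the same math with plain floats, independent of the Cycles Ray/float3 types:

  // Equi-angular distance sampling on a ray segment [0, t_max], for a light at
  // perpendicular distance D and signed offset delta along the ray. Mirrors the
  // math of kernel_volume_equiangular_sample(); not the Cycles API.
  #include <cmath>
  #include <cstdio>

  static float equiangular_sample(float t_max, float delta, float D, float xi, float *pdf)
  {
    const float theta_a = -atan2f(delta, D);
    const float theta_b = atan2f(t_max - delta, D);
    const float t_ = D * tanf(xi * theta_b + (1.0f - xi) * theta_a);
    *pdf = D / ((theta_b - theta_a) * (D * D + t_ * t_));
    return fminf(t_max, delta + t_); /* clamp for float precision */
  }

  int main()
  {
    float pdf;
    const float t = equiangular_sample(10.0f, 4.0f, 2.0f, 0.37f, &pdf);
    printf("sampled t = %f, pdf = %f\n", t, pdf);
  }
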
-
-ccl_device float kernel_volume_equiangular_pdf(Ray *ray, float3 light_P, float sample_t)
-{
- float delta = dot((light_P - ray->P), ray->D);
- float D = safe_sqrtf(len_squared(light_P - ray->P) - delta * delta);
- if (UNLIKELY(D == 0.0f)) {
- return 0.0f;
- }
-
- float t = ray->t;
- float t_ = sample_t - delta;
-
- float theta_a = -atan2f(delta, D);
- float theta_b = atan2f(t - delta, D);
- if (UNLIKELY(theta_b == theta_a)) {
- return 0.0f;
- }
-
- float pdf = D / ((theta_b - theta_a) * (D * D + t_ * t_));
-
- return pdf;
-}
-
-/* Distance sampling */
-
-ccl_device float kernel_volume_distance_sample(
- float max_t, float3 sigma_t, int channel, float xi, float3 *transmittance, float3 *pdf)
-{
- /* xi is [0, 1[ so log(0) should never happen, division by zero is
- * avoided because sample_sigma_t > 0 when SD_SCATTER is set */
- float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel);
- float3 full_transmittance = volume_color_transmittance(sigma_t, max_t);
- float sample_transmittance = kernel_volume_channel_get(full_transmittance, channel);
-
- float sample_t = min(max_t, -logf(1.0f - xi * (1.0f - sample_transmittance)) / sample_sigma_t);
-
- *transmittance = volume_color_transmittance(sigma_t, sample_t);
- *pdf = safe_divide_color(sigma_t * *transmittance, one_float3() - full_transmittance);
-
- /* todo: optimization: when taken together with hit/miss decision,
-   * the full_transmittance cancels out and xi does not
- * need to be remapped */
-
- return sample_t;
-}
-
-ccl_device float3 kernel_volume_distance_pdf(float max_t, float3 sigma_t, float sample_t)
-{
- float3 full_transmittance = volume_color_transmittance(sigma_t, max_t);
- float3 transmittance = volume_color_transmittance(sigma_t, sample_t);
-
- return safe_divide_color(sigma_t * transmittance, one_float3() - full_transmittance);
-}
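
Editorial note: the closed-form CDF inversion used by kernel_volume_distance_sample() above is worth spelling out. With extinction sigma over a segment of length t_max and full-segment transmittance T = exp(-sigma * t_max), the distance is drawn from the truncated exponential via t = -ln(1 - xi * (1 - T)) / sigma, with pdf sigma * exp(-sigma * t) / (1 - T). A single-channel sketch under those assumptions (plain floats, not the Cycles float3 types):

  // Truncated-exponential distance sampling, matching the per-channel math in
  // kernel_volume_distance_sample()/_pdf() above. Illustrative only.
  #include <cmath>
  #include <cstdio>

  static float distance_sample(float t_max, float sigma_t, float xi, float *pdf)
  {
    const float T_full = expf(-sigma_t * t_max);            /* transmittance over the whole segment */
    const float t = -logf(1.0f - xi * (1.0f - T_full)) / sigma_t;
    *pdf = sigma_t * expf(-sigma_t * t) / (1.0f - T_full);  /* normalized over [0, t_max] */
    return fminf(t, t_max);
  }

  int main()
  {
    float pdf;
    const float t = distance_sample(5.0f, 0.8f, 0.42f, &pdf);
    printf("t = %f, pdf = %f\n", t, pdf);
  }
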
-
-/* Emission */
-
-ccl_device float3 kernel_volume_emission_integrate(VolumeShaderCoefficients *coeff,
- int closure_flag,
- float3 transmittance,
- float t)
-{
- /* integral E * exp(-sigma_t * t) from 0 to t = E * (1 - exp(-sigma_t * t))/sigma_t
- * this goes to E * t as sigma_t goes to zero
- *
- * todo: we should use an epsilon to avoid precision issues near zero sigma_t */
- float3 emission = coeff->emission;
-
- if (closure_flag & SD_EXTINCTION) {
- float3 sigma_t = coeff->sigma_t;
-
- emission.x *= (sigma_t.x > 0.0f) ? (1.0f - transmittance.x) / sigma_t.x : t;
- emission.y *= (sigma_t.y > 0.0f) ? (1.0f - transmittance.y) / sigma_t.y : t;
- emission.z *= (sigma_t.z > 0.0f) ? (1.0f - transmittance.z) / sigma_t.z : t;
- }
- else
- emission *= t;
-
- return emission;
-}
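
Editorial note: written out, the integral referenced in the comment above is the analytic result integral_0^t E * exp(-sigma_t * s) ds = E * (1 - exp(-sigma_t * t)) / sigma_t, which tends to E * t as sigma_t goes to zero. A one-channel numeric check, illustrative only and not Cycles code:

  #include <cmath>
  #include <cstdio>

  /* Emission integrated against extinction over [0, t]; falls back to E * t
   * when sigma_t is zero, matching the limit noted above. */
  static float emission_integral(float E, float sigma_t, float t)
  {
    return (sigma_t > 0.0f) ? E * (1.0f - expf(-sigma_t * t)) / sigma_t : E * t;
  }

  int main()
  {
    printf("%f %f\n", emission_integral(3.0f, 0.5f, 2.0f), emission_integral(3.0f, 0.0f, 2.0f));
  }
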
-
-/* Volume Path */
-
-ccl_device int kernel_volume_sample_channel(float3 albedo,
- float3 throughput,
- float rand,
- float3 *pdf)
-{
- /* Sample color channel proportional to throughput and single scattering
-   * albedo, to significantly reduce noise with many bounces, following:
- *
- * "Practical and Controllable Subsurface Scattering for Production Path
- * Tracing". Matt Jen-Yuan Chiang, Peter Kutz, Brent Burley. SIGGRAPH 2016. */
- float3 weights = fabs(throughput * albedo);
- float sum_weights = weights.x + weights.y + weights.z;
- float3 weights_pdf;
-
- if (sum_weights > 0.0f) {
- weights_pdf = weights / sum_weights;
- }
- else {
- weights_pdf = make_float3(1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f);
- }
-
- *pdf = weights_pdf;
-
- /* OpenCL does not support -> on float3, so don't use pdf->x. */
- if (rand < weights_pdf.x) {
- return 0;
- }
- else if (rand < weights_pdf.x + weights_pdf.y) {
- return 1;
- }
- else {
- return 2;
- }
-}
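
Editorial note: to make the channel selection above concrete, weights proportional to |throughput * albedo| are normalized into a per-channel pdf and a single uniform number picks R, G or B. A scalar-array sketch under those assumptions (not the Cycles float3 types):

  // Single-sample (balance heuristic) channel selection, as in
  // kernel_volume_sample_channel() above. Illustrative only.
  #include <cmath>
  #include <cstdio>

  static int sample_channel(const float throughput[3], const float albedo[3], float rand, float pdf[3])
  {
    float w[3], sum = 0.0f;
    for (int i = 0; i < 3; i++) {
      w[i] = fabsf(throughput[i] * albedo[i]);
      sum += w[i];
    }
    for (int i = 0; i < 3; i++) {
      pdf[i] = (sum > 0.0f) ? w[i] / sum : 1.0f / 3.0f; /* uniform fallback */
    }
    if (rand < pdf[0]) return 0;
    if (rand < pdf[0] + pdf[1]) return 1;
    return 2;
  }

  int main()
  {
    const float tp[3] = {0.8f, 0.5f, 0.1f}, albedo[3] = {0.9f, 0.9f, 0.2f};
    float pdf[3];
    printf("channel %d\n", sample_channel(tp, albedo, 0.6f, pdf));
  }
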
-
-#ifdef __VOLUME__
-
-/* homogeneous volume: assume shader evaluation at the start gives
- * the volume shading coefficient for the entire line segment */
-ccl_device VolumeIntegrateResult
-kernel_volume_integrate_homogeneous(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- Ray *ray,
- ShaderData *sd,
- PathRadiance *L,
- ccl_addr_space float3 *throughput,
- bool probalistic_scatter)
-{
- VolumeShaderCoefficients coeff ccl_optional_struct_init;
-
- if (!volume_shader_sample(kg, sd, state, ray->P, &coeff))
- return VOLUME_PATH_MISSED;
-
- int closure_flag = sd->flag;
- float t = ray->t;
- float3 new_tp;
-
-# ifdef __VOLUME_SCATTER__
- /* randomly scatter, and if we do t is shortened */
- if (closure_flag & SD_SCATTER) {
- /* Sample channel, use MIS with balance heuristic. */
- float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
- float3 albedo = safe_divide_color(coeff.sigma_s, coeff.sigma_t);
- float3 channel_pdf;
- int channel = kernel_volume_sample_channel(albedo, *throughput, rphase, &channel_pdf);
-
- /* decide if we will hit or miss */
- bool scatter = true;
- float xi = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
-
- if (probalistic_scatter) {
- float sample_sigma_t = kernel_volume_channel_get(coeff.sigma_t, channel);
- float sample_transmittance = expf(-sample_sigma_t * t);
-
- if (1.0f - xi >= sample_transmittance) {
- scatter = true;
-
- /* rescale random number so we can reuse it */
- xi = 1.0f - (1.0f - xi - sample_transmittance) / (1.0f - sample_transmittance);
- }
- else
- scatter = false;
- }
-
- if (scatter) {
- /* scattering */
- float3 pdf;
- float3 transmittance;
- float sample_t;
-
- /* distance sampling */
- sample_t = kernel_volume_distance_sample(
- ray->t, coeff.sigma_t, channel, xi, &transmittance, &pdf);
-
- /* modify pdf for hit/miss decision */
- if (probalistic_scatter)
- pdf *= one_float3() - volume_color_transmittance(coeff.sigma_t, t);
-
- new_tp = *throughput * coeff.sigma_s * transmittance / dot(channel_pdf, pdf);
- t = sample_t;
- }
- else {
- /* no scattering */
- float3 transmittance = volume_color_transmittance(coeff.sigma_t, t);
- float pdf = dot(channel_pdf, transmittance);
- new_tp = *throughput * transmittance / pdf;
- }
- }
- else
-# endif
- if (closure_flag & SD_EXTINCTION) {
- /* absorption only, no sampling needed */
- float3 transmittance = volume_color_transmittance(coeff.sigma_t, t);
- new_tp = *throughput * transmittance;
- }
- else {
- new_tp = *throughput;
- }
-
- /* integrate emission attenuated by extinction */
- if (L && (closure_flag & SD_EMISSION)) {
- float3 transmittance = volume_color_transmittance(coeff.sigma_t, ray->t);
- float3 emission = kernel_volume_emission_integrate(
- &coeff, closure_flag, transmittance, ray->t);
- path_radiance_accum_emission(kg, L, state, *throughput, emission);
- }
-
- /* modify throughput */
- if (closure_flag & SD_EXTINCTION) {
- *throughput = new_tp;
-
- /* prepare to scatter to new direction */
- if (t < ray->t) {
- /* adjust throughput and move to new location */
- sd->P = ray->P + t * ray->D;
-
- return VOLUME_PATH_SCATTERED;
- }
- }
-
- return VOLUME_PATH_ATTENUATED;
-}
-
-/* heterogeneous volume distance sampling: integrate stepping through the
- * volume until we reach the end, get absorbed entirely, or run out of
- * iterations. This probabilistically scatters or transmits through the volume,
- * for path tracing where we don't want to branch. */
-ccl_device VolumeIntegrateResult
-kernel_volume_integrate_heterogeneous_distance(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- Ray *ray,
- ShaderData *sd,
- PathRadiance *L,
- ccl_addr_space float3 *throughput,
- const float object_step_size)
-{
- float3 tp = *throughput;
-
- /* Prepare for stepping.
- * Using a different step offset for the first step avoids banding artifacts. */
- int max_steps = kernel_data.integrator.volume_max_steps;
- float step_size, step_shade_offset, steps_offset;
- kernel_volume_step_init(
- kg, state, object_step_size, ray->t, &step_size, &step_shade_offset, &steps_offset);
-
- /* compute coefficients at the start */
- float t = 0.0f;
- float3 accum_transmittance = one_float3();
-
- /* pick random color channel, we use the Veach one-sample
- * model with balance heuristic for the channels */
- float xi = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
- float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
- bool has_scatter = false;
-
- for (int i = 0; i < max_steps; i++) {
- /* advance to new position */
- float new_t = min(ray->t, (i + steps_offset) * step_size);
- float dt = new_t - t;
-
- float3 new_P = ray->P + ray->D * (t + dt * step_shade_offset);
- VolumeShaderCoefficients coeff ccl_optional_struct_init;
-
- /* compute segment */
- if (volume_shader_sample(kg, sd, state, new_P, &coeff)) {
- int closure_flag = sd->flag;
- float3 new_tp;
- float3 transmittance;
- bool scatter = false;
-
- /* distance sampling */
-# ifdef __VOLUME_SCATTER__
- if ((closure_flag & SD_SCATTER) || (has_scatter && (closure_flag & SD_EXTINCTION))) {
- has_scatter = true;
-
- /* Sample channel, use MIS with balance heuristic. */
- float3 albedo = safe_divide_color(coeff.sigma_s, coeff.sigma_t);
- float3 channel_pdf;
- int channel = kernel_volume_sample_channel(albedo, tp, rphase, &channel_pdf);
-
- /* compute transmittance over full step */
- transmittance = volume_color_transmittance(coeff.sigma_t, dt);
-
- /* decide if we will scatter or continue */
- float sample_transmittance = kernel_volume_channel_get(transmittance, channel);
-
- if (1.0f - xi >= sample_transmittance) {
- /* compute sampling distance */
- float sample_sigma_t = kernel_volume_channel_get(coeff.sigma_t, channel);
- float new_dt = -logf(1.0f - xi) / sample_sigma_t;
- new_t = t + new_dt;
-
- /* transmittance and pdf */
- float3 new_transmittance = volume_color_transmittance(coeff.sigma_t, new_dt);
- float3 pdf = coeff.sigma_t * new_transmittance;
-
- /* throughput */
- new_tp = tp * coeff.sigma_s * new_transmittance / dot(channel_pdf, pdf);
- scatter = true;
- }
- else {
- /* throughput */
- float pdf = dot(channel_pdf, transmittance);
- new_tp = tp * transmittance / pdf;
-
-        /* remap xi so we can reuse it and keep things stratified */
- xi = 1.0f - (1.0f - xi) / sample_transmittance;
- }
- }
- else
-# endif
- if (closure_flag & SD_EXTINCTION) {
- /* absorption only, no sampling needed */
- transmittance = volume_color_transmittance(coeff.sigma_t, dt);
- new_tp = tp * transmittance;
- }
- else {
- transmittance = zero_float3();
- new_tp = tp;
- }
-
- /* integrate emission attenuated by absorption */
- if (L && (closure_flag & SD_EMISSION)) {
- float3 emission = kernel_volume_emission_integrate(
- &coeff, closure_flag, transmittance, dt);
- path_radiance_accum_emission(kg, L, state, tp, emission);
- }
-
- /* modify throughput */
- if (closure_flag & SD_EXTINCTION) {
- tp = new_tp;
-
- /* stop if nearly all light blocked */
- if (tp.x < VOLUME_THROUGHPUT_EPSILON && tp.y < VOLUME_THROUGHPUT_EPSILON &&
- tp.z < VOLUME_THROUGHPUT_EPSILON) {
- tp = zero_float3();
- break;
- }
- }
-
- /* prepare to scatter to new direction */
- if (scatter) {
- /* adjust throughput and move to new location */
- sd->P = ray->P + new_t * ray->D;
- *throughput = tp;
-
- return VOLUME_PATH_SCATTERED;
- }
- else {
- /* accumulate transmittance */
- accum_transmittance *= transmittance;
- }
- }
-
- /* stop if at the end of the volume */
- t = new_t;
- if (t == ray->t)
- break;
- }
-
- *throughput = tp;
-
- return VOLUME_PATH_ATTENUATED;
-}
-
-/* get the volume attenuation and emission over line segment defined by
- * ray, with the assumption that there are no surfaces blocking light
- * between the endpoints. distance sampling is used to decide if we will
- * scatter or not. */
-ccl_device_noinline_cpu VolumeIntegrateResult
-kernel_volume_integrate(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- ShaderData *sd,
- Ray *ray,
- PathRadiance *L,
- ccl_addr_space float3 *throughput,
- float step_size)
-{
- shader_setup_from_volume(kg, sd, ray);
-
- if (step_size != FLT_MAX)
- return kernel_volume_integrate_heterogeneous_distance(
- kg, state, ray, sd, L, throughput, step_size);
- else
- return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, true);
-}
-
-# ifndef __SPLIT_KERNEL__
-/* Decoupled Volume Sampling
- *
- * VolumeSegment is list of coefficients and transmittance stored at all steps
- * through a volume. This can then later be used for decoupled sampling as in:
- * "Importance Sampling Techniques for Path Tracing in Participating Media"
- *
- * On the GPU this is only supported (but currently not enabled)
- * for homogeneous volumes (1 step), due to
- * no support for malloc/free and too much stack usage with a fixed-size array. */
-
-typedef struct VolumeStep {
- float3 sigma_s; /* scatter coefficient */
- float3 sigma_t; /* extinction coefficient */
- float3 accum_transmittance; /* accumulated transmittance including this step */
- float3 cdf_distance; /* cumulative density function for distance sampling */
- float t; /* distance at end of this step */
- float shade_t; /* jittered distance where shading was done in step */
- int closure_flag; /* shader evaluation closure flags */
-} VolumeStep;
-
-typedef struct VolumeSegment {
- VolumeStep stack_step; /* stack storage for homogeneous step, to avoid malloc */
- VolumeStep *steps; /* recorded steps */
- int numsteps; /* number of steps */
- int closure_flag; /* accumulated closure flags from all steps */
-
- float3 accum_emission; /* accumulated emission at end of segment */
- float3 accum_transmittance; /* accumulated transmittance at end of segment */
- float3 accum_albedo; /* accumulated average albedo over segment */
-
- int sampling_method; /* volume sampling method */
-} VolumeSegment;
-
-/* record volume steps to the end of the volume.
- *
- * it would be nice if we could only record up to the point that we need to scatter,
- * but the entire segment is needed when we always scatter, rather than probabilistically
- * hitting or missing the volume. If we don't know the transmittance at the end of the
- * volume, we can't generate stratified distance samples up to that transmittance. */
-# ifdef __VOLUME_DECOUPLED__
-ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg,
- PathState *state,
- Ray *ray,
- ShaderData *sd,
- VolumeSegment *segment,
- const float object_step_size)
-{
- /* prepare for volume stepping */
- int max_steps;
- float step_size, step_shade_offset, steps_offset;
-
- if (object_step_size != FLT_MAX) {
- max_steps = kernel_data.integrator.volume_max_steps;
- kernel_volume_step_init(
- kg, state, object_step_size, ray->t, &step_size, &step_shade_offset, &steps_offset);
-
-# ifdef __KERNEL_CPU__
- /* NOTE: For the branched path tracing it's possible to have direct
- * and indirect light integration both having volume segments allocated.
- * We detect this using index in the pre-allocated memory. Currently we
- * only support two segments allocated at a time, if more needed some
- * modifications to the KernelGlobals will be needed.
- *
-   * This restricts decoupled record to stack-like usage: a subsequent call
-   * to decoupled record needs to free its memory before its caller frees
-   * memory.
- */
- const int index = kg->decoupled_volume_steps_index;
- assert(index < sizeof(kg->decoupled_volume_steps) / sizeof(*kg->decoupled_volume_steps));
- if (kg->decoupled_volume_steps[index] == NULL) {
- kg->decoupled_volume_steps[index] = (VolumeStep *)malloc(sizeof(VolumeStep) * max_steps);
- }
- segment->steps = kg->decoupled_volume_steps[index];
- ++kg->decoupled_volume_steps_index;
-# else
- segment->steps = (VolumeStep *)malloc(sizeof(VolumeStep) * max_steps);
-# endif
- }
- else {
- max_steps = 1;
- step_size = ray->t;
- step_shade_offset = 0.0f;
- steps_offset = 1.0f;
- segment->steps = &segment->stack_step;
- }
-
- /* init accumulation variables */
- float3 accum_emission = zero_float3();
- float3 accum_transmittance = one_float3();
- float3 accum_albedo = zero_float3();
- float3 cdf_distance = zero_float3();
- float t = 0.0f;
-
- segment->numsteps = 0;
- segment->closure_flag = 0;
- bool is_last_step_empty = false;
-
- VolumeStep *step = segment->steps;
-
- for (int i = 0; i < max_steps; i++, step++) {
- /* advance to new position */
- float new_t = min(ray->t, (i + steps_offset) * step_size);
- float dt = new_t - t;
-
- float3 new_P = ray->P + ray->D * (t + dt * step_shade_offset);
- VolumeShaderCoefficients coeff ccl_optional_struct_init;
-
- /* compute segment */
- if (volume_shader_sample(kg, sd, state, new_P, &coeff)) {
- int closure_flag = sd->flag;
- float3 sigma_t = coeff.sigma_t;
-
- /* compute average albedo for channel sampling */
- if (closure_flag & SD_SCATTER) {
- accum_albedo += (dt / ray->t) * safe_divide_color(coeff.sigma_s, sigma_t);
- }
-
- /* compute accumulated transmittance */
- float3 transmittance = volume_color_transmittance(sigma_t, dt);
-
- /* compute emission attenuated by absorption */
- if (closure_flag & SD_EMISSION) {
- float3 emission = kernel_volume_emission_integrate(
- &coeff, closure_flag, transmittance, dt);
- accum_emission += accum_transmittance * emission;
- }
-
- accum_transmittance *= transmittance;
-
- /* compute pdf for distance sampling */
- float3 pdf_distance = dt * accum_transmittance * coeff.sigma_s;
- cdf_distance = cdf_distance + pdf_distance;
-
- /* write step data */
- step->sigma_t = sigma_t;
- step->sigma_s = coeff.sigma_s;
- step->closure_flag = closure_flag;
-
- segment->closure_flag |= closure_flag;
-
- is_last_step_empty = false;
- segment->numsteps++;
- }
- else {
- if (is_last_step_empty) {
- /* consecutive empty step, merge */
- step--;
- }
- else {
- /* store empty step */
- step->sigma_t = zero_float3();
- step->sigma_s = zero_float3();
- step->closure_flag = 0;
-
- segment->numsteps++;
- is_last_step_empty = true;
- }
- }
-
- step->accum_transmittance = accum_transmittance;
- step->cdf_distance = cdf_distance;
- step->t = new_t;
- step->shade_t = t + dt * step_shade_offset;
-
- /* stop if at the end of the volume */
- t = new_t;
- if (t == ray->t)
- break;
-
- /* stop if nearly all light blocked */
- if (accum_transmittance.x < VOLUME_THROUGHPUT_EPSILON &&
- accum_transmittance.y < VOLUME_THROUGHPUT_EPSILON &&
- accum_transmittance.z < VOLUME_THROUGHPUT_EPSILON)
- break;
- }
-
- /* store total emission and transmittance */
- segment->accum_emission = accum_emission;
- segment->accum_transmittance = accum_transmittance;
- segment->accum_albedo = accum_albedo;
-
- /* normalize cumulative density function for distance sampling */
- VolumeStep *last_step = segment->steps + segment->numsteps - 1;
-
- if (!is_zero(last_step->cdf_distance)) {
- VolumeStep *step = &segment->steps[0];
- int numsteps = segment->numsteps;
- float3 inv_cdf_distance_sum = safe_invert_color(last_step->cdf_distance);
-
- for (int i = 0; i < numsteps; i++, step++)
- step->cdf_distance *= inv_cdf_distance_sum;
- }
-}
-
-ccl_device void kernel_volume_decoupled_free(KernelGlobals *kg, VolumeSegment *segment)
-{
- if (segment->steps != &segment->stack_step) {
-# ifdef __KERNEL_CPU__
-    /* NOTE: We only allow freeing the last allocated segment.
-     * No arbitrary order of alloc/free is supported.
- */
- assert(kg->decoupled_volume_steps_index > 0);
- assert(segment->steps == kg->decoupled_volume_steps[kg->decoupled_volume_steps_index - 1]);
- --kg->decoupled_volume_steps_index;
-# else
- free(segment->steps);
-# endif
- }
-}
-# endif /* __VOLUME_DECOUPLED__ */
-
-/* scattering for homogeneous and heterogeneous volumes, using decoupled ray
- * marching.
- *
- * The function is expected to return VOLUME_PATH_SCATTERED when probalistic_scatter is false. */
-ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(KernelGlobals *kg,
- PathState *state,
- Ray *ray,
- ShaderData *sd,
- float3 *throughput,
- float rphase,
- float rscatter,
- const VolumeSegment *segment,
- const float3 *light_P,
- bool probalistic_scatter)
-{
- kernel_assert(segment->closure_flag & SD_SCATTER);
-
- /* Sample color channel, use MIS with balance heuristic. */
- float3 channel_pdf;
- int channel = kernel_volume_sample_channel(
- segment->accum_albedo, *throughput, rphase, &channel_pdf);
-
- float xi = rscatter;
-
- /* probabilistic scattering decision based on transmittance */
- if (probalistic_scatter) {
- float sample_transmittance = kernel_volume_channel_get(segment->accum_transmittance, channel);
-
- if (1.0f - xi >= sample_transmittance) {
- /* rescale random number so we can reuse it */
- xi = 1.0f - (1.0f - xi - sample_transmittance) / (1.0f - sample_transmittance);
- }
- else {
- *throughput /= sample_transmittance;
- return VOLUME_PATH_MISSED;
- }
- }
-
- VolumeStep *step;
- float3 transmittance;
- float pdf, sample_t;
- float mis_weight = 1.0f;
- bool distance_sample = true;
- bool use_mis = false;
-
- if (segment->sampling_method && light_P) {
- if (segment->sampling_method == SD_VOLUME_MIS) {
- /* multiple importance sample: randomly pick between
- * equiangular and distance sampling strategy */
- if (xi < 0.5f) {
- xi *= 2.0f;
- }
- else {
- xi = (xi - 0.5f) * 2.0f;
- distance_sample = false;
- }
-
- use_mis = true;
- }
- else {
- /* only equiangular sampling */
- distance_sample = false;
- }
- }
-
- /* distance sampling */
- if (distance_sample) {
- /* find step in cdf */
- step = segment->steps;
-
- float prev_t = 0.0f;
- float3 step_pdf_distance = one_float3();
-
- if (segment->numsteps > 1) {
- float prev_cdf = 0.0f;
- float step_cdf = 1.0f;
- float3 prev_cdf_distance = zero_float3();
-
- for (int i = 0;; i++, step++) {
- /* todo: optimize using binary search */
- step_cdf = kernel_volume_channel_get(step->cdf_distance, channel);
-
- if (xi < step_cdf || i == segment->numsteps - 1)
- break;
-
- prev_cdf = step_cdf;
- prev_t = step->t;
- prev_cdf_distance = step->cdf_distance;
- }
-
- /* remap xi so we can reuse it */
- xi = (xi - prev_cdf) / (step_cdf - prev_cdf);
-
- /* pdf for picking step */
- step_pdf_distance = step->cdf_distance - prev_cdf_distance;
- }
-
- /* determine range in which we will sample */
- float step_t = step->t - prev_t;
-
- /* sample distance and compute transmittance */
- float3 distance_pdf;
- sample_t = prev_t + kernel_volume_distance_sample(
- step_t, step->sigma_t, channel, xi, &transmittance, &distance_pdf);
-
- /* modify pdf for hit/miss decision */
- if (probalistic_scatter)
- distance_pdf *= one_float3() - segment->accum_transmittance;
-
- pdf = dot(channel_pdf, distance_pdf * step_pdf_distance);
-
- /* multiple importance sampling */
- if (use_mis) {
- float equi_pdf = kernel_volume_equiangular_pdf(ray, *light_P, sample_t);
- mis_weight = 2.0f * power_heuristic(pdf, equi_pdf);
- }
- }
- /* equi-angular sampling */
- else {
- /* sample distance */
- sample_t = kernel_volume_equiangular_sample(ray, *light_P, xi, &pdf);
-
- /* find step in which sampled distance is located */
- step = segment->steps;
-
- float prev_t = 0.0f;
- float3 step_pdf_distance = one_float3();
-
- if (segment->numsteps > 1) {
- float3 prev_cdf_distance = zero_float3();
-
- int numsteps = segment->numsteps;
- int high = numsteps - 1;
- int low = 0;
- int mid;
-
- while (low < high) {
- mid = (low + high) >> 1;
-
- if (sample_t < step[mid].t)
- high = mid;
- else if (sample_t >= step[mid + 1].t)
- low = mid + 1;
- else {
- /* found our interval in step[mid] .. step[mid+1] */
- prev_t = step[mid].t;
- prev_cdf_distance = step[mid].cdf_distance;
- step += mid + 1;
- break;
- }
- }
-
- if (low >= numsteps - 1) {
- prev_t = step[numsteps - 1].t;
- prev_cdf_distance = step[numsteps - 1].cdf_distance;
- step += numsteps - 1;
- }
-
- /* pdf for picking step with distance sampling */
- step_pdf_distance = step->cdf_distance - prev_cdf_distance;
- }
-
- /* determine range in which we will sample */
- float step_t = step->t - prev_t;
- float step_sample_t = sample_t - prev_t;
-
- /* compute transmittance */
- transmittance = volume_color_transmittance(step->sigma_t, step_sample_t);
-
- /* multiple importance sampling */
- if (use_mis) {
- float3 distance_pdf3 = kernel_volume_distance_pdf(step_t, step->sigma_t, step_sample_t);
- float distance_pdf = dot(channel_pdf, distance_pdf3 * step_pdf_distance);
- mis_weight = 2.0f * power_heuristic(pdf, distance_pdf);
- }
- }
- if (sample_t < 0.0f || pdf == 0.0f) {
- return VOLUME_PATH_MISSED;
- }
-
- /* compute transmittance up to this step */
- if (step != segment->steps)
- transmittance *= (step - 1)->accum_transmittance;
-
- /* modify throughput */
- *throughput *= step->sigma_s * transmittance * (mis_weight / pdf);
-
- /* evaluate shader to create closures at shading point */
- if (segment->numsteps > 1) {
- sd->P = ray->P + step->shade_t * ray->D;
-
- VolumeShaderCoefficients coeff;
- volume_shader_sample(kg, sd, state, sd->P, &coeff);
- }
-
- /* move to new position */
- sd->P = ray->P + sample_t * ray->D;
-
- return VOLUME_PATH_SCATTERED;
-}
-# endif /* __SPLIT_KERNEL__ */
-
-/* decide if we need to use decoupled or not */
-ccl_device bool kernel_volume_use_decoupled(KernelGlobals *kg,
- bool heterogeneous,
- bool direct,
- int sampling_method)
-{
- /* decoupled ray marching for heterogeneous volumes not supported on the GPU,
-   * which also means equiangular and multiple importance sampling are not
-   * supported for that case */
- if (!kernel_data.integrator.volume_decoupled)
- return false;
-
-# ifdef __KERNEL_GPU__
- if (heterogeneous)
- return false;
-# endif
-
- /* equiangular and multiple importance sampling only implemented for decoupled */
- if (sampling_method != 0)
- return true;
-
- /* for all light sampling use decoupled, reusing shader evaluations is
- * typically faster in that case */
- if (direct)
- return kernel_data.integrator.sample_all_lights_direct;
- else
- return kernel_data.integrator.sample_all_lights_indirect;
-}
-
-/* Volume Stack
- *
- * This is an array of object/shader IDs that the current segment of the path
- * is inside of. */
-
-ccl_device void kernel_volume_stack_init(KernelGlobals *kg,
- ShaderData *stack_sd,
- ccl_addr_space const PathState *state,
- ccl_addr_space const Ray *ray,
- ccl_addr_space VolumeStack *stack)
-{
-  /* A NULL ray happens in the baker; does it need proper initialization of
-   * the camera in volume?
- */
- if (!kernel_data.cam.is_inside_volume || ray == NULL) {
- /* Camera is guaranteed to be in the air, only take background volume
- * into account in this case.
- */
- if (kernel_data.background.volume_shader != SHADER_NONE) {
- stack[0].shader = kernel_data.background.volume_shader;
- stack[0].object = PRIM_NONE;
- stack[1].shader = SHADER_NONE;
- }
- else {
- stack[0].shader = SHADER_NONE;
- }
- return;
- }
-
- kernel_assert(state->flag & PATH_RAY_CAMERA);
-
- Ray volume_ray = *ray;
- volume_ray.t = FLT_MAX;
-
- const uint visibility = (state->flag & PATH_RAY_ALL_VISIBILITY);
- int stack_index = 0, enclosed_index = 0;
-
-# ifdef __VOLUME_RECORD_ALL__
- Intersection hits[2 * VOLUME_STACK_SIZE + 1];
- uint num_hits = scene_intersect_volume_all(
- kg, &volume_ray, hits, 2 * VOLUME_STACK_SIZE, visibility);
- if (num_hits > 0) {
- int enclosed_volumes[VOLUME_STACK_SIZE];
- Intersection *isect = hits;
-
- qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
-
- for (uint hit = 0; hit < num_hits; ++hit, ++isect) {
- shader_setup_from_ray(kg, stack_sd, isect, &volume_ray);
- if (stack_sd->flag & SD_BACKFACING) {
- bool need_add = true;
- for (int i = 0; i < enclosed_index && need_add; ++i) {
-          /* If the ray exited a volume it never entered, it means the camera
-           * is inside that volume.
- */
- if (enclosed_volumes[i] == stack_sd->object) {
- need_add = false;
- }
- }
- for (int i = 0; i < stack_index && need_add; ++i) {
- /* Don't add intersections twice. */
- if (stack[i].object == stack_sd->object) {
- need_add = false;
- break;
- }
- }
- if (need_add && stack_index < VOLUME_STACK_SIZE - 1) {
- stack[stack_index].object = stack_sd->object;
- stack[stack_index].shader = stack_sd->shader;
- ++stack_index;
- }
- }
- else {
- /* If ray from camera enters the volume, this volume shouldn't
- * be added to the stack on exit.
- */
- enclosed_volumes[enclosed_index++] = stack_sd->object;
- }
- }
- }
-# else
- int enclosed_volumes[VOLUME_STACK_SIZE];
- int step = 0;
-
- while (stack_index < VOLUME_STACK_SIZE - 1 && enclosed_index < VOLUME_STACK_SIZE - 1 &&
- step < 2 * VOLUME_STACK_SIZE) {
- Intersection isect;
- if (!scene_intersect_volume(kg, &volume_ray, &isect, visibility)) {
- break;
- }
-
- shader_setup_from_ray(kg, stack_sd, &isect, &volume_ray);
- if (stack_sd->flag & SD_BACKFACING) {
-      /* If the ray exited a volume it never entered, it means the camera
-       * is inside that volume.
- */
- bool need_add = true;
- for (int i = 0; i < enclosed_index && need_add; ++i) {
-        /* If the ray exited a volume it never entered, it means the camera
-         * is inside that volume.
- */
- if (enclosed_volumes[i] == stack_sd->object) {
- need_add = false;
- }
- }
- for (int i = 0; i < stack_index && need_add; ++i) {
- /* Don't add intersections twice. */
- if (stack[i].object == stack_sd->object) {
- need_add = false;
- break;
- }
- }
- if (need_add) {
- stack[stack_index].object = stack_sd->object;
- stack[stack_index].shader = stack_sd->shader;
- ++stack_index;
- }
- }
- else {
- /* If ray from camera enters the volume, this volume shouldn't
- * be added to the stack on exit.
- */
- enclosed_volumes[enclosed_index++] = stack_sd->object;
- }
-
- /* Move ray forward. */
- volume_ray.P = ray_offset(stack_sd->P, -stack_sd->Ng);
- ++step;
- }
-# endif
-  /* A stack_index of 0 means the quick checks outside of the kernel gave a
-   * false positive; nothing to worry about, we have just wasted a few ticks
-   * concluding that the camera is in the air.
-   *
-   * In this case we do the same as above -- check whether the background has
-   * a volume.
-   */
- if (stack_index == 0 && kernel_data.background.volume_shader == SHADER_NONE) {
- stack[0].shader = kernel_data.background.volume_shader;
- stack[0].object = OBJECT_NONE;
- stack[1].shader = SHADER_NONE;
- }
- else {
- stack[stack_index].shader = SHADER_NONE;
- }
-}
-
-ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space VolumeStack *stack)
-{
- /* todo: we should have some way for objects to indicate if they want the
- * world shader to work inside them. excluding it by default is problematic
- * because non-volume objects can't be assumed to be closed manifolds */
-
- if (!(sd->flag & SD_HAS_VOLUME))
- return;
-
- if (sd->flag & SD_BACKFACING) {
- /* exit volume object: remove from stack */
- for (int i = 0; stack[i].shader != SHADER_NONE; i++) {
- if (stack[i].object == sd->object) {
- /* shift back next stack entries */
- do {
- stack[i] = stack[i + 1];
- i++;
- } while (stack[i].shader != SHADER_NONE);
-
- return;
- }
- }
- }
- else {
- /* enter volume object: add to stack */
- int i;
-
- for (i = 0; stack[i].shader != SHADER_NONE; i++) {
- /* already in the stack? then we have nothing to do */
- if (stack[i].object == sd->object)
- return;
- }
-
- /* if we exceed the stack limit, ignore */
- if (i >= VOLUME_STACK_SIZE - 1)
- return;
-
- /* add to the end of the stack */
- stack[i].shader = sd->shader;
- stack[i].object = sd->object;
- stack[i + 1].shader = SHADER_NONE;
- }
-}
-
-# ifdef __SUBSURFACE__
-ccl_device void kernel_volume_stack_update_for_subsurface(KernelGlobals *kg,
- ShaderData *stack_sd,
- Ray *ray,
- ccl_addr_space VolumeStack *stack)
-{
- kernel_assert(kernel_data.integrator.use_volumes);
-
- Ray volume_ray = *ray;
-
-# ifdef __VOLUME_RECORD_ALL__
- Intersection hits[2 * VOLUME_STACK_SIZE + 1];
- uint num_hits = scene_intersect_volume_all(
- kg, &volume_ray, hits, 2 * VOLUME_STACK_SIZE, PATH_RAY_ALL_VISIBILITY);
- if (num_hits > 0) {
- Intersection *isect = hits;
-
- qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
-
- for (uint hit = 0; hit < num_hits; ++hit, ++isect) {
- shader_setup_from_ray(kg, stack_sd, isect, &volume_ray);
- kernel_volume_stack_enter_exit(kg, stack_sd, stack);
- }
- }
-# else
- Intersection isect;
- int step = 0;
- float3 Pend = ray->P + ray->D * ray->t;
- while (step < 2 * VOLUME_STACK_SIZE &&
- scene_intersect_volume(kg, &volume_ray, &isect, PATH_RAY_ALL_VISIBILITY)) {
- shader_setup_from_ray(kg, stack_sd, &isect, &volume_ray);
- kernel_volume_stack_enter_exit(kg, stack_sd, stack);
-
- /* Move ray forward. */
- volume_ray.P = ray_offset(stack_sd->P, -stack_sd->Ng);
- if (volume_ray.t != FLT_MAX) {
- volume_ray.D = normalize_len(Pend - volume_ray.P, &volume_ray.t);
- }
- ++step;
- }
-# endif
-}
-# endif
-
-/* Clean stack after the last bounce.
- *
- * It is expected that all volumes are closed manifolds, so at the time when ray
- * hits nothing (for example, it is a last bounce which goes to environment) the
- * only expected volume in the stack is the world's one. All the rest volume
- * entries should have been exited already.
- *
- * This isn't always true because of ray intersection precision issues, which
- * could lead us to an infinite non-world volume in the stack, causing render
- * artifacts.
- *
- * Use this function after the last bounce to get rid of all volumes apart from
- * the world's one, to avoid render artifacts.
- */
-ccl_device_inline void kernel_volume_clean_stack(KernelGlobals *kg,
- ccl_addr_space VolumeStack *volume_stack)
-{
- if (kernel_data.background.volume_shader != SHADER_NONE) {
- /* Keep the world's volume in stack. */
- volume_stack[1].shader = SHADER_NONE;
- }
- else {
- volume_stack[0].shader = SHADER_NONE;
- }
-}
-
-#endif /* __VOLUME__ */
-
-CCL_NAMESPACE_END
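
Editorial note: one detail of the deleted shadow ray-march above is easy to miss: it accumulates -sigma_t * dt and only exponentiates every few steps, relying on exp(a) * exp(b) = exp(a + b), then terminates early once nearly all light is blocked. A scalar sketch of that optimization, illustrative only and not the replacement integrator:

  #include <cmath>
  #include <cstdio>

  /* Ray-march transmittance: accumulate optical depth, defer exp(), stop early
   * once the throughput drops below an epsilon. Mirrors the structure of the
   * deleted kernel_volume_shadow_heterogeneous(). */
  static float shadow_transmittance(const float *sigma_t, int num_steps, float dt)
  {
    const float epsilon = 1e-6f;
    float optical_depth = 0.0f, tp = 1.0f;
    for (int i = 0; i < num_steps; i++) {
      optical_depth += sigma_t[i] * dt; /* exp() deferred */
      if ((i & 0x07) == 0) {            /* exponentiate every 8th step */
        tp = expf(-optical_depth);
        if (tp < epsilon) {
          return 0.0f;                  /* nearly fully blocked */
        }
      }
    }
    return expf(-optical_depth);
  }

  int main()
  {
    const float sigma[8] = {0.1f, 0.2f, 0.4f, 0.3f, 0.0f, 0.5f, 0.2f, 0.1f};
    printf("T = %f\n", shadow_transmittance(sigma, 8, 0.25f));
  }
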
diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h
index d1602744f1d..fab0915c38e 100644
--- a/intern/cycles/kernel/kernel_work_stealing.h
+++ b/intern/cycles/kernel/kernel_work_stealing.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __KERNEL_WORK_STEALING_H__
-#define __KERNEL_WORK_STEALING_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -24,21 +23,24 @@ CCL_NAMESPACE_BEGIN
*/
/* Map global work index to tile, pixel X/Y and sample. */
-ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile,
+ccl_device_inline void get_work_pixel(ccl_global const KernelWorkTile *tile,
uint global_work_index,
ccl_private uint *x,
ccl_private uint *y,
ccl_private uint *sample)
{
-#ifdef __KERNEL_CUDA__
- /* Keeping threads for the same pixel together improves performance on CUDA. */
- uint sample_offset = global_work_index % tile->num_samples;
- uint pixel_offset = global_work_index / tile->num_samples;
-#else /* __KERNEL_CUDA__ */
+#if 0
+ /* Keep threads for the same sample together. */
uint tile_pixels = tile->w * tile->h;
uint sample_offset = global_work_index / tile_pixels;
uint pixel_offset = global_work_index - sample_offset * tile_pixels;
-#endif /* __KERNEL_CUDA__ */
+#else
+ /* Keeping threads for the same pixel together.
+ * Appears to improve performance by a few % on CUDA and OptiX. */
+ uint sample_offset = global_work_index % tile->num_samples;
+ uint pixel_offset = global_work_index / tile->num_samples;
+#endif
+
uint y_offset = pixel_offset / tile->w;
uint x_offset = pixel_offset - y_offset * tile->w;
@@ -47,71 +49,4 @@ ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile,
*sample = tile->start_sample + sample_offset;
}
-#ifdef __KERNEL_OPENCL__
-# pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
-#endif
-
-#ifdef __SPLIT_KERNEL__
-/* Returns true if there is work */
-ccl_device bool get_next_work_item(KernelGlobals *kg,
- ccl_global uint *work_pools,
- uint total_work_size,
- uint ray_index,
- ccl_private uint *global_work_index)
-{
- /* With a small amount of work there may be more threads than work due to
- * rounding up of global size, stop such threads immediately. */
- if (ray_index >= total_work_size) {
- return false;
- }
-
- /* Increase atomic work index counter in pool. */
- uint pool = ray_index / WORK_POOL_SIZE;
- uint work_index = atomic_fetch_and_inc_uint32(&work_pools[pool]);
-
- /* Map per-pool work index to a global work index. */
- uint global_size = ccl_global_size(0) * ccl_global_size(1);
- kernel_assert(global_size % WORK_POOL_SIZE == 0);
- kernel_assert(ray_index < global_size);
-
- *global_work_index = (work_index / WORK_POOL_SIZE) * global_size + (pool * WORK_POOL_SIZE) +
- (work_index % WORK_POOL_SIZE);
-
- /* Test if all work for this pool is done. */
- return (*global_work_index < total_work_size);
-}
-
-ccl_device bool get_next_work(KernelGlobals *kg,
- ccl_global uint *work_pools,
- uint total_work_size,
- uint ray_index,
- ccl_private uint *global_work_index)
-{
- bool got_work = false;
- if (kernel_data.film.pass_adaptive_aux_buffer) {
- do {
- got_work = get_next_work_item(kg, work_pools, total_work_size, ray_index, global_work_index);
- if (got_work) {
- ccl_global WorkTile *tile = &kernel_split_params.tile;
- uint x, y, sample;
- get_work_pixel(tile, *global_work_index, &x, &y, &sample);
- uint buffer_offset = (tile->offset + x + y * tile->stride) * kernel_data.film.pass_stride;
- ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
- ccl_global float4 *aux = (ccl_global float4 *)(buffer +
- kernel_data.film.pass_adaptive_aux_buffer);
- if ((*aux).w == 0.0f) {
- break;
- }
- }
- } while (got_work);
- }
- else {
- got_work = get_next_work_item(kg, work_pools, total_work_size, ray_index, global_work_index);
- }
- return got_work;
-}
-#endif
-
CCL_NAMESPACE_END
-
-#endif /* __KERNEL_WORK_STEALING_H__ */
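
Editorial note: the mapping in get_work_pixel() above is plain integer division and modulo over the tile; a host-side sketch of the "keep all samples of one pixel together" ordering chosen in the new code. The MiniTile type and harness are illustrative, not the kernel API:

  #include <cstdio>

  struct MiniTile {
    unsigned x, y, w, h;
    unsigned start_sample, num_samples;
  };

  /* Map a flat work index to (x, y, sample), grouping samples of a pixel. */
  static void work_pixel(const MiniTile &tile, unsigned index, unsigned *x, unsigned *y, unsigned *sample)
  {
    const unsigned sample_offset = index % tile.num_samples;
    const unsigned pixel_offset = index / tile.num_samples;
    const unsigned y_offset = pixel_offset / tile.w;
    const unsigned x_offset = pixel_offset - y_offset * tile.w;
    *x = tile.x + x_offset;
    *y = tile.y + y_offset;
    *sample = tile.start_sample + sample_offset;
  }

  int main()
  {
    const MiniTile tile = {0, 0, 4, 4, 0, 16};
    unsigned x, y, s;
    work_pixel(tile, 37, &x, &y, &s); /* index 37 -> pixel (2, 0), sample 5 */
    printf("x=%u y=%u sample=%u\n", x, y, s);
  }
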
diff --git a/intern/cycles/kernel/kernel_write_passes.h b/intern/cycles/kernel/kernel_write_passes.h
index 410218d91d4..9d379495629 100644
--- a/intern/cycles/kernel/kernel_write_passes.h
+++ b/intern/cycles/kernel/kernel_write_passes.h
@@ -14,23 +14,25 @@
* limitations under the License.
*/
-#if defined(__SPLIT_KERNEL__) || defined(__KERNEL_CUDA__)
+#pragma once
+
+#ifdef __KERNEL_GPU__
# define __ATOMIC_PASS_WRITE__
#endif
CCL_NAMESPACE_BEGIN
-ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, float value)
+ccl_device_inline void kernel_write_pass_float(ccl_global float *ccl_restrict buffer, float value)
{
- ccl_global float *buf = buffer;
#ifdef __ATOMIC_PASS_WRITE__
- atomic_add_and_fetch_float(buf, value);
+ atomic_add_and_fetch_float(buffer, value);
#else
- *buf += value;
+ *buffer += value;
#endif
}
-ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, float3 value)
+ccl_device_inline void kernel_write_pass_float3(ccl_global float *ccl_restrict buffer,
+ float3 value)
{
#ifdef __ATOMIC_PASS_WRITE__
ccl_global float *buf_x = buffer + 0;
@@ -41,12 +43,14 @@ ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, float3
atomic_add_and_fetch_float(buf_y, value.y);
atomic_add_and_fetch_float(buf_z, value.z);
#else
- ccl_global float3 *buf = (ccl_global float3 *)buffer;
- *buf += value;
+ buffer[0] += value.x;
+ buffer[1] += value.y;
+ buffer[2] += value.z;
#endif
}
-ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, float4 value)
+ccl_device_inline void kernel_write_pass_float4(ccl_global float *ccl_restrict buffer,
+ float4 value)
{
#ifdef __ATOMIC_PASS_WRITE__
ccl_global float *buf_x = buffer + 0;
@@ -59,37 +63,26 @@ ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, float4
atomic_add_and_fetch_float(buf_z, value.z);
atomic_add_and_fetch_float(buf_w, value.w);
#else
- ccl_global float4 *buf = (ccl_global float4 *)buffer;
- *buf += value;
+ buffer[0] += value.x;
+ buffer[1] += value.y;
+ buffer[2] += value.z;
+ buffer[3] += value.w;
#endif
}
-#ifdef __DENOISING_FEATURES__
-ccl_device_inline void kernel_write_pass_float_variance(ccl_global float *buffer, float value)
+ccl_device_inline float kernel_read_pass_float(ccl_global float *ccl_restrict buffer)
{
- kernel_write_pass_float(buffer, value);
-
- /* The online one-pass variance update that's used for the megakernel can't easily be implemented
- * with atomics, so for the split kernel the E[x^2] - 1/N * (E[x])^2 fallback is used. */
- kernel_write_pass_float(buffer + 1, value * value);
+ return *buffer;
}
-# ifdef __ATOMIC_PASS_WRITE__
-# define kernel_write_pass_float3_unaligned kernel_write_pass_float3
-# else
-ccl_device_inline void kernel_write_pass_float3_unaligned(ccl_global float *buffer, float3 value)
+ccl_device_inline float3 kernel_read_pass_float3(ccl_global float *ccl_restrict buffer)
{
- buffer[0] += value.x;
- buffer[1] += value.y;
- buffer[2] += value.z;
+ return make_float3(buffer[0], buffer[1], buffer[2]);
}
-# endif
-ccl_device_inline void kernel_write_pass_float3_variance(ccl_global float *buffer, float3 value)
+ccl_device_inline float4 kernel_read_pass_float4(ccl_global float *ccl_restrict buffer)
{
- kernel_write_pass_float3_unaligned(buffer, value);
- kernel_write_pass_float3_unaligned(buffer + 3, value * value);
+ return make_float4(buffer[0], buffer[1], buffer[2], buffer[3]);
}
-#endif /* __DENOISING_FEATURES__ */
CCL_NAMESPACE_END
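
Editorial note: kernel_write_pass_float() above either does a plain in-place add or, on the GPU, an atomic float add. A CPU-side sketch of the same split, with the atomic variant emulated through a compare-exchange loop since plain float addition is not atomic; this is a generic pattern, not the Cycles atomics API:

  #include <atomic>
  #include <cstdio>

  /* Non-atomic path: a single thread owns this pass slot. */
  static void write_pass_float(float *buffer, float value)
  {
    *buffer += value;
  }

  /* Atomic path: multiple threads may accumulate into the same slot. */
  static void write_pass_float_atomic(std::atomic<float> *buffer, float value)
  {
    float old = buffer->load(std::memory_order_relaxed);
    while (!buffer->compare_exchange_weak(old, old + value, std::memory_order_relaxed)) {
      /* 'old' is refreshed by compare_exchange_weak on failure; retry. */
    }
  }

  int main()
  {
    float plain = 0.0f;
    std::atomic<float> shared{0.0f};
    write_pass_float(&plain, 0.5f);
    write_pass_float_atomic(&shared, 0.5f);
    printf("%f %f\n", plain, shared.load());
  }
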
diff --git a/intern/cycles/kernel/kernels/cpu/filter.cpp b/intern/cycles/kernel/kernels/cpu/filter.cpp
deleted file mode 100644
index 145a6b6ac40..00000000000
--- a/intern/cycles/kernel/kernels/cpu/filter.cpp
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* CPU kernel entry points */
-
-/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this
- * one with SSE2 intrinsics.
- */
-#if defined(__x86_64__) || defined(_M_X64)
-# define __KERNEL_SSE2__
-#endif
-
-/* When building kernel for native machine detect kernel features from the flags
- * set by compiler.
- */
-#ifdef WITH_KERNEL_NATIVE
-# ifdef __SSE2__
-# ifndef __KERNEL_SSE2__
-# define __KERNEL_SSE2__
-# endif
-# endif
-# ifdef __SSE3__
-# define __KERNEL_SSE3__
-# endif
-# ifdef __SSSE3__
-# define __KERNEL_SSSE3__
-# endif
-# ifdef __SSE4_1__
-# define __KERNEL_SSE41__
-# endif
-# ifdef __AVX__
-# define __KERNEL_SSE__
-# define __KERNEL_AVX__
-# endif
-# ifdef __AVX2__
-# define __KERNEL_SSE__
-# define __KERNEL_AVX2__
-# endif
-#endif
-
-/* quiet unused define warnings */
-#if defined(__KERNEL_SSE2__)
-/* do nothing */
-#endif
-
-#include "kernel/filter/filter.h"
-#define KERNEL_ARCH cpu
-#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp
deleted file mode 100644
index 012daba62d8..00000000000
--- a/intern/cycles/kernel/kernels/cpu/filter_avx.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with AVX
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without for other CPU's. */
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE__
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
-
-#include "kernel/filter/filter.h"
-#define KERNEL_ARCH cpu_avx
-#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp
deleted file mode 100644
index 16351a7f949..00000000000
--- a/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with AVX2
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without for other CPU's. */
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE__
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-# define __KERNEL_AVX2__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
-
-#include "kernel/filter/filter.h"
-#define KERNEL_ARCH cpu_avx2
-#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
deleted file mode 100644
index 1423b182ab8..00000000000
--- a/intern/cycles/kernel/kernels/cpu/filter_cpu.h
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Templated common declaration part of all CPU kernels. */
-
-void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
- TileInfo *tile_info,
- int x,
- int y,
- float *unfilteredA,
- float *unfilteredB,
- float *sampleV,
- float *sampleVV,
- float *bufferV,
- int *prefilter_rect,
- int buffer_pass_stride,
- int buffer_denoising_offset);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
- TileInfo *tile_info,
- int m_offset,
- int v_offset,
- int x,
- int y,
- float *mean,
- float *variance,
- float scale,
- int *prefilter_rect,
- int buffer_pass_stride,
- int buffer_denoising_offset);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_write_feature)(int sample,
- int x,
- int y,
- int *buffer_params,
- float *from,
- float *buffer,
- int out_offset,
- int *prefilter_rect);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x,
- int y,
- ccl_global float *image,
- ccl_global float *variance,
- ccl_global float *depth,
- ccl_global float *output,
- int *rect,
- int pass_stride);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(
- int x, int y, float *mean, float *variance, float *a, float *b, int *prefilter_rect, int r);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float *buffer,
- TileInfo *tiles,
- int x,
- int y,
- int storage_ofs,
- float *transform,
- int *rank,
- int *rect,
- int pass_stride,
- int frame_stride,
- bool use_time,
- int radius,
- float pca_threshold);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
- int dy,
- float *weight_image,
- float *variance_image,
- float *scale_image,
- float *difference_image,
- int *rect,
- int stride,
- int channel_offset,
- int frame_offset,
- float a,
- float k_2);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(
- float *difference_image, float *out_image, int *rect, int stride, int f);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(
- float *difference_image, float *out_image, int *rect, int stride, int f);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
- int dy,
- float *difference_image,
- float *image,
- float *temp_image,
- float *out_image,
- float *accum_image,
- int *rect,
- int channel_offset,
- int stride,
- int f);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
- int dy,
- int t,
- float *difference_image,
- float *buffer,
- float *transform,
- int *rank,
- float *XtWX,
- float3 *XtWY,
- int *rect,
- int *filter_window,
- int stride,
- int f,
- int pass_stride,
- int frame_offset,
- bool use_time);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image,
- float *accum_image,
- int *rect,
- int stride);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
- int y,
- int storage_ofs,
- float *buffer,
- int *rank,
- float *XtWX,
- float3 *XtWY,
- int *buffer_params,
- int sample);
-
-#undef KERNEL_ARCH
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
deleted file mode 100644
index 3d4cb87e104..00000000000
--- a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
+++ /dev/null
@@ -1,331 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Templated common implementation part of all CPU kernels.
- *
- * The idea is that particular .cpp files sets needed optimization flags and
- * simply includes this file without worry of copying actual implementation over.
- */
-
-#include "kernel/kernel_compat_cpu.h"
-
-#include "kernel/filter/filter_kernel.h"
-
-#ifdef KERNEL_STUB
-# define STUB_ASSERT(arch, name) \
- assert(!(#name " kernel stub for architecture " #arch " was called!"))
-#endif
-
-CCL_NAMESPACE_BEGIN
-
-/* Denoise filter */
-
-void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
- TileInfo *tile_info,
- int x,
- int y,
- float *unfilteredA,
- float *unfilteredB,
- float *sampleVariance,
- float *sampleVarianceV,
- float *bufferVariance,
- int *prefilter_rect,
- int buffer_pass_stride,
- int buffer_denoising_offset)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_divide_shadow);
-#else
- kernel_filter_divide_shadow(sample,
- tile_info,
- x,
- y,
- unfilteredA,
- unfilteredB,
- sampleVariance,
- sampleVarianceV,
- bufferVariance,
- load_int4(prefilter_rect),
- buffer_pass_stride,
- buffer_denoising_offset);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
- TileInfo *tile_info,
- int m_offset,
- int v_offset,
- int x,
- int y,
- float *mean,
- float *variance,
- float scale,
- int *prefilter_rect,
- int buffer_pass_stride,
- int buffer_denoising_offset)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_get_feature);
-#else
- kernel_filter_get_feature(sample,
- tile_info,
- m_offset,
- v_offset,
- x,
- y,
- mean,
- variance,
- scale,
- load_int4(prefilter_rect),
- buffer_pass_stride,
- buffer_denoising_offset);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_write_feature)(int sample,
- int x,
- int y,
- int *buffer_params,
- float *from,
- float *buffer,
- int out_offset,
- int *prefilter_rect)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_write_feature);
-#else
- kernel_filter_write_feature(
- sample, x, y, load_int4(buffer_params), from, buffer, out_offset, load_int4(prefilter_rect));
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x,
- int y,
- ccl_global float *image,
- ccl_global float *variance,
- ccl_global float *depth,
- ccl_global float *output,
- int *rect,
- int pass_stride)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_detect_outliers);
-#else
- kernel_filter_detect_outliers(
- x, y, image, variance, depth, output, load_int4(rect), pass_stride);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(
- int x, int y, float *mean, float *variance, float *a, float *b, int *prefilter_rect, int r)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_combine_halves);
-#else
- kernel_filter_combine_halves(x, y, mean, variance, a, b, load_int4(prefilter_rect), r);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float *buffer,
- TileInfo *tile_info,
- int x,
- int y,
- int storage_ofs,
- float *transform,
- int *rank,
- int *prefilter_rect,
- int pass_stride,
- int frame_stride,
- bool use_time,
- int radius,
- float pca_threshold)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_construct_transform);
-#else
- rank += storage_ofs;
- transform += storage_ofs * TRANSFORM_SIZE;
- kernel_filter_construct_transform(buffer,
- tile_info,
- x,
- y,
- load_int4(prefilter_rect),
- pass_stride,
- frame_stride,
- use_time,
- transform,
- rank,
- radius,
- pca_threshold);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
- int dy,
- float *weight_image,
- float *variance_image,
- float *scale_image,
- float *difference_image,
- int *rect,
- int stride,
- int channel_offset,
- int frame_offset,
- float a,
- float k_2)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference);
-#else
- kernel_filter_nlm_calc_difference(dx,
- dy,
- weight_image,
- variance_image,
- scale_image,
- difference_image,
- load_int4(rect),
- stride,
- channel_offset,
- frame_offset,
- a,
- k_2);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(
- float *difference_image, float *out_image, int *rect, int stride, int f)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_nlm_blur);
-#else
- kernel_filter_nlm_blur(difference_image, out_image, load_int4(rect), stride, f);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(
- float *difference_image, float *out_image, int *rect, int stride, int f)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_weight);
-#else
- kernel_filter_nlm_calc_weight(difference_image, out_image, load_int4(rect), stride, f);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
- int dy,
- float *difference_image,
- float *image,
- float *temp_image,
- float *out_image,
- float *accum_image,
- int *rect,
- int channel_offset,
- int stride,
- int f)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output);
-#else
- kernel_filter_nlm_update_output(dx,
- dy,
- difference_image,
- image,
- temp_image,
- out_image,
- accum_image,
- load_int4(rect),
- channel_offset,
- stride,
- f);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
- int dy,
- int t,
- float *difference_image,
- float *buffer,
- float *transform,
- int *rank,
- float *XtWX,
- float3 *XtWY,
- int *rect,
- int *filter_window,
- int stride,
- int f,
- int pass_stride,
- int frame_offset,
- bool use_time)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian);
-#else
- kernel_filter_nlm_construct_gramian(dx,
- dy,
- t,
- difference_image,
- buffer,
- transform,
- rank,
- XtWX,
- XtWY,
- load_int4(rect),
- load_int4(filter_window),
- stride,
- f,
- pass_stride,
- frame_offset,
- use_time);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image,
- float *accum_image,
- int *rect,
- int stride)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_nlm_normalize);
-#else
- kernel_filter_nlm_normalize(out_image, accum_image, load_int4(rect), stride);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
- int y,
- int storage_ofs,
- float *buffer,
- int *rank,
- float *XtWX,
- float3 *XtWY,
- int *buffer_params,
- int sample)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_finalize);
-#else
- XtWX += storage_ofs * XTWX_SIZE;
- XtWY += storage_ofs * XTWY_SIZE;
- rank += storage_ofs;
- kernel_filter_finalize(x, y, buffer, rank, 1, XtWX, XtWY, load_int4(buffer_params), sample);
-#endif
-}
-
-#undef KERNEL_STUB
-#undef STUB_ASSERT
-#undef KERNEL_ARCH
-
-CCL_NAMESPACE_END
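
The header comment in the removed filter_cpu_impl.h describes the pattern all of these deleted files relied on: each per-architecture .cpp sets its optimization defines, defines KERNEL_ARCH, and includes one shared implementation header whose function names embed the architecture. The self-contained C++ sketch below models that token-pasting idea; the CONCAT macros and the blur function are illustrative stand-ins, not the real KERNEL_FUNCTION_FULL_NAME machinery.

#include <cstdio>

// Two-level concat so KERNEL_ARCH is expanded before pasting.
#define KERNEL_CONCAT_IMPL(a, b) a##b
#define KERNEL_CONCAT(a, b) KERNEL_CONCAT_IMPL(a, b)
#define KERNEL_FUNCTION_NAME(name) \
  KERNEL_CONCAT(KERNEL_CONCAT(kernel_, KERNEL_ARCH), KERNEL_CONCAT(_, name))

// In the real layout each block below would be its own .cpp, compiled with the
// matching SSE2/AVX2 compiler flags and including a shared *_impl.h header.
#define KERNEL_ARCH cpu_sse2
void KERNEL_FUNCTION_NAME(blur)(void)
{
  std::puts("sse2 variant");
}
#undef KERNEL_ARCH

#define KERNEL_ARCH cpu_avx2
void KERNEL_FUNCTION_NAME(blur)(void)
{
  std::puts("avx2 variant");
}
#undef KERNEL_ARCH

int main()
{
  // Distinct symbol names let every variant link into one binary.
  kernel_cpu_sse2_blur();
  kernel_cpu_avx2_blur();
  return 0;
}
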
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp
deleted file mode 100644
index 75833d83648..00000000000
--- a/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with SSE2
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without for other CPU's. */
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
-
-#include "kernel/filter/filter.h"
-#define KERNEL_ARCH cpu_sse2
-#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp
deleted file mode 100644
index c998cd54d3a..00000000000
--- a/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without for other CPU's. */
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
-
-#include "kernel/filter/filter.h"
-#define KERNEL_ARCH cpu_sse3
-#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp
deleted file mode 100644
index fc4ef1fca5b..00000000000
--- a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without for other CPU's. */
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE__
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
-
-#include "kernel/filter/filter.h"
-#define KERNEL_ARCH cpu_sse41
-#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
deleted file mode 100644
index ea3103f12c3..00000000000
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Templated common declaration part of all CPU kernels. */
-
-void KERNEL_FUNCTION_FULL_NAME(path_trace)(
- KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride);
-
-void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg,
- uchar4 *rgba,
- float *buffer,
- float sample_scale,
- int x,
- int y,
- int offset,
- int stride);
-
-void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
- uchar4 *rgba,
- float *buffer,
- float sample_scale,
- int x,
- int y,
- int offset,
- int stride);
-
-void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
- uint4 *input,
- float4 *output,
- int type,
- int filter,
- int i,
- int offset,
- int sample);
-
-void KERNEL_FUNCTION_FULL_NAME(bake)(
- KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride);
-
-/* Split kernels */
-
-void KERNEL_FUNCTION_FULL_NAME(data_init)(KernelGlobals *kg,
- ccl_constant KernelData *data,
- ccl_global void *split_data_buffer,
- int num_elements,
- ccl_global char *ray_state,
- int start_sample,
- int end_sample,
- int sx,
- int sy,
- int sw,
- int sh,
- int offset,
- int stride,
- ccl_global int *Queue_index,
- int queuesize,
- ccl_global char *use_queues_flag,
- ccl_global unsigned int *work_pool_wgs,
- unsigned int num_samples,
- ccl_global float *buffer);
-
-#define DECLARE_SPLIT_KERNEL_FUNCTION(name) \
- void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals * kg, KernelData * data);
-
-DECLARE_SPLIT_KERNEL_FUNCTION(path_init)
-DECLARE_SPLIT_KERNEL_FUNCTION(scene_intersect)
-DECLARE_SPLIT_KERNEL_FUNCTION(lamp_emission)
-DECLARE_SPLIT_KERNEL_FUNCTION(do_volume)
-DECLARE_SPLIT_KERNEL_FUNCTION(queue_enqueue)
-DECLARE_SPLIT_KERNEL_FUNCTION(indirect_background)
-DECLARE_SPLIT_KERNEL_FUNCTION(shader_setup)
-DECLARE_SPLIT_KERNEL_FUNCTION(shader_sort)
-DECLARE_SPLIT_KERNEL_FUNCTION(shader_eval)
-DECLARE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao)
-DECLARE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
-DECLARE_SPLIT_KERNEL_FUNCTION(direct_lighting)
-DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
-DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
-DECLARE_SPLIT_KERNEL_FUNCTION(enqueue_inactive)
-DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
-DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
-DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update)
-DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_stopping)
-DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x)
-DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y)
-DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples)
-
-#undef KERNEL_ARCH
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
deleted file mode 100644
index 51d6c23f72f..00000000000
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Templated common implementation part of all CPU kernels.
- *
- * The idea is that particular .cpp files sets needed optimization flags and
- * simply includes this file without worry of copying actual implementation over.
- */
-
-// clang-format off
-#include "kernel/kernel_compat_cpu.h"
-
-#ifndef KERNEL_STUB
-# ifndef __SPLIT_KERNEL__
-# include "kernel/kernel_math.h"
-# include "kernel/kernel_types.h"
-
-# include "kernel/split/kernel_split_data.h"
-# include "kernel/kernel_globals.h"
-
-# include "kernel/kernel_color.h"
-# include "kernel/kernels/cpu/kernel_cpu_image.h"
-# include "kernel/kernel_film.h"
-# include "kernel/kernel_path.h"
-# include "kernel/kernel_path_branched.h"
-# include "kernel/kernel_bake.h"
-# else
-# include "kernel/split/kernel_split_common.h"
-
-# include "kernel/split/kernel_data_init.h"
-# include "kernel/split/kernel_path_init.h"
-# include "kernel/split/kernel_scene_intersect.h"
-# include "kernel/split/kernel_lamp_emission.h"
-# include "kernel/split/kernel_do_volume.h"
-# include "kernel/split/kernel_queue_enqueue.h"
-# include "kernel/split/kernel_indirect_background.h"
-# include "kernel/split/kernel_shader_setup.h"
-# include "kernel/split/kernel_shader_sort.h"
-# include "kernel/split/kernel_shader_eval.h"
-# include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
-# include "kernel/split/kernel_subsurface_scatter.h"
-# include "kernel/split/kernel_direct_lighting.h"
-# include "kernel/split/kernel_shadow_blocked_ao.h"
-# include "kernel/split/kernel_shadow_blocked_dl.h"
-# include "kernel/split/kernel_enqueue_inactive.h"
-# include "kernel/split/kernel_next_iteration_setup.h"
-# include "kernel/split/kernel_indirect_subsurface.h"
-# include "kernel/split/kernel_buffer_update.h"
-# include "kernel/split/kernel_adaptive_stopping.h"
-# include "kernel/split/kernel_adaptive_filter_x.h"
-# include "kernel/split/kernel_adaptive_filter_y.h"
-# include "kernel/split/kernel_adaptive_adjust_samples.h"
-# endif /* __SPLIT_KERNEL__ */
-#else
-# define STUB_ASSERT(arch, name) \
- assert(!(#name " kernel stub for architecture " #arch " was called!"))
-
-# ifdef __SPLIT_KERNEL__
-# include "kernel/split/kernel_data_init.h"
-# endif /* __SPLIT_KERNEL__ */
-#endif /* KERNEL_STUB */
-// clang-format on
-
-CCL_NAMESPACE_BEGIN
-
-#ifndef __SPLIT_KERNEL__
-
-/* Path Tracing */
-
-void KERNEL_FUNCTION_FULL_NAME(path_trace)(
- KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride)
-{
-# ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, path_trace);
-# else
-# ifdef __BRANCHED_PATH__
- if (kernel_data.integrator.branched) {
- kernel_branched_path_trace(kg, buffer, sample, x, y, offset, stride);
- }
- else
-# endif
- {
- kernel_path_trace(kg, buffer, sample, x, y, offset, stride);
- }
-# endif /* KERNEL_STUB */
-}
-
-/* Film */
-
-void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg,
- uchar4 *rgba,
- float *buffer,
- float sample_scale,
- int x,
- int y,
- int offset,
- int stride)
-{
-# ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, convert_to_byte);
-# else
- kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
-# endif /* KERNEL_STUB */
-}
-
-void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
- uchar4 *rgba,
- float *buffer,
- float sample_scale,
- int x,
- int y,
- int offset,
- int stride)
-{
-# ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, convert_to_half_float);
-# else
- kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
-# endif /* KERNEL_STUB */
-}
-
-/* Bake */
-
-void KERNEL_FUNCTION_FULL_NAME(bake)(
- KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride)
-{
-# ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, bake);
-# else
-# ifdef __BAKING__
- kernel_bake_evaluate(kg, buffer, sample, x, y, offset, stride);
-# endif
-# endif /* KERNEL_STUB */
-}
-
-/* Shader Evaluate */
-
-void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
- uint4 *input,
- float4 *output,
- int type,
- int filter,
- int i,
- int offset,
- int sample)
-{
-# ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, shader);
-# else
- if (type == SHADER_EVAL_DISPLACE) {
- kernel_displace_evaluate(kg, input, output, i);
- }
- else {
- kernel_background_evaluate(kg, input, output, i);
- }
-# endif /* KERNEL_STUB */
-}
-
-#else /* __SPLIT_KERNEL__ */
-
-/* Split Kernel Path Tracing */
-
-# ifdef KERNEL_STUB
-# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
- void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals * kg, KernelData * /*data*/) \
- { \
- STUB_ASSERT(KERNEL_ARCH, name); \
- }
-
-# define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
- void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals * kg, KernelData * /*data*/) \
- { \
- STUB_ASSERT(KERNEL_ARCH, name); \
- }
-# else
-# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
- void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals * kg, KernelData * /*data*/) \
- { \
- kernel_##name(kg); \
- }
-
-# define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
- void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals * kg, KernelData * /*data*/) \
- { \
- ccl_local type locals; \
- kernel_##name(kg, &locals); \
- }
-# endif /* KERNEL_STUB */
-
-DEFINE_SPLIT_KERNEL_FUNCTION(path_init)
-DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect)
-DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission)
-DEFINE_SPLIT_KERNEL_FUNCTION(do_volume)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals)
-DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals)
-DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao,
- BackgroundAOLocals)
-DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
-DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_stopping)
-DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x)
-DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y)
-DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples)
-#endif /* __SPLIT_KERNEL__ */
-
-#undef KERNEL_STUB
-#undef STUB_ASSERT
-#undef KERNEL_ARCH
-
-CCL_NAMESPACE_END
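
The removed kernel_cpu_impl.h (like filter_cpu_impl.h above) still compiles every entry point when an optimized variant is disabled: KERNEL_STUB is defined and each function collapses to STUB_ASSERT, so all symbols link but abort with a readable message if ever called. A minimal C++ sketch of that idea follows; the function names are illustrative and the feature-based dispatcher that would normally choose a variant is only hinted at, as an assumption.

#include <cassert>
#include <cstdio>

// Same trick as the removed STUB_ASSERT: a string literal is always truthy,
// so negating it makes the assert fire and print the embedded message.
#define STUB_ASSERT(arch, name) \
  assert(!(#name " kernel stub for architecture " #arch " was called!"))

// Pretend the AVX2 build of this variant was disabled: the symbol still
// exists for the linker, but calling it is an error caught at runtime.
void kernel_cpu_avx2_path_trace(void)
{
  STUB_ASSERT(cpu_avx2, path_trace);
}

// The generic variant is always compiled and safe to call.
void kernel_cpu_path_trace(void)
{
  std::puts("generic path_trace variant");
}

int main()
{
  // A real dispatcher would pick the best variant from detected CPU features;
  // here we simply call the one that is guaranteed to exist.
  kernel_cpu_path_trace();
  return 0;
}
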
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split.cpp
deleted file mode 100644
index 989f5e5aaa8..00000000000
--- a/intern/cycles/kernel/kernels/cpu/kernel_split.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* CPU kernel entry points */
-
-/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this
- * one with SSE2 intrinsics.
- */
-#if defined(__x86_64__) || defined(_M_X64)
-# define __KERNEL_SSE2__
-#endif
-
-#define __SPLIT_KERNEL__
-
-/* When building kernel for native machine detect kernel features from the flags
- * set by compiler.
- */
-#ifdef WITH_KERNEL_NATIVE
-# ifdef __SSE2__
-# ifndef __KERNEL_SSE2__
-# define __KERNEL_SSE2__
-# endif
-# endif
-# ifdef __SSE3__
-# define __KERNEL_SSE3__
-# endif
-# ifdef __SSSE3__
-# define __KERNEL_SSSE3__
-# endif
-# ifdef __SSE4_1__
-# define __KERNEL_SSE41__
-# endif
-# ifdef __AVX__
-# define __KERNEL_AVX__
-# endif
-# ifdef __AVX2__
-# define __KERNEL_SSE__
-# define __KERNEL_AVX2__
-# endif
-#endif
-
-/* quiet unused define warnings */
-#if defined(__KERNEL_SSE2__)
-/* do nothing */
-#endif
-
-#include "kernel/kernel.h"
-#define KERNEL_ARCH cpu
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
deleted file mode 100644
index 40e485d27c0..00000000000
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with AVX
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without for other CPU's. */
-
-#define __SPLIT_KERNEL__
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE__
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
-
-#include "kernel/kernel.h"
-#define KERNEL_ARCH cpu_avx
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
deleted file mode 100644
index 8c44238470e..00000000000
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright 2011-2014 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with AVX2
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without for other CPU's. */
-
-#define __SPLIT_KERNEL__
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE__
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-# define __KERNEL_AVX2__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
-
-#include "kernel/kernel.h"
-#define KERNEL_ARCH cpu_avx2
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
deleted file mode 100644
index 7a3f218d5fc..00000000000
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with SSE2
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without for other CPU's. */
-
-#define __SPLIT_KERNEL__
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
-
-#include "kernel/kernel.h"
-#define KERNEL_ARCH cpu_sse2
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
deleted file mode 100644
index 1cab59e0ea0..00000000000
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without for other CPU's. */
-
-#define __SPLIT_KERNEL__
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
-
-#include "kernel/kernel.h"
-#define KERNEL_ARCH cpu_sse3
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
deleted file mode 100644
index 637126d9d4c..00000000000
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without for other CPU's. */
-
-#define __SPLIT_KERNEL__
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
-
-#include "kernel/kernel.h"
-#define KERNEL_ARCH cpu_sse41
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu
deleted file mode 100644
index 6c9642d1f03..00000000000
--- a/intern/cycles/kernel/kernels/cuda/filter.cu
+++ /dev/null
@@ -1,413 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* CUDA kernel entry points */
-
-#ifdef __CUDA_ARCH__
-
-#include "kernel_config.h"
-
-#include "kernel/kernel_compat_cuda.h"
-
-#include "kernel/filter/filter_kernel.h"
-
-/* kernels */
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_copy_input(float *buffer,
- CCL_FILTER_TILE_INFO,
- int4 prefilter_rect,
- int buffer_pass_stride)
-{
- int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
- int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- int xtile = (x < tile_info->x[1]) ? 0 : ((x < tile_info->x[2]) ? 1 : 2);
- int ytile = (y < tile_info->y[1]) ? 0 : ((y < tile_info->y[2]) ? 1 : 2);
- int itile = ytile * 3 + xtile;
- float *const in = ((float *)ccl_get_tile_buffer(itile)) +
- (tile_info->offsets[itile] + y * tile_info->strides[itile] + x) * buffer_pass_stride;
- buffer += ((y - prefilter_rect.y) * (prefilter_rect.z - prefilter_rect.x) + (x - prefilter_rect.x)) * buffer_pass_stride;
- for (int i = 0; i < buffer_pass_stride; ++i)
- buffer[i] = in[i];
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_convert_to_rgb(float *rgb, float *buf, int sw, int sh, int stride, int pass_stride, int3 pass_offset, int num_inputs, int num_samples)
-{
- int x = blockDim.x*blockIdx.x + threadIdx.x;
- int y = blockDim.y*blockIdx.y + threadIdx.y;
- if(x < sw && y < sh) {
- if (num_inputs > 0) {
- float *in = buf + x * pass_stride + (y * stride + pass_offset.x) / sizeof(float);
- float *out = rgb + (x + y * sw) * 3;
- out[0] = clamp(in[0] / num_samples, 0.0f, 10000.0f);
- out[1] = clamp(in[1] / num_samples, 0.0f, 10000.0f);
- out[2] = clamp(in[2] / num_samples, 0.0f, 10000.0f);
- }
- if (num_inputs > 1) {
- float *in = buf + x * pass_stride + (y * stride + pass_offset.y) / sizeof(float);
- float *out = rgb + (x + y * sw) * 3 + (sw * sh) * 3;
- out[0] = in[0] / num_samples;
- out[1] = in[1] / num_samples;
- out[2] = in[2] / num_samples;
- }
- if (num_inputs > 2) {
- float *in = buf + x * pass_stride + (y * stride + pass_offset.z) / sizeof(float);
- float *out = rgb + (x + y * sw) * 3 + (sw * sh * 2) * 3;
- out[0] = in[0] / num_samples;
- out[1] = in[1] / num_samples;
- out[2] = in[2] / num_samples;
- }
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_convert_from_rgb(float *rgb, float *buf, int ix, int iy, int iw, int ih, int sx, int sy, int sw, int sh, int offset, int stride, int pass_stride, int num_samples)
-{
- int x = blockDim.x*blockIdx.x + threadIdx.x;
- int y = blockDim.y*blockIdx.y + threadIdx.y;
- if(x < sw && y < sh) {
- float *in = rgb + ((ix + x) + (iy + y) * iw) * 3;
- float *out = buf + (offset + (sx + x) + (sy + y) * stride) * pass_stride;
- out[0] = in[0] * num_samples;
- out[1] = in[1] * num_samples;
- out[2] = in[2] * num_samples;
- }
-}
-
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_divide_shadow(int sample,
- CCL_FILTER_TILE_INFO,
- float *unfilteredA,
- float *unfilteredB,
- float *sampleVariance,
- float *sampleVarianceV,
- float *bufferVariance,
- int4 prefilter_rect,
- int buffer_pass_stride,
- int buffer_denoising_offset)
-{
- int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
- int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- kernel_filter_divide_shadow(sample,
- tile_info,
- x, y,
- unfilteredA,
- unfilteredB,
- sampleVariance,
- sampleVarianceV,
- bufferVariance,
- prefilter_rect,
- buffer_pass_stride,
- buffer_denoising_offset);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_get_feature(int sample,
- CCL_FILTER_TILE_INFO,
- int m_offset,
- int v_offset,
- float *mean,
- float *variance,
- float scale,
- int4 prefilter_rect,
- int buffer_pass_stride,
- int buffer_denoising_offset)
-{
- int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
- int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- kernel_filter_get_feature(sample,
- tile_info,
- m_offset, v_offset,
- x, y,
- mean, variance,
- scale,
- prefilter_rect,
- buffer_pass_stride,
- buffer_denoising_offset);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_write_feature(int sample,
- int4 buffer_params,
- int4 filter_area,
- float *from,
- float *buffer,
- int out_offset,
- int4 prefilter_rect)
-{
- int x = blockDim.x*blockIdx.x + threadIdx.x;
- int y = blockDim.y*blockIdx.y + threadIdx.y;
- if(x < filter_area.z && y < filter_area.w) {
- kernel_filter_write_feature(sample,
- x + filter_area.x,
- y + filter_area.y,
- buffer_params,
- from,
- buffer,
- out_offset,
- prefilter_rect);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_detect_outliers(float *image,
- float *variance,
- float *depth,
- float *output,
- int4 prefilter_rect,
- int pass_stride)
-{
- int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
- int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float *b, int4 prefilter_rect, int r)
-{
- int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
- int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_construct_transform(float const* __restrict__ buffer,
- CCL_FILTER_TILE_INFO,
- float *transform, int *rank,
- int4 filter_area, int4 rect,
- int radius, float pca_threshold,
- int pass_stride, int frame_stride,
- bool use_time)
-{
- int x = blockDim.x*blockIdx.x + threadIdx.x;
- int y = blockDim.y*blockIdx.y + threadIdx.y;
- if(x < filter_area.z && y < filter_area.w) {
- int *l_rank = rank + y*filter_area.z + x;
- float *l_transform = transform + y*filter_area.z + x;
- kernel_filter_construct_transform(buffer,
- tile_info,
- x + filter_area.x, y + filter_area.y,
- rect,
- pass_stride, frame_stride,
- use_time,
- l_transform, l_rank,
- radius, pca_threshold,
- filter_area.z*filter_area.w,
- threadIdx.y*blockDim.x + threadIdx.x);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image,
- const float *ccl_restrict variance_image,
- const float *ccl_restrict scale_image,
- float *difference_image,
- int w,
- int h,
- int stride,
- int pass_stride,
- int r,
- int channel_offset,
- int frame_offset,
- float a,
- float k_2)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
- kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w,
- weight_image,
- variance_image,
- scale_image,
- difference_image + ofs,
- rect, stride,
- channel_offset,
- frame_offset,
- a, k_2);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image,
- float *out_image,
- int w,
- int h,
- int stride,
- int pass_stride,
- int r,
- int f)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
- kernel_filter_nlm_blur(co.x, co.y,
- difference_image + ofs,
- out_image + ofs,
- rect, stride, f);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image,
- float *out_image,
- int w,
- int h,
- int stride,
- int pass_stride,
- int r,
- int f)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
- kernel_filter_nlm_calc_weight(co.x, co.y,
- difference_image + ofs,
- out_image + ofs,
- rect, stride, f);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image,
- const float *ccl_restrict image,
- float *out_image,
- float *accum_image,
- int w,
- int h,
- int stride,
- int pass_stride,
- int channel_offset,
- int r,
- int f)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
- kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w,
- difference_image + ofs,
- image,
- out_image,
- accum_image,
- rect,
- channel_offset,
- stride, f);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_normalize(float *out_image,
- const float *ccl_restrict accum_image,
- int w,
- int h,
- int stride)
-{
- int x = blockDim.x*blockIdx.x + threadIdx.x;
- int y = blockDim.y*blockIdx.y + threadIdx.y;
- if(x < w && y < h) {
- kernel_filter_nlm_normalize(x, y, out_image, accum_image, stride);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_construct_gramian(int t,
- const float *ccl_restrict difference_image,
- const float *ccl_restrict buffer,
- float const* __restrict__ transform,
- int *rank,
- float *XtWX,
- float3 *XtWY,
- int4 filter_window,
- int w,
- int h,
- int stride,
- int pass_stride,
- int r,
- int f,
- int frame_offset,
- bool use_time)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) {
- kernel_filter_nlm_construct_gramian(co.x, co.y,
- co.z, co.w,
- t,
- difference_image + ofs,
- buffer,
- transform, rank,
- XtWX, XtWY,
- rect, filter_window,
- stride, f,
- pass_stride,
- frame_offset,
- use_time,
- threadIdx.y*blockDim.x + threadIdx.x);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_finalize(float *buffer,
- int *rank,
- float *XtWX,
- float3 *XtWY,
- int4 filter_area,
- int4 buffer_params,
- int sample)
-{
- int x = blockDim.x*blockIdx.x + threadIdx.x;
- int y = blockDim.y*blockIdx.y + threadIdx.y;
- if(x < filter_area.z && y < filter_area.w) {
- int storage_ofs = y*filter_area.z+x;
- rank += storage_ofs;
- XtWX += storage_ofs;
- XtWY += storage_ofs;
- kernel_filter_finalize(x, y, buffer, rank,
- filter_area.z*filter_area.w,
- XtWX, XtWY,
- buffer_params, sample);
- }
-}
-
-#endif
-
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
deleted file mode 100644
index cf62b6e781e..00000000000
--- a/intern/cycles/kernel/kernels/cuda/kernel.cu
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* CUDA kernel entry points */
-
-#ifdef __CUDA_ARCH__
-
-#include "kernel/kernel_compat_cuda.h"
-#include "kernel_config.h"
-
-#include "util/util_atomic.h"
-
-#include "kernel/kernel_math.h"
-#include "kernel/kernel_types.h"
-#include "kernel/kernel_globals.h"
-#include "kernel/kernel_color.h"
-#include "kernel/kernels/cuda/kernel_cuda_image.h"
-#include "kernel/kernel_film.h"
-#include "kernel/kernel_path.h"
-#include "kernel/kernel_path_branched.h"
-#include "kernel/kernel_bake.h"
-#include "kernel/kernel_work_stealing.h"
-#include "kernel/kernel_adaptive_sampling.h"
-
-/* kernels */
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_path_trace(WorkTile *tile, uint total_work_size)
-{
- int work_index = ccl_global_id(0);
- bool thread_is_active = work_index < total_work_size;
- uint x, y, sample;
- KernelGlobals kg;
- if(thread_is_active) {
- get_work_pixel(tile, work_index, &x, &y, &sample);
-
- kernel_path_trace(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
- }
-
- if(kernel_data.film.cryptomatte_passes) {
- __syncthreads();
- if(thread_is_active) {
- kernel_cryptomatte_post(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
- }
- }
-}
-
-#ifdef __BRANCHED_PATH__
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_BRANCHED_MAX_REGISTERS)
-kernel_cuda_branched_path_trace(WorkTile *tile, uint total_work_size)
-{
- int work_index = ccl_global_id(0);
- bool thread_is_active = work_index < total_work_size;
- uint x, y, sample;
- KernelGlobals kg;
- if(thread_is_active) {
- get_work_pixel(tile, work_index, &x, &y, &sample);
-
- kernel_branched_path_trace(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
- }
-
- if(kernel_data.film.cryptomatte_passes) {
- __syncthreads();
- if(thread_is_active) {
- kernel_cryptomatte_post(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
- }
- }
-}
-#endif
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_adaptive_stopping(WorkTile *tile, int sample, uint total_work_size)
-{
- int work_index = ccl_global_id(0);
- bool thread_is_active = work_index < total_work_size;
- KernelGlobals kg;
- if(thread_is_active && kernel_data.film.pass_adaptive_aux_buffer) {
- uint x = tile->x + work_index % tile->w;
- uint y = tile->y + work_index / tile->w;
- int index = tile->offset + x + y * tile->stride;
- ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
- kernel_do_adaptive_stopping(&kg, buffer, sample);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_adaptive_filter_x(WorkTile *tile, int sample, uint)
-{
- KernelGlobals kg;
- if(kernel_data.film.pass_adaptive_aux_buffer && sample > kernel_data.integrator.adaptive_min_samples) {
- if(ccl_global_id(0) < tile->h) {
- int y = tile->y + ccl_global_id(0);
- kernel_do_adaptive_filter_x(&kg, y, tile);
- }
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_adaptive_filter_y(WorkTile *tile, int sample, uint)
-{
- KernelGlobals kg;
- if(kernel_data.film.pass_adaptive_aux_buffer && sample > kernel_data.integrator.adaptive_min_samples) {
- if(ccl_global_id(0) < tile->w) {
- int x = tile->x + ccl_global_id(0);
- kernel_do_adaptive_filter_y(&kg, x, tile);
- }
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_adaptive_scale_samples(WorkTile *tile, int start_sample, int sample, uint total_work_size)
-{
- if(kernel_data.film.pass_adaptive_aux_buffer) {
- int work_index = ccl_global_id(0);
- bool thread_is_active = work_index < total_work_size;
- KernelGlobals kg;
- if(thread_is_active) {
- uint x = tile->x + work_index % tile->w;
- uint y = tile->y + work_index / tile->w;
- int index = tile->offset + x + y * tile->stride;
- ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
- if(buffer[kernel_data.film.pass_sample_count] < 0.0f) {
- buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
- float sample_multiplier = sample / buffer[kernel_data.film.pass_sample_count];
- if(sample_multiplier != 1.0f) {
- kernel_adaptive_post_adjust(&kg, buffer, sample_multiplier);
- }
- }
- else {
- kernel_adaptive_post_adjust(&kg, buffer, sample / (sample - 1.0f));
- }
- }
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
-{
- int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
- int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
-
- if(x < sx + sw && y < sy + sh) {
- kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
-{
- int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
- int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
-
- if(x < sx + sw && y < sy + sh) {
- kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_displace(uint4 *input,
- float4 *output,
- int type,
- int sx,
- int sw,
- int offset,
- int sample)
-{
- int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
-
- if(x < sx + sw) {
- KernelGlobals kg;
- kernel_displace_evaluate(&kg, input, output, x);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_background(uint4 *input,
- float4 *output,
- int type,
- int sx,
- int sw,
- int offset,
- int sample)
-{
- int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
-
- if(x < sx + sw) {
- KernelGlobals kg;
- kernel_background_evaluate(&kg, input, output, x);
- }
-}
-
-#ifdef __BAKING__
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_bake(WorkTile *tile, uint total_work_size)
-{
- int work_index = ccl_global_id(0);
-
- if(work_index < total_work_size) {
- uint x, y, sample;
- get_work_pixel(tile, work_index, &x, &y, &sample);
-
- KernelGlobals kg;
- kernel_bake_evaluate(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
- }
-}
-#endif
-
-#endif
-
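For orientation, a minimal host-side sketch of how the 2D film-conversion entry points deleted above are dispatched. This is an illustrative CUDA runtime-style launch only; the wrapper function name and the <<<...>>> syntax are assumptions for readability, since the actual Cycles host code drives these kernels through the CUDA driver API.

/* Illustrative sketch: launch geometry for kernel_cuda_convert_to_byte,
 * assuming CUDA_THREADS_BLOCK_WIDTH = 16 and a tile of sw x sh pixels. */
static void launch_convert_to_byte_sketch(uchar4 *rgba, float *buffer, float sample_scale,
                                          int sx, int sy, int sw, int sh, int offset, int stride)
{
  const int block_width = 16; /* CUDA_THREADS_BLOCK_WIDTH */
  dim3 threads(block_width, block_width);
  dim3 blocks((sw + block_width - 1) / block_width,
              (sh + block_width - 1) / block_width);
  /* Each thread maps to one (x, y) inside the tile; the kernel bounds-checks
   * against sx + sw and sy + sh, so partial edge blocks are safe. */
  kernel_cuda_convert_to_byte<<<blocks, threads>>>(
      rgba, buffer, sample_scale, sx, sy, sw, sh, offset, stride);
}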
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_config.h b/intern/cycles/kernel/kernels/cuda/kernel_config.h
deleted file mode 100644
index 2e47ce2de6c..00000000000
--- a/intern/cycles/kernel/kernels/cuda/kernel_config.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* device data taken from CUDA occupancy calculator */
-
-/* 3.0 and 3.5 */
-#if __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
-# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-# define CUDA_BLOCK_MAX_THREADS 1024
-# define CUDA_THREAD_MAX_REGISTERS 63
-
-/* tunable parameters */
-# define CUDA_THREADS_BLOCK_WIDTH 16
-# define CUDA_KERNEL_MAX_REGISTERS 63
-# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 3.2 */
-#elif __CUDA_ARCH__ == 320
-# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
-# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-# define CUDA_BLOCK_MAX_THREADS 1024
-# define CUDA_THREAD_MAX_REGISTERS 63
-
-/* tunable parameters */
-# define CUDA_THREADS_BLOCK_WIDTH 16
-# define CUDA_KERNEL_MAX_REGISTERS 63
-# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 3.7 */
-#elif __CUDA_ARCH__ == 370
-# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-# define CUDA_BLOCK_MAX_THREADS 1024
-# define CUDA_THREAD_MAX_REGISTERS 255
-
-/* tunable parameters */
-# define CUDA_THREADS_BLOCK_WIDTH 16
-# define CUDA_KERNEL_MAX_REGISTERS 63
-# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 5.x, 6.x */
-#elif __CUDA_ARCH__ <= 699
-# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
-# define CUDA_BLOCK_MAX_THREADS 1024
-# define CUDA_THREAD_MAX_REGISTERS 255
-
-/* tunable parameters */
-# define CUDA_THREADS_BLOCK_WIDTH 16
-/* CUDA 9.0 seems to cause slowdowns on high-end Pascal cards unless we increase the number of
- * registers */
-# if __CUDACC_VER_MAJOR__ >= 9 && __CUDA_ARCH__ >= 600
-# define CUDA_KERNEL_MAX_REGISTERS 64
-# else
-# define CUDA_KERNEL_MAX_REGISTERS 48
-# endif
-# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 7.x, 8.x */
-#elif __CUDA_ARCH__ <= 899
-# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
-# define CUDA_BLOCK_MAX_THREADS 1024
-# define CUDA_THREAD_MAX_REGISTERS 255
-
-/* tunable parameters */
-# define CUDA_THREADS_BLOCK_WIDTH 16
-# define CUDA_KERNEL_MAX_REGISTERS 64
-# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 72
-
-/* unknown architecture */
-#else
-# error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
-#endif
-
-/* For the split kernel, using all registers seems fastest for now, but this
- * is unlikely to be optimal once we resolve other bottlenecks. */
-
-#define CUDA_KERNEL_SPLIT_MAX_REGISTERS CUDA_THREAD_MAX_REGISTERS
-
-/* Compute number of threads per block and minimum blocks per multiprocessor
- * given the maximum number of registers per thread. */
-
-#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
- __launch_bounds__(threads_block_width *threads_block_width, \
- CUDA_MULTIPRESSOR_MAX_REGISTERS / \
- (threads_block_width * threads_block_width * thread_num_registers))
-
-/* sanity checks */
-
-#if CUDA_THREADS_BLOCK_WIDTH * CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS
-# error "Maximum number of threads per block exceeded"
-#endif
-
-#if CUDA_MULTIPRESSOR_MAX_REGISTERS / \
- (CUDA_THREADS_BLOCK_WIDTH * CUDA_THREADS_BLOCK_WIDTH * CUDA_KERNEL_MAX_REGISTERS) > \
- CUDA_MULTIPROCESSOR_MAX_BLOCKS
-# error "Maximum number of blocks per multiprocessor exceeded"
-#endif
-
-#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
-# error "Maximum number of registers per thread exceeded"
-#endif
-
-#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
-# error "Maximum number of registers per thread exceeded"
-#endif
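To make the launch-bounds arithmetic above concrete, a worked expansion for one branch; the choice of a 6.x device built with CUDA 9 or newer is an assumption for illustration only.

/* Worked example: with CUDA_THREADS_BLOCK_WIDTH = 16, CUDA_KERNEL_MAX_REGISTERS = 64
 * and CUDA_MULTIPRESSOR_MAX_REGISTERS = 65536 (the 5.x/6.x branch above, CUDA 9+),
 *   CUDA_LAUNCH_BOUNDS(16, 64)
 * expands to
 *   __launch_bounds__(16 * 16, 65536 / (16 * 16 * 64)) == __launch_bounds__(256, 4)
 * i.e. 256 threads per block with at least 4 resident blocks per multiprocessor,
 * which also satisfies the sanity checks above (4 <= 32 blocks, 64 <= 255 registers). */
extern "C" __global__ void __launch_bounds__(256, 4) kernel_cuda_launch_bounds_example()
{
  /* Empty body; only the launch-bounds attribute matters for this sketch. */
}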
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
deleted file mode 100644
index 95ad7599cf1..00000000000
--- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * Copyright 2011-2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* CUDA split kernel entry points */
-
-#ifdef __CUDA_ARCH__
-
-#define __SPLIT_KERNEL__
-
-#include "kernel/kernel_compat_cuda.h"
-#include "kernel_config.h"
-
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_data_init.h"
-#include "kernel/split/kernel_path_init.h"
-#include "kernel/split/kernel_scene_intersect.h"
-#include "kernel/split/kernel_lamp_emission.h"
-#include "kernel/split/kernel_do_volume.h"
-#include "kernel/split/kernel_queue_enqueue.h"
-#include "kernel/split/kernel_indirect_background.h"
-#include "kernel/split/kernel_shader_setup.h"
-#include "kernel/split/kernel_shader_sort.h"
-#include "kernel/split/kernel_shader_eval.h"
-#include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
-#include "kernel/split/kernel_subsurface_scatter.h"
-#include "kernel/split/kernel_direct_lighting.h"
-#include "kernel/split/kernel_shadow_blocked_ao.h"
-#include "kernel/split/kernel_shadow_blocked_dl.h"
-#include "kernel/split/kernel_enqueue_inactive.h"
-#include "kernel/split/kernel_next_iteration_setup.h"
-#include "kernel/split/kernel_indirect_subsurface.h"
-#include "kernel/split/kernel_buffer_update.h"
-#include "kernel/split/kernel_adaptive_stopping.h"
-#include "kernel/split/kernel_adaptive_filter_x.h"
-#include "kernel/split/kernel_adaptive_filter_y.h"
-#include "kernel/split/kernel_adaptive_adjust_samples.h"
-
-#include "kernel/kernel_film.h"
-
-/* kernels */
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_state_buffer_size(uint num_threads, uint64_t *size)
-{
- *size = split_data_buffer_size(NULL, num_threads);
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_path_trace_data_init(
- ccl_global void *split_data_buffer,
- int num_elements,
- ccl_global char *ray_state,
- int start_sample,
- int end_sample,
- int sx, int sy, int sw, int sh, int offset, int stride,
- ccl_global int *Queue_index,
- int queuesize,
- ccl_global char *use_queues_flag,
- ccl_global unsigned int *work_pool_wgs,
- unsigned int num_samples,
- ccl_global float *buffer)
-{
- kernel_data_init(NULL,
- NULL,
- split_data_buffer,
- num_elements,
- ray_state,
- start_sample,
- end_sample,
- sx, sy, sw, sh, offset, stride,
- Queue_index,
- queuesize,
- use_queues_flag,
- work_pool_wgs,
- num_samples,
- buffer);
-}
-
-#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
- extern "C" __global__ void \
- CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \
- kernel_cuda_##name() \
- { \
- kernel_##name(NULL); \
- }
-
-#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
- extern "C" __global__ void \
- CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \
- kernel_cuda_##name() \
- { \
- ccl_local type locals; \
- kernel_##name(NULL, &locals); \
- }
-
-DEFINE_SPLIT_KERNEL_FUNCTION(path_init)
-DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect)
-DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission)
-DEFINE_SPLIT_KERNEL_FUNCTION(do_volume)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals)
-DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals)
-DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals)
-DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
-DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_stopping)
-DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x)
-DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y)
-DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples)
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
-{
- int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
- int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
-
- if(x < sx + sw && y < sy + sh)
- kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
-{
- int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
- int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
-
- if(x < sx + sw && y < sy + sh)
- kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
-}
-
-#endif
-
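As a reading aid, here is what two of the macro invocations above expand to; the expansions follow directly from DEFINE_SPLIT_KERNEL_FUNCTION and DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS as defined in this deleted file, with nothing added.

/* DEFINE_SPLIT_KERNEL_FUNCTION(path_init) expands to: */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS)
kernel_cuda_path_init()
{
  kernel_path_init(NULL);
}

/* DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals) expands to: */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS)
kernel_cuda_queue_enqueue()
{
  ccl_local QueueEnqueueLocals locals;
  kernel_queue_enqueue(NULL, &locals);
}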
diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl
deleted file mode 100644
index 996bc27f71b..00000000000
--- a/intern/cycles/kernel/kernels/opencl/filter.cl
+++ /dev/null
@@ -1,321 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* OpenCL kernel entry points */
-
-#include "kernel/kernel_compat_opencl.h"
-
-#include "kernel/filter/filter_kernel.h"
-
-/* kernels */
-
-__kernel void kernel_ocl_filter_divide_shadow(int sample,
- CCL_FILTER_TILE_INFO,
- ccl_global float *unfilteredA,
- ccl_global float *unfilteredB,
- ccl_global float *sampleVariance,
- ccl_global float *sampleVarianceV,
- ccl_global float *bufferVariance,
- int4 prefilter_rect,
- int buffer_pass_stride,
- int buffer_denoising_offset)
-{
- int x = prefilter_rect.x + get_global_id(0);
- int y = prefilter_rect.y + get_global_id(1);
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- kernel_filter_divide_shadow(sample,
- CCL_FILTER_TILE_INFO_ARG,
- x, y,
- unfilteredA,
- unfilteredB,
- sampleVariance,
- sampleVarianceV,
- bufferVariance,
- prefilter_rect,
- buffer_pass_stride,
- buffer_denoising_offset);
- }
-}
-
-__kernel void kernel_ocl_filter_get_feature(int sample,
- CCL_FILTER_TILE_INFO,
- int m_offset,
- int v_offset,
- ccl_global float *mean,
- ccl_global float *variance,
- float scale,
- int4 prefilter_rect,
- int buffer_pass_stride,
- int buffer_denoising_offset)
-{
- int x = prefilter_rect.x + get_global_id(0);
- int y = prefilter_rect.y + get_global_id(1);
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- kernel_filter_get_feature(sample,
- CCL_FILTER_TILE_INFO_ARG,
- m_offset, v_offset,
- x, y,
- mean, variance,
- scale,
- prefilter_rect,
- buffer_pass_stride,
- buffer_denoising_offset);
- }
-}
-
-__kernel void kernel_ocl_filter_write_feature(int sample,
- int4 buffer_params,
- int4 filter_area,
- ccl_global float *from,
- ccl_global float *buffer,
- int out_offset,
- int4 prefilter_rect)
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
- if(x < filter_area.z && y < filter_area.w) {
- kernel_filter_write_feature(sample,
- x + filter_area.x,
- y + filter_area.y,
- buffer_params,
- from,
- buffer,
- out_offset,
- prefilter_rect);
- }
-}
-
-__kernel void kernel_ocl_filter_detect_outliers(ccl_global float *image,
- ccl_global float *variance,
- ccl_global float *depth,
- ccl_global float *output,
- int4 prefilter_rect,
- int pass_stride)
-{
- int x = prefilter_rect.x + get_global_id(0);
- int y = prefilter_rect.y + get_global_id(1);
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride);
- }
-}
-
-__kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean,
- ccl_global float *variance,
- ccl_global float *a,
- ccl_global float *b,
- int4 prefilter_rect,
- int r)
-{
- int x = prefilter_rect.x + get_global_id(0);
- int y = prefilter_rect.y + get_global_id(1);
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
- }
-}
-
-__kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_restrict buffer,
- CCL_FILTER_TILE_INFO,
- ccl_global float *transform,
- ccl_global int *rank,
- int4 filter_area,
- int4 rect,
- int pass_stride,
- int frame_stride,
- char use_time,
- int radius,
- float pca_threshold)
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
- if(x < filter_area.z && y < filter_area.w) {
- ccl_global int *l_rank = rank + y*filter_area.z + x;
- ccl_global float *l_transform = transform + y*filter_area.z + x;
- kernel_filter_construct_transform(buffer,
- CCL_FILTER_TILE_INFO_ARG,
- x + filter_area.x, y + filter_area.y,
- rect,
- pass_stride, frame_stride,
- use_time,
- l_transform, l_rank,
- radius, pca_threshold,
- filter_area.z*filter_area.w,
- get_local_id(1)*get_local_size(0) + get_local_id(0));
- }
-}
-
-__kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_restrict weight_image,
- const ccl_global float *ccl_restrict variance_image,
- const ccl_global float *ccl_restrict scale_image,
- ccl_global float *difference_image,
- int w,
- int h,
- int stride,
- int pass_stride,
- int r,
- int channel_offset,
- int frame_offset,
- float a,
- float k_2)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
- kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w,
- weight_image,
- variance_image,
- scale_image,
- difference_image + ofs,
- rect, stride,
- channel_offset,
- frame_offset,
- a, k_2);
- }
-}
-
-__kernel void kernel_ocl_filter_nlm_blur(const ccl_global float *ccl_restrict difference_image,
- ccl_global float *out_image,
- int w,
- int h,
- int stride,
- int pass_stride,
- int r,
- int f)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
- kernel_filter_nlm_blur(co.x, co.y,
- difference_image + ofs,
- out_image + ofs,
- rect, stride, f);
- }
-}
-
-__kernel void kernel_ocl_filter_nlm_calc_weight(const ccl_global float *ccl_restrict difference_image,
- ccl_global float *out_image,
- int w,
- int h,
- int stride,
- int pass_stride,
- int r,
- int f)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
- kernel_filter_nlm_calc_weight(co.x, co.y,
- difference_image + ofs,
- out_image + ofs,
- rect, stride, f);
- }
-}
-
-__kernel void kernel_ocl_filter_nlm_update_output(const ccl_global float *ccl_restrict difference_image,
- const ccl_global float *ccl_restrict image,
- ccl_global float *out_image,
- ccl_global float *accum_image,
- int w,
- int h,
- int stride,
- int pass_stride,
- int channel_offset,
- int r,
- int f)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
- kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w,
- difference_image + ofs,
- image,
- out_image,
- accum_image,
- rect,
- channel_offset,
- stride, f);
- }
-}
-
-__kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *out_image,
- const ccl_global float *ccl_restrict accum_image,
- int w,
- int h,
- int stride)
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
- if(x < w && y < h) {
- kernel_filter_nlm_normalize(x, y, out_image, accum_image, stride);
- }
-}
-
-__kernel void kernel_ocl_filter_nlm_construct_gramian(int t,
- const ccl_global float *ccl_restrict difference_image,
- const ccl_global float *ccl_restrict buffer,
- const ccl_global float *ccl_restrict transform,
- ccl_global int *rank,
- ccl_global float *XtWX,
- ccl_global float3 *XtWY,
- int4 filter_window,
- int w,
- int h,
- int stride,
- int pass_stride,
- int r,
- int f,
- int frame_offset,
- char use_time)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) {
- kernel_filter_nlm_construct_gramian(co.x, co.y,
- co.z, co.w,
- t,
- difference_image + ofs,
- buffer,
- transform, rank,
- XtWX, XtWY,
- rect, filter_window,
- stride, f,
- pass_stride,
- frame_offset,
- use_time,
- get_local_id(1)*get_local_size(0) + get_local_id(0));
- }
-}
-
-__kernel void kernel_ocl_filter_finalize(ccl_global float *buffer,
- ccl_global int *rank,
- ccl_global float *XtWX,
- ccl_global float3 *XtWY,
- int4 filter_area,
- int4 buffer_params,
- int sample)
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
- if(x < filter_area.z && y < filter_area.w) {
- int storage_ofs = y*filter_area.z+x;
- rank += storage_ofs;
- XtWX += storage_ofs;
- XtWY += storage_ofs;
- kernel_filter_finalize(x, y, buffer, rank,
- filter_area.z*filter_area.w,
- XtWX, XtWY,
- buffer_params, sample);
- }
-}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl
deleted file mode 100644
index ebdb99d4730..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright 2019 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_adaptive_adjust_samples.h"
-
-#define KERNEL_NAME adaptive_adjust_samples
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl
deleted file mode 100644
index 76d82d4184e..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright 2019 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_adaptive_filter_x.h"
-
-#define KERNEL_NAME adaptive_filter_x
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl
deleted file mode 100644
index 1e6d15ba0f2..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright 2019 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_adaptive_filter_y.h"
-
-#define KERNEL_NAME adaptive_filter_y
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl
deleted file mode 100644
index 51de0059667..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright 2019 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_adaptive_stopping.h"
-
-#define KERNEL_NAME adaptive_stopping
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_background.cl b/intern/cycles/kernel/kernels/opencl/kernel_background.cl
deleted file mode 100644
index 0e600676e82..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_background.cl
+++ /dev/null
@@ -1,35 +0,0 @@
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/kernel_math.h"
-#include "kernel/kernel_types.h"
-#include "kernel/kernel_globals.h"
-#include "kernel/kernel_color.h"
-#include "kernel/kernels/opencl/kernel_opencl_image.h"
-
-#include "kernel/kernel_path.h"
-#include "kernel/kernel_path_branched.h"
-
-#include "kernel/kernel_bake.h"
-
-__kernel void kernel_ocl_background(
- ccl_constant KernelData *data,
- ccl_global uint4 *input,
- ccl_global float4 *output,
-
- KERNEL_BUFFER_PARAMS,
-
- int type, int sx, int sw, int offset, int sample)
-{
- KernelGlobals kglobals, *kg = &kglobals;
-
- kg->data = data;
-
- kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
- kernel_set_buffer_info(kg);
-
- int x = sx + ccl_global_id(0);
-
- if(x < sx + sw) {
- kernel_background_evaluate(kg, input, output, x);
- }
-}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_bake.cl b/intern/cycles/kernel/kernels/opencl/kernel_bake.cl
deleted file mode 100644
index 7b81e387467..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_bake.cl
+++ /dev/null
@@ -1,36 +0,0 @@
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/kernel_math.h"
-#include "kernel/kernel_types.h"
-#include "kernel/kernel_globals.h"
-#include "kernel/kernel_color.h"
-#include "kernel/kernels/opencl/kernel_opencl_image.h"
-
-#include "kernel/kernel_path.h"
-#include "kernel/kernel_path_branched.h"
-
-#include "kernel/kernel_bake.h"
-
-__kernel void kernel_ocl_bake(
- ccl_constant KernelData *data,
- ccl_global float *buffer,
-
- KERNEL_BUFFER_PARAMS,
-
- int sx, int sy, int sw, int sh, int offset, int stride, int sample)
-{
- KernelGlobals kglobals, *kg = &kglobals;
-
- kg->data = data;
-
- kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
- kernel_set_buffer_info(kg);
-
- int x = sx + ccl_global_id(0);
- int y = sy + ccl_global_id(1);
-
- if(x < sx + sw && y < sy + sh) {
-#ifndef __NO_BAKING__
- kernel_bake_evaluate(kg, buffer, sample, x, y, offset, stride);
-#endif
- }
-}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_base.cl b/intern/cycles/kernel/kernels/opencl/kernel_base.cl
deleted file mode 100644
index 1c2d89e8a92..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_base.cl
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* OpenCL base kernels entry points */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/kernel_types.h"
-#include "kernel/kernel_globals.h"
-
-#include "kernel/kernel_film.h"
-
-
-__kernel void kernel_ocl_convert_to_byte(
- ccl_constant KernelData *data,
- ccl_global uchar4 *rgba,
- ccl_global float *buffer,
-
- KERNEL_BUFFER_PARAMS,
-
- float sample_scale,
- int sx, int sy, int sw, int sh, int offset, int stride)
-{
- KernelGlobals kglobals, *kg = &kglobals;
-
- kg->data = data;
-
- kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
- kernel_set_buffer_info(kg);
-
- int x = sx + ccl_global_id(0);
- int y = sy + ccl_global_id(1);
-
- if(x < sx + sw && y < sy + sh)
- kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
-}
-
-__kernel void kernel_ocl_convert_to_half_float(
- ccl_constant KernelData *data,
- ccl_global uchar4 *rgba,
- ccl_global float *buffer,
-
- KERNEL_BUFFER_PARAMS,
-
- float sample_scale,
- int sx, int sy, int sw, int sh, int offset, int stride)
-{
- KernelGlobals kglobals, *kg = &kglobals;
-
- kg->data = data;
-
- kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
- kernel_set_buffer_info(kg);
-
- int x = sx + ccl_global_id(0);
- int y = sy + ccl_global_id(1);
-
- if(x < sx + sw && y < sy + sh)
- kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
-}
-
-__kernel void kernel_ocl_zero_buffer(ccl_global float4 *buffer, uint64_t size, uint64_t offset)
-{
- size_t i = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0);
-
- if(i < size / sizeof(float4)) {
- buffer[i+offset/sizeof(float4)] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
- }
- else if(i == size / sizeof(float4)) {
- ccl_global uchar *b = (ccl_global uchar*)&buffer[i+offset/sizeof(float4)];
-
- for(i = 0; i < size % sizeof(float4); i++) {
- *(b++) = 0;
- }
- }
-}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
deleted file mode 100644
index 7125348a49f..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_data_init.h"
-
-__kernel void kernel_ocl_path_trace_data_init(
- ccl_global char *kg,
- ccl_constant KernelData *data,
- ccl_global void *split_data_buffer,
- int num_elements,
- ccl_global char *ray_state,
- KERNEL_BUFFER_PARAMS,
- int start_sample,
- int end_sample,
- int sx, int sy, int sw, int sh, int offset, int stride,
- ccl_global int *Queue_index, /* Tracks the number of elements in queues */
- int queuesize, /* size (capacity) of the queue */
- ccl_global char *use_queues_flag, /* flag to decide if scene-intersect kernel should use queues to fetch ray index */
- ccl_global unsigned int *work_pool_wgs, /* Work pool for each work group */
- unsigned int num_samples, /* Total number of samples per pixel */
- ccl_global float *buffer)
-{
- kernel_data_init((KernelGlobals*)kg,
- data,
- split_data_buffer,
- num_elements,
- ray_state,
- KERNEL_BUFFER_ARGS,
- start_sample,
- end_sample,
- sx, sy, sw, sh, offset, stride,
- Queue_index,
- queuesize,
- use_queues_flag,
- work_pool_wgs,
- num_samples,
- buffer);
-}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_displace.cl b/intern/cycles/kernel/kernels/opencl/kernel_displace.cl
deleted file mode 100644
index 76cc36971f5..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_displace.cl
+++ /dev/null
@@ -1,36 +0,0 @@
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/kernel_math.h"
-#include "kernel/kernel_types.h"
-#include "kernel/kernel_globals.h"
-#include "kernel/kernel_color.h"
-#include "kernel/kernels/opencl/kernel_opencl_image.h"
-
-#include "kernel/kernel_path.h"
-#include "kernel/kernel_path_branched.h"
-
-#include "kernel/kernel_bake.h"
-
-__kernel void kernel_ocl_displace(
- ccl_constant KernelData *data,
- ccl_global uint4 *input,
- ccl_global float4 *output,
-
- KERNEL_BUFFER_PARAMS,
-
- int type, int sx, int sw, int offset, int sample)
-{
- KernelGlobals kglobals, *kg = &kglobals;
-
- kg->data = data;
-
- kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
- kernel_set_buffer_info(kg);
-
- int x = sx + ccl_global_id(0);
-
- if(x < sx + sw) {
- kernel_displace_evaluate(kg, input, output, x);
- }
-}
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
deleted file mode 100644
index 8b1332bf013..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_next_iteration_setup.h"
-
-#define KERNEL_NAME next_iteration_setup
-#define LOCALS_TYPE unsigned int
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-#undef LOCALS_TYPE
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
deleted file mode 100644
index bb6b8a40e8e..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
+++ /dev/null
@@ -1,358 +0,0 @@
-/*
- * Copyright 2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_NANOVDB
-/* Data type to replace `double` used in the NanoVDB headers. Cycles doesn't need doubles, and it
- * is safer and more portable to never use the double datatype on the GPU.
- * Use a special structure, so that the following is true:
- * - No unnoticed implicit casts or mathematical operations are used on a scalar 64-bit type
- *   (which rules out tricks like using `uint64_t` as a drop-in replacement for double).
- * - Padding rules match those of `double` exactly
- *   (which rules out an array of `uint8_t`). */
-typedef struct ccl_vdb_double_t {
- uint64_t i;
-} ccl_vdb_double_t;
-
-# define double ccl_vdb_double_t
-# include "nanovdb/CNanoVDB.h"
-# undef double
-#endif
-
-/* For OpenCL we do manual lookup and interpolation. */
-
-ccl_device_inline ccl_global TextureInfo *kernel_tex_info(KernelGlobals *kg, uint id)
-{
- const uint tex_offset = id
-#define KERNEL_TEX(type, name) +1
-#include "kernel/kernel_textures.h"
- ;
-
- return &((ccl_global TextureInfo *)kg->buffers[0])[tex_offset];
-}
-
-#define tex_fetch(type, info, index) \
- ((ccl_global type *)(kg->buffers[info->cl_buffer] + info->data))[(index)]
-
-ccl_device_inline int svm_image_texture_wrap_periodic(int x, int width)
-{
- x %= width;
- if (x < 0)
- x += width;
- return x;
-}
-
-ccl_device_inline int svm_image_texture_wrap_clamp(int x, int width)
-{
- return clamp(x, 0, width - 1);
-}
-
-ccl_device_inline float4 svm_image_texture_read(
- KernelGlobals *kg, const ccl_global TextureInfo *info, void *acc, int x, int y, int z)
-{
- const int data_offset = x + info->width * y + info->width * info->height * z;
- const int texture_type = info->data_type;
-
- /* Float4 */
- if (texture_type == IMAGE_DATA_TYPE_FLOAT4) {
- return tex_fetch(float4, info, data_offset);
- }
- /* Byte4 */
- else if (texture_type == IMAGE_DATA_TYPE_BYTE4) {
- uchar4 r = tex_fetch(uchar4, info, data_offset);
- float f = 1.0f / 255.0f;
- return make_float4(r.x * f, r.y * f, r.z * f, r.w * f);
- }
- /* Ushort4 */
- else if (texture_type == IMAGE_DATA_TYPE_USHORT4) {
- ushort4 r = tex_fetch(ushort4, info, data_offset);
- float f = 1.0f / 65535.f;
- return make_float4(r.x * f, r.y * f, r.z * f, r.w * f);
- }
- /* Float */
- else if (texture_type == IMAGE_DATA_TYPE_FLOAT) {
- float f = tex_fetch(float, info, data_offset);
- return make_float4(f, f, f, 1.0f);
- }
- /* UShort */
- else if (texture_type == IMAGE_DATA_TYPE_USHORT) {
- ushort r = tex_fetch(ushort, info, data_offset);
- float f = r * (1.0f / 65535.0f);
- return make_float4(f, f, f, 1.0f);
- }
-#ifdef WITH_NANOVDB
- /* NanoVDB Float */
- else if (texture_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT) {
- cnanovdb_coord coord;
- coord.mVec[0] = x;
- coord.mVec[1] = y;
- coord.mVec[2] = z;
- float f = cnanovdb_readaccessor_getValueF((cnanovdb_readaccessor *)acc, &coord);
- return make_float4(f, f, f, 1.0f);
- }
- /* NanoVDB Float3 */
- else if (texture_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
- cnanovdb_coord coord;
- coord.mVec[0] = x;
- coord.mVec[1] = y;
- coord.mVec[2] = z;
- cnanovdb_Vec3F f = cnanovdb_readaccessor_getValueF3((cnanovdb_readaccessor *)acc, &coord);
- return make_float4(f.mVec[0], f.mVec[1], f.mVec[2], 1.0f);
- }
-#endif
-#ifdef __KERNEL_CL_KHR_FP16__
- /* Half and Half4 are optional in OpenCL */
- else if (texture_type == IMAGE_DATA_TYPE_HALF) {
- float f = tex_fetch(half, info, data_offset);
- return make_float4(f, f, f, 1.0f);
- }
- else if (texture_type == IMAGE_DATA_TYPE_HALF4) {
- half4 r = tex_fetch(half4, info, data_offset);
- return make_float4(r.x, r.y, r.z, r.w);
- }
-#endif
- /* Byte */
- else {
- uchar r = tex_fetch(uchar, info, data_offset);
- float f = r * (1.0f / 255.0f);
- return make_float4(f, f, f, 1.0f);
- }
-}
-
-ccl_device_inline float4
-svm_image_texture_read_2d(KernelGlobals *kg, int id, void *acc, int x, int y)
-{
- const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
-
-#ifdef WITH_NANOVDB
- if (info->data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
- info->data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
-#endif
- /* Wrap */
- if (info->extension == EXTENSION_REPEAT) {
- x = svm_image_texture_wrap_periodic(x, info->width);
- y = svm_image_texture_wrap_periodic(y, info->height);
- }
- else {
- x = svm_image_texture_wrap_clamp(x, info->width);
- y = svm_image_texture_wrap_clamp(y, info->height);
- }
-#ifdef WITH_NANOVDB
- }
-#endif
-
- return svm_image_texture_read(kg, info, acc, x, y, 0);
-}
-
-ccl_device_inline float4
-svm_image_texture_read_3d(KernelGlobals *kg, int id, void *acc, int x, int y, int z)
-{
- const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
-
-#ifdef WITH_NANOVDB
- if (info->data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
- info->data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
-#endif
- /* Wrap */
- if (info->extension == EXTENSION_REPEAT) {
- x = svm_image_texture_wrap_periodic(x, info->width);
- y = svm_image_texture_wrap_periodic(y, info->height);
- z = svm_image_texture_wrap_periodic(z, info->depth);
- }
- else {
- x = svm_image_texture_wrap_clamp(x, info->width);
- y = svm_image_texture_wrap_clamp(y, info->height);
- z = svm_image_texture_wrap_clamp(z, info->depth);
- }
-#ifdef WITH_NANOVDB
- }
-#endif
-
- return svm_image_texture_read(kg, info, acc, x, y, z);
-}
-
-ccl_device_inline float svm_image_texture_frac(float x, int *ix)
-{
- int i = float_to_int(x) - ((x < 0.0f) ? 1 : 0);
- *ix = i;
- return x - (float)i;
-}
-
-#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \
- { \
- u[0] = (((-1.0f / 6.0f) * t + 0.5f) * t - 0.5f) * t + (1.0f / 6.0f); \
- u[1] = ((0.5f * t - 1.0f) * t) * t + (2.0f / 3.0f); \
- u[2] = ((-0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f / 6.0f); \
- u[3] = (1.0f / 6.0f) * t * t * t; \
- } \
- (void)0
-
-ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y)
-{
- const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
-
- if (info->extension == EXTENSION_CLIP) {
- if (x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
- return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
- }
- }
-
- if (info->interpolation == INTERPOLATION_CLOSEST) {
- /* Closest interpolation. */
- int ix, iy;
- svm_image_texture_frac(x * info->width, &ix);
- svm_image_texture_frac(y * info->height, &iy);
-
- return svm_image_texture_read_2d(kg, id, NULL, ix, iy);
- }
- else if (info->interpolation == INTERPOLATION_LINEAR) {
- /* Bilinear interpolation. */
- int ix, iy;
- float tx = svm_image_texture_frac(x * info->width - 0.5f, &ix);
- float ty = svm_image_texture_frac(y * info->height - 0.5f, &iy);
-
- float4 r;
- r = (1.0f - ty) * (1.0f - tx) * svm_image_texture_read_2d(kg, id, NULL, ix, iy);
- r += (1.0f - ty) * tx * svm_image_texture_read_2d(kg, id, NULL, ix + 1, iy);
- r += ty * (1.0f - tx) * svm_image_texture_read_2d(kg, id, NULL, ix, iy + 1);
- r += ty * tx * svm_image_texture_read_2d(kg, id, NULL, ix + 1, iy + 1);
- return r;
- }
- else {
- /* Bicubic interpolation. */
- int ix, iy;
- float tx = svm_image_texture_frac(x * info->width - 0.5f, &ix);
- float ty = svm_image_texture_frac(y * info->height - 0.5f, &iy);
-
- float u[4], v[4];
- SET_CUBIC_SPLINE_WEIGHTS(u, tx);
- SET_CUBIC_SPLINE_WEIGHTS(v, ty);
-
- float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-
- for (int y = 0; y < 4; y++) {
- for (int x = 0; x < 4; x++) {
- float weight = u[x] * v[y];
- r += weight * svm_image_texture_read_2d(kg, id, NULL, ix + x - 1, iy + y - 1);
- }
- }
- return r;
- }
-}
-
-ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float3 P, int interp)
-{
- const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
-
- if (info->use_transform_3d) {
- Transform tfm = info->transform_3d;
- P = transform_point(&tfm, P);
- }
-
- float x = P.x;
- float y = P.y;
- float z = P.z;
-
- uint interpolation = (interp == INTERPOLATION_NONE) ? info->interpolation : interp;
-
-#ifdef WITH_NANOVDB
- cnanovdb_readaccessor acc;
- if (info->data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT ||
- info->data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
- ccl_global cnanovdb_griddata *grid =
- (ccl_global cnanovdb_griddata *)(kg->buffers[info->cl_buffer] + info->data);
- cnanovdb_readaccessor_init(&acc, cnanovdb_treedata_rootF(cnanovdb_griddata_tree(grid)));
- }
- else {
- if (info->extension == EXTENSION_CLIP) {
- if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) {
- return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
- }
- }
-
- x *= info->width;
- y *= info->height;
- z *= info->depth;
- }
-# define NANOVDB_ACCESS_POINTER &acc
-#else
-# define NANOVDB_ACCESS_POINTER NULL
-#endif
-
- if (interpolation == INTERPOLATION_CLOSEST) {
- /* Closest interpolation. */
- int ix, iy, iz;
- svm_image_texture_frac(x, &ix);
- svm_image_texture_frac(y, &iy);
- svm_image_texture_frac(z, &iz);
-
- return svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix, iy, iz);
- }
- else if (interpolation == INTERPOLATION_LINEAR) {
- /* Trilinear interpolation. */
- int ix, iy, iz;
- float tx = svm_image_texture_frac(x - 0.5f, &ix);
- float ty = svm_image_texture_frac(y - 0.5f, &iy);
- float tz = svm_image_texture_frac(z - 0.5f, &iz);
-
- float4 r;
- r = (1.0f - tz) * (1.0f - ty) * (1.0f - tx) *
- svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix, iy, iz);
- r += (1.0f - tz) * (1.0f - ty) * tx *
- svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix + 1, iy, iz);
- r += (1.0f - tz) * ty * (1.0f - tx) *
- svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix, iy + 1, iz);
- r += (1.0f - tz) * ty * tx *
- svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix + 1, iy + 1, iz);
-
- r += tz * (1.0f - ty) * (1.0f - tx) *
- svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix, iy, iz + 1);
- r += tz * (1.0f - ty) * tx *
- svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix + 1, iy, iz + 1);
- r += tz * ty * (1.0f - tx) *
- svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix, iy + 1, iz + 1);
- r += tz * ty * tx *
- svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix + 1, iy + 1, iz + 1);
- return r;
- }
- else {
- /* Tricubic interpolation. */
- int ix, iy, iz;
- float tx = svm_image_texture_frac(x - 0.5f, &ix);
- float ty = svm_image_texture_frac(y - 0.5f, &iy);
- float tz = svm_image_texture_frac(z - 0.5f, &iz);
-
- float u[4], v[4], w[4];
- SET_CUBIC_SPLINE_WEIGHTS(u, tx);
- SET_CUBIC_SPLINE_WEIGHTS(v, ty);
- SET_CUBIC_SPLINE_WEIGHTS(w, tz);
-
- float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-
- for (int z = 0; z < 4; z++) {
- for (int y = 0; y < 4; y++) {
- for (int x = 0; x < 4; x++) {
- float weight = u[x] * v[y] * w[z];
- r += weight * svm_image_texture_read_3d(
- kg, id, NANOVDB_ACCESS_POINTER, ix + x - 1, iy + y - 1, iz + z - 1);
- }
- }
- }
- return r;
- }
-#undef NANOVDB_ACCESS_POINTER
-}
-
-#undef SET_CUBIC_SPLINE_WEIGHTS
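A small worked example of the fractional-coordinate helper used by the filtered lookups above; the input values are chosen for illustration, and it assumes float_to_int() truncates toward zero as in the Cycles utility headers.

/* Illustrative check of svm_image_texture_frac(): */
static void svm_image_texture_frac_example(void)
{
  int ix;
  float tx;

  tx = svm_image_texture_frac(2.25f, &ix);  /* ix ==  2, tx == 0.25f */
  tx = svm_image_texture_frac(-0.25f, &ix); /* ix == -1, tx == 0.75f */

  /* For bilinear filtering at x = 2.25, taps ix and ix + 1 are blended with
   * weights (1.0f - tx) == 0.75f and tx == 0.25f, as in kernel_tex_image_interp(). */
  (void)tx;
  (void)ix;
}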
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
deleted file mode 100644
index 68ee6f1d536..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_queue_enqueue.h"
-
-#define KERNEL_NAME queue_enqueue
-#define LOCALS_TYPE QueueEnqueueLocals
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-#undef LOCALS_TYPE
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
deleted file mode 100644
index 10d09377ba9..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_scene_intersect.h"
-
-#define KERNEL_NAME scene_intersect
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
deleted file mode 100644
index 40eaa561863..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_shader_eval.h"
-
-#define KERNEL_NAME shader_eval
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl
deleted file mode 100644
index 8c36100f762..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_shader_setup.h"
-
-#define KERNEL_NAME shader_setup
-#define LOCALS_TYPE unsigned int
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-#undef LOCALS_TYPE
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl
deleted file mode 100644
index bcacaa4a054..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_shader_sort.h"
-
-__attribute__((reqd_work_group_size(64, 1, 1)))
-#define KERNEL_NAME shader_sort
-#define LOCALS_TYPE ShaderSortLocals
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-#undef LOCALS_TYPE
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl
deleted file mode 100644
index 8de250a375c..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_shadow_blocked_ao.h"
-
-#define KERNEL_NAME shadow_blocked_ao
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl
deleted file mode 100644
index 29da77022ed..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_shadow_blocked_dl.h"
-
-#define KERNEL_NAME shadow_blocked_dl
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl b/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl
deleted file mode 100644
index c3b7b09460a..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h" // PRECOMPILED
-#include "kernel/split/kernel_split_common.h" // PRECOMPILED
-
-#include "kernel/kernels/opencl/kernel_data_init.cl"
-#include "kernel/kernels/opencl/kernel_path_init.cl"
-#include "kernel/kernels/opencl/kernel_state_buffer_size.cl"
-#include "kernel/kernels/opencl/kernel_scene_intersect.cl"
-#include "kernel/kernels/opencl/kernel_queue_enqueue.cl"
-#include "kernel/kernels/opencl/kernel_shader_setup.cl"
-#include "kernel/kernels/opencl/kernel_shader_sort.cl"
-#include "kernel/kernels/opencl/kernel_enqueue_inactive.cl"
-#include "kernel/kernels/opencl/kernel_next_iteration_setup.cl"
-#include "kernel/kernels/opencl/kernel_indirect_subsurface.cl"
-#include "kernel/kernels/opencl/kernel_buffer_update.cl"
-#include "kernel/kernels/opencl/kernel_adaptive_stopping.cl"
-#include "kernel/kernels/opencl/kernel_adaptive_filter_x.cl"
-#include "kernel/kernels/opencl/kernel_adaptive_filter_y.cl"
-#include "kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl"
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split_function.h b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h
deleted file mode 100644
index e123b4cd6ec..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_split_function.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#define KERNEL_NAME_JOIN(a, b) a##_##b
-#define KERNEL_NAME_EVAL(a, b) KERNEL_NAME_JOIN(a, b)
-
-__kernel void KERNEL_NAME_EVAL(kernel_ocl_path_trace,
- KERNEL_NAME)(ccl_global char *kg_global,
- ccl_constant KernelData *data,
-
- ccl_global void *split_data_buffer,
- ccl_global char *ray_state,
-
- KERNEL_BUFFER_PARAMS,
-
- ccl_global int *queue_index,
- ccl_global char *use_queues_flag,
- ccl_global unsigned int *work_pools,
- ccl_global float *buffer)
-{
-#ifdef LOCALS_TYPE
- ccl_local LOCALS_TYPE locals;
-#endif
-
- KernelGlobals *kg = (KernelGlobals *)kg_global;
-
- if (ccl_local_id(0) + ccl_local_id(1) == 0) {
- kg->data = data;
-
- kernel_split_params.queue_index = queue_index;
- kernel_split_params.use_queues_flag = use_queues_flag;
- kernel_split_params.work_pools = work_pools;
- kernel_split_params.tile.buffer = buffer;
-
- split_data_init(kg,
- &kernel_split_state,
- ccl_global_size(0) * ccl_global_size(1),
- split_data_buffer,
- ray_state);
- }
-
- kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
-
- KERNEL_NAME_EVAL(kernel, KERNEL_NAME)
- (kg
-#ifdef LOCALS_TYPE
- ,
- &locals
-#endif
- );
-}
-
-#undef KERNEL_NAME_JOIN
-#undef KERNEL_NAME_EVAL
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl
deleted file mode 100644
index 2b3be38df84..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_subsurface_scatter.h"
-
-#define KERNEL_NAME subsurface_scatter
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-
diff --git a/intern/cycles/kernel/osl/background.cpp b/intern/cycles/kernel/osl/background.cpp
index 3f9de5ab33d..8e497986dcc 100644
--- a/intern/cycles/kernel/osl/background.cpp
+++ b/intern/cycles/kernel/osl/background.cpp
@@ -37,7 +37,7 @@
#include "kernel/osl/osl_closures.h"
// clang-format off
-#include "kernel/kernel_compat_cpu.h"
+#include "kernel/device/cpu/compat.h"
#include "kernel/closure/alloc.h"
#include "kernel/closure/emissive.h"
// clang-format on
diff --git a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
index 76a2e41abfa..a2f9d3f759a 100644
--- a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
+++ b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
@@ -34,7 +34,7 @@
#include <OSL/genclosure.h>
-#include "kernel/kernel_compat_cpu.h"
+#include "kernel/device/cpu/compat.h"
#include "kernel/osl/osl_closures.h"
// clang-format off
diff --git a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
index b78dc8a3a67..812c3b6e71b 100644
--- a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
+++ b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
@@ -34,7 +34,7 @@
#include <OSL/genclosure.h>
-#include "kernel/kernel_compat_cpu.h"
+#include "kernel/device/cpu/compat.h"
#include "kernel/osl/osl_closures.h"
// clang-format off
diff --git a/intern/cycles/kernel/osl/emissive.cpp b/intern/cycles/kernel/osl/emissive.cpp
index d656723bac2..80dfbee879e 100644
--- a/intern/cycles/kernel/osl/emissive.cpp
+++ b/intern/cycles/kernel/osl/emissive.cpp
@@ -37,7 +37,7 @@
#include "kernel/osl/osl_closures.h"
// clang-format off
-#include "kernel/kernel_compat_cpu.h"
+#include "kernel/device/cpu/compat.h"
#include "kernel/kernel_types.h"
#include "kernel/closure/alloc.h"
#include "kernel/closure/emissive.h"
diff --git a/intern/cycles/kernel/osl/osl_bssrdf.cpp b/intern/cycles/kernel/osl/osl_bssrdf.cpp
index c5ca8616fbd..5d968ed85e0 100644
--- a/intern/cycles/kernel/osl/osl_bssrdf.cpp
+++ b/intern/cycles/kernel/osl/osl_bssrdf.cpp
@@ -32,7 +32,7 @@
#include <OSL/genclosure.h>
-#include "kernel/kernel_compat_cpu.h"
+#include "kernel/device/cpu/compat.h"
#include "kernel/osl/osl_closures.h"
// clang-format off
@@ -50,45 +50,30 @@ CCL_NAMESPACE_BEGIN
using namespace OSL;
-static ustring u_cubic("cubic");
-static ustring u_gaussian("gaussian");
-static ustring u_burley("burley");
-static ustring u_principled("principled");
+static ustring u_random_walk_fixed_radius("random_walk_fixed_radius");
static ustring u_random_walk("random_walk");
-static ustring u_principled_random_walk("principled_random_walk");
class CBSSRDFClosure : public CClosurePrimitive {
public:
Bssrdf params;
+ float ior;
ustring method;
CBSSRDFClosure()
{
- params.texture_blur = 0.0f;
- params.sharpness = 0.0f;
- params.roughness = 0.0f;
+ params.roughness = FLT_MAX;
+ params.anisotropy = 1.0f;
+ ior = 1.4f;
}
void setup(ShaderData *sd, int path_flag, float3 weight)
{
- if (method == u_cubic) {
- alloc(sd, path_flag, weight, CLOSURE_BSSRDF_CUBIC_ID);
- }
- else if (method == u_gaussian) {
- alloc(sd, path_flag, weight, CLOSURE_BSSRDF_GAUSSIAN_ID);
- }
- else if (method == u_burley) {
- alloc(sd, path_flag, weight, CLOSURE_BSSRDF_BURLEY_ID);
- }
- else if (method == u_principled) {
- alloc(sd, path_flag, weight, CLOSURE_BSSRDF_PRINCIPLED_ID);
+ if (method == u_random_walk_fixed_radius) {
+ alloc(sd, path_flag, weight, CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID);
}
else if (method == u_random_walk) {
alloc(sd, path_flag, weight, CLOSURE_BSSRDF_RANDOM_WALK_ID);
}
- else if (method == u_principled_random_walk) {
- alloc(sd, path_flag, weight, CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID);
- }
}
void alloc(ShaderData *sd, int path_flag, float3 weight, ClosureType type)
@@ -106,11 +91,10 @@ class CBSSRDFClosure : public CClosurePrimitive {
/* create one closure per color channel */
bssrdf->radius = params.radius;
bssrdf->albedo = params.albedo;
- bssrdf->texture_blur = params.texture_blur;
- bssrdf->sharpness = params.sharpness;
bssrdf->N = params.N;
bssrdf->roughness = params.roughness;
- sd->flag |= bssrdf_setup(sd, bssrdf, (ClosureType)type);
+ bssrdf->anisotropy = clamp(params.anisotropy, 0.0f, 0.9f);
+ sd->flag |= bssrdf_setup(sd, bssrdf, (ClosureType)type, clamp(ior, 1.01f, 3.8f));
}
}
};
@@ -122,9 +106,9 @@ ClosureParam *closure_bssrdf_params()
CLOSURE_FLOAT3_PARAM(CBSSRDFClosure, params.N),
CLOSURE_FLOAT3_PARAM(CBSSRDFClosure, params.radius),
CLOSURE_FLOAT3_PARAM(CBSSRDFClosure, params.albedo),
- CLOSURE_FLOAT_KEYPARAM(CBSSRDFClosure, params.texture_blur, "texture_blur"),
- CLOSURE_FLOAT_KEYPARAM(CBSSRDFClosure, params.sharpness, "sharpness"),
CLOSURE_FLOAT_KEYPARAM(CBSSRDFClosure, params.roughness, "roughness"),
+ CLOSURE_FLOAT_KEYPARAM(CBSSRDFClosure, ior, "ior"),
+ CLOSURE_FLOAT_KEYPARAM(CBSSRDFClosure, params.anisotropy, "anisotropy"),
CLOSURE_STRING_KEYPARAM(CBSSRDFClosure, label, "label"),
CLOSURE_FINISH_PARAM(CBSSRDFClosure)};
return params;
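
A minimal OSL sketch of the updated bssrdf() closure interface registered above: the method string selects between the two remaining closure IDs, and the roughness/ior/anisotropy keywords map onto the parameter table (the kernel clamps anisotropy to [0, 0.9] and IOR to [1.01, 3.8] in the setup code shown in this hunk). The shader name and values below are illustrative only, not taken from the tree.

#include "stdcycles.h"

shader sss_closure_example(normal Normal = N,
                           color Albedo = color(0.8, 0.8, 0.8),
                           vector Radius = vector(0.1, 0.1, 0.1),
                           output closure color BSSRDF = 0)
{
  /* "random_walk_fixed_radius" or "random_walk"; any other string allocates no closure. */
  BSSRDF = Albedo * bssrdf("random_walk",
                           Normal,
                           Radius,
                           Albedo,
                           "roughness", 0.0,
                           "ior", 1.4,
                           "anisotropy", 0.0);
}
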
diff --git a/intern/cycles/kernel/osl/osl_closures.cpp b/intern/cycles/kernel/osl/osl_closures.cpp
index 7ee467a46dd..e814fcca246 100644
--- a/intern/cycles/kernel/osl/osl_closures.cpp
+++ b/intern/cycles/kernel/osl/osl_closures.cpp
@@ -40,10 +40,10 @@
#include "util/util_param.h"
// clang-format off
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/globals.h"
+
#include "kernel/kernel_types.h"
-#include "kernel/kernel_compat_cpu.h"
-#include "kernel/split/kernel_split_data_types.h"
-#include "kernel/kernel_globals.h"
#include "kernel/kernel_montecarlo.h"
#include "kernel/kernel_random.h"
@@ -500,7 +500,7 @@ bool CBSDFClosure::skip(const ShaderData *sd, int path_flag, int scattering)
{
/* caustic options */
if ((scattering & LABEL_GLOSSY) && (path_flag & PATH_RAY_DIFFUSE)) {
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
if ((!kernel_data.integrator.caustics_reflective && (scattering & LABEL_REFLECT)) ||
(!kernel_data.integrator.caustics_refractive && (scattering & LABEL_TRANSMIT))) {
diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp
index 2b7c21d0bc4..396f42080e4 100644
--- a/intern/cycles/kernel/osl/osl_services.cpp
+++ b/intern/cycles/kernel/osl/osl_services.cpp
@@ -40,22 +40,22 @@
#include "util/util_string.h"
// clang-format off
-#include "kernel/kernel_compat_cpu.h"
-#include "kernel/split/kernel_split_data_types.h"
-#include "kernel/kernel_globals.h"
-#include "kernel/kernel_color.h"
-#include "kernel/kernel_random.h"
-#include "kernel/kernel_write_passes.h"
-#include "kernel/kernel_projection.h"
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/globals.h"
+#include "kernel/device/cpu/image.h"
+
#include "kernel/kernel_differential.h"
-#include "kernel/kernel_montecarlo.h"
-#include "kernel/kernel_camera.h"
-#include "kernel/kernels/cpu/kernel_cpu_image.h"
+
+#include "kernel/integrator/integrator_state.h"
+#include "kernel/integrator/integrator_state_flow.h"
+
#include "kernel/geom/geom.h"
#include "kernel/bvh/bvh.h"
+#include "kernel/kernel_color.h"
+#include "kernel/kernel_camera.h"
+#include "kernel/kernel_path_state.h"
#include "kernel/kernel_projection.h"
-#include "kernel/kernel_accumulate.h"
#include "kernel/kernel_shader.h"
// clang-format on
@@ -147,7 +147,7 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg,
* a concept of shader space, so we just use object space for both. */
if (xform) {
const ShaderData *sd = (const ShaderData *)xform;
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
int object = sd->object;
if (object != OBJECT_NONE) {
@@ -155,18 +155,19 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg,
Transform tfm;
if (time == sd->time)
- tfm = sd->ob_tfm;
+ tfm = object_get_transform(kg, sd);
else
tfm = object_fetch_transform_motion_test(kg, object, time, NULL);
#else
- Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
+ const Transform tfm = object_get_transform(kg, sd);
#endif
copy_matrix(result, tfm);
return true;
}
else if (sd->type == PRIMITIVE_LAMP) {
- copy_matrix(result, sd->ob_tfm);
+ const Transform tfm = lamp_fetch_transform(kg, sd->lamp, false);
+ copy_matrix(result, tfm);
return true;
}
@@ -184,7 +185,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg,
* a concept of shader space, so we just use object space for both. */
if (xform) {
const ShaderData *sd = (const ShaderData *)xform;
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
int object = sd->object;
if (object != OBJECT_NONE) {
@@ -192,18 +193,19 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg,
Transform itfm;
if (time == sd->time)
- itfm = sd->ob_itfm;
+ itfm = object_get_inverse_transform(kg, sd);
else
object_fetch_transform_motion_test(kg, object, time, &itfm);
#else
- Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
+ const Transform itfm = object_get_inverse_transform(kg, sd);
#endif
copy_matrix(result, itfm);
return true;
}
else if (sd->type == PRIMITIVE_LAMP) {
- copy_matrix(result, sd->ob_itfm);
+ const Transform itfm = lamp_fetch_transform(kg, sd->lamp, true);
+ copy_matrix(result, itfm);
return true;
}
@@ -218,7 +220,7 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg,
float time)
{
ShaderData *sd = (ShaderData *)(sg->renderstate);
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
if (from == u_ndc) {
copy_matrix(result, kernel_data.cam.ndctoworld);
@@ -250,7 +252,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg,
float time)
{
ShaderData *sd = (ShaderData *)(sg->renderstate);
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
if (to == u_ndc) {
copy_matrix(result, kernel_data.cam.worldtondc);
@@ -284,21 +286,18 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg,
* a concept of shader space, so we just use object space for both. */
if (xform) {
const ShaderData *sd = (const ShaderData *)xform;
+ const KernelGlobals *kg = sd->osl_globals;
int object = sd->object;
if (object != OBJECT_NONE) {
-#ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_tfm;
-#else
- KernelGlobals *kg = sd->osl_globals;
- Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
-#endif
+ const Transform tfm = object_get_transform(kg, sd);
copy_matrix(result, tfm);
return true;
}
else if (sd->type == PRIMITIVE_LAMP) {
- copy_matrix(result, sd->ob_tfm);
+ const Transform tfm = lamp_fetch_transform(kg, sd->lamp, false);
+ copy_matrix(result, tfm);
return true;
}
@@ -315,21 +314,18 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg,
* a concept of shader space, so we just use object space for both. */
if (xform) {
const ShaderData *sd = (const ShaderData *)xform;
+ const KernelGlobals *kg = sd->osl_globals;
int object = sd->object;
if (object != OBJECT_NONE) {
-#ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_itfm;
-#else
- KernelGlobals *kg = sd->osl_globals;
- Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
-#endif
+ const Transform tfm = object_get_inverse_transform(kg, sd);
copy_matrix(result, tfm);
return true;
}
else if (sd->type == PRIMITIVE_LAMP) {
- copy_matrix(result, sd->ob_itfm);
+ const Transform itfm = lamp_fetch_transform(kg, sd->lamp, true);
+ copy_matrix(result, itfm);
return true;
}
@@ -341,7 +337,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg,
bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring from)
{
ShaderData *sd = (ShaderData *)(sg->renderstate);
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
if (from == u_ndc) {
copy_matrix(result, kernel_data.cam.ndctoworld);
@@ -368,7 +364,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg,
ustring to)
{
ShaderData *sd = (ShaderData *)(sg->renderstate);
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
if (to == u_ndc) {
copy_matrix(result, kernel_data.cam.worldtondc);
@@ -747,7 +743,7 @@ static bool set_attribute_matrix(const Transform &tfm, TypeDesc type, void *val)
return false;
}
-static bool get_primitive_attribute(KernelGlobals *kg,
+static bool get_primitive_attribute(const KernelGlobals *kg,
const ShaderData *sd,
const OSLGlobals::Attribute &attr,
const TypeDesc &type,
@@ -808,7 +804,7 @@ static bool get_primitive_attribute(KernelGlobals *kg,
}
}
-static bool get_mesh_attribute(KernelGlobals *kg,
+static bool get_mesh_attribute(const KernelGlobals *kg,
const ShaderData *sd,
const OSLGlobals::Attribute &attr,
const TypeDesc &type,
@@ -857,8 +853,12 @@ static bool get_object_attribute(const OSLGlobals::Attribute &attr,
}
}
-bool OSLRenderServices::get_object_standard_attribute(
- KernelGlobals *kg, ShaderData *sd, ustring name, TypeDesc type, bool derivatives, void *val)
+bool OSLRenderServices::get_object_standard_attribute(const KernelGlobals *kg,
+ ShaderData *sd,
+ ustring name,
+ TypeDesc type,
+ bool derivatives,
+ void *val)
{
/* todo: turn this into hash table? */
@@ -988,8 +988,12 @@ bool OSLRenderServices::get_object_standard_attribute(
return false;
}
-bool OSLRenderServices::get_background_attribute(
- KernelGlobals *kg, ShaderData *sd, ustring name, TypeDesc type, bool derivatives, void *val)
+bool OSLRenderServices::get_background_attribute(const KernelGlobals *kg,
+ ShaderData *sd,
+ ustring name,
+ TypeDesc type,
+ bool derivatives,
+ void *val)
{
if (name == u_path_ray_length) {
/* Ray Length */
@@ -998,38 +1002,32 @@ bool OSLRenderServices::get_background_attribute(
}
else if (name == u_path_ray_depth) {
/* Ray Depth */
- PathState *state = sd->osl_path_state;
- int f = state->bounce;
+ const IntegratorStateCPU *state = sd->osl_path_state;
+ int f = state->path.bounce;
return set_attribute_int(f, type, derivatives, val);
}
else if (name == u_path_diffuse_depth) {
/* Diffuse Ray Depth */
- PathState *state = sd->osl_path_state;
- int f = state->diffuse_bounce;
+ const IntegratorStateCPU *state = sd->osl_path_state;
+ int f = state->path.diffuse_bounce;
return set_attribute_int(f, type, derivatives, val);
}
else if (name == u_path_glossy_depth) {
/* Glossy Ray Depth */
- PathState *state = sd->osl_path_state;
- int f = state->glossy_bounce;
+ const IntegratorStateCPU *state = sd->osl_path_state;
+ int f = state->path.glossy_bounce;
return set_attribute_int(f, type, derivatives, val);
}
else if (name == u_path_transmission_depth) {
/* Transmission Ray Depth */
- PathState *state = sd->osl_path_state;
- int f = state->transmission_bounce;
+ const IntegratorStateCPU *state = sd->osl_path_state;
+ int f = state->path.transmission_bounce;
return set_attribute_int(f, type, derivatives, val);
}
else if (name == u_path_transparent_depth) {
/* Transparent Ray Depth */
- PathState *state = sd->osl_path_state;
- int f = state->transparent_bounce;
- return set_attribute_int(f, type, derivatives, val);
- }
- else if (name == u_path_transmission_depth) {
- /* Transmission Ray Depth */
- PathState *state = sd->osl_path_state;
- int f = state->transmission_bounce;
+ const IntegratorStateCPU *state = sd->osl_path_state;
+ int f = state->path.transparent_bounce;
return set_attribute_int(f, type, derivatives, val);
}
else if (name == u_ndc) {
@@ -1043,8 +1041,10 @@ bool OSLRenderServices::get_background_attribute(
ndc[0] = camera_world_to_ndc(kg, sd, sd->ray_P);
if (derivatives) {
- ndc[1] = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dx) - ndc[0];
- ndc[2] = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dy) - ndc[0];
+ ndc[1] = camera_world_to_ndc(kg, sd, sd->ray_P + make_float3(sd->ray_dP, 0.0f, 0.0f)) -
+ ndc[0];
+ ndc[2] = camera_world_to_ndc(kg, sd, sd->ray_P + make_float3(0.0f, sd->ray_dP, 0.0f)) -
+ ndc[0];
}
}
else {
@@ -1079,7 +1079,7 @@ bool OSLRenderServices::get_attribute(OSL::ShaderGlobals *sg,
bool OSLRenderServices::get_attribute(
ShaderData *sd, bool derivatives, ustring object_name, TypeDesc type, ustring name, void *val)
{
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
int prim_type = 0;
int object;
@@ -1208,17 +1208,17 @@ bool OSLRenderServices::texture(ustring filename,
OSLTextureHandle *handle = (OSLTextureHandle *)texture_handle;
OSLTextureHandle::Type texture_type = (handle) ? handle->type : OSLTextureHandle::OIIO;
ShaderData *sd = (ShaderData *)(sg->renderstate);
- KernelGlobals *kernel_globals = sd->osl_globals;
+ const KernelGlobals *kernel_globals = sd->osl_globals;
bool status = false;
switch (texture_type) {
case OSLTextureHandle::BEVEL: {
/* Bevel shader hack. */
if (nchannels >= 3) {
- PathState *state = sd->osl_path_state;
+ const IntegratorStateCPU *state = sd->osl_path_state;
int num_samples = (int)s;
float radius = t;
- float3 N = svm_bevel(kernel_globals, sd, state, radius, num_samples);
+ float3 N = svm_bevel(kernel_globals, state, sd, radius, num_samples);
result[0] = N.x;
result[1] = N.y;
result[2] = N.z;
@@ -1228,7 +1228,7 @@ bool OSLRenderServices::texture(ustring filename,
}
case OSLTextureHandle::AO: {
/* AO shader hack. */
- PathState *state = sd->osl_path_state;
+ const IntegratorStateCPU *state = sd->osl_path_state;
int num_samples = (int)s;
float radius = t;
float3 N = make_float3(dsdx, dtdx, dsdy);
@@ -1242,7 +1242,7 @@ bool OSLRenderServices::texture(ustring filename,
if ((int)options.tblur) {
flags |= NODE_AO_GLOBAL_RADIUS;
}
- result[0] = svm_ao(kernel_globals, sd, N, state, radius, num_samples, flags);
+ result[0] = svm_ao(kernel_globals, state, sd, N, radius, num_samples, flags);
status = true;
break;
}
@@ -1355,7 +1355,7 @@ bool OSLRenderServices::texture3d(ustring filename,
case OSLTextureHandle::SVM: {
/* Packed texture. */
ShaderData *sd = (ShaderData *)(sg->renderstate);
- KernelGlobals *kernel_globals = sd->osl_globals;
+ const KernelGlobals *kernel_globals = sd->osl_globals;
int slot = handle->svm_slot;
float3 P_float3 = make_float3(P.x, P.y, P.z);
float4 rgba = kernel_tex_image_interp_3d(kernel_globals, slot, P_float3, INTERPOLATION_NONE);
@@ -1377,7 +1377,7 @@ bool OSLRenderServices::texture3d(ustring filename,
if (handle && handle->oiio_handle) {
if (texture_thread_info == NULL) {
ShaderData *sd = (ShaderData *)(sg->renderstate);
- KernelGlobals *kernel_globals = sd->osl_globals;
+ const KernelGlobals *kernel_globals = sd->osl_globals;
OSLThreadData *tdata = kernel_globals->osl_tdata;
texture_thread_info = tdata->oiio_thread_info;
}
@@ -1462,7 +1462,7 @@ bool OSLRenderServices::environment(ustring filename,
if (handle && handle->oiio_handle) {
if (thread_info == NULL) {
ShaderData *sd = (ShaderData *)(sg->renderstate);
- KernelGlobals *kernel_globals = sd->osl_globals;
+ const KernelGlobals *kernel_globals = sd->osl_globals;
OSLThreadData *tdata = kernel_globals->osl_tdata;
thread_info = tdata->oiio_thread_info;
}
@@ -1600,10 +1600,14 @@ bool OSLRenderServices::trace(TraceOpt &options,
}
/* ray differentials */
- ray.dP.dx = TO_FLOAT3(dPdx);
- ray.dP.dy = TO_FLOAT3(dPdy);
- ray.dD.dx = TO_FLOAT3(dRdx);
- ray.dD.dy = TO_FLOAT3(dRdy);
+ differential3 dP;
+ dP.dx = TO_FLOAT3(dPdx);
+ dP.dy = TO_FLOAT3(dPdy);
+ ray.dP = differential_make_compact(dP);
+ differential3 dD;
+ dD.dx = TO_FLOAT3(dRdx);
+ dD.dy = TO_FLOAT3(dRdy);
+ ray.dD = differential_make_compact(dD);
/* allocate trace data */
OSLTraceData *tracedata = (OSLTraceData *)sg->tracedata;
@@ -1613,7 +1617,7 @@ bool OSLRenderServices::trace(TraceOpt &options,
tracedata->hit = false;
tracedata->sd.osl_globals = sd->osl_globals;
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
/* Can't raytrace from shaders like displacement, before BVH exists. */
if (kernel_data.bvh.bvh_layout == BVH_LAYOUT_NONE) {
@@ -1646,11 +1650,11 @@ bool OSLRenderServices::getmessage(OSL::ShaderGlobals *sg,
}
else {
ShaderData *sd = &tracedata->sd;
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
if (!tracedata->setup) {
/* lazy shader data setup */
- shader_setup_from_ray(kg, sd, &tracedata->isect, &tracedata->ray);
+ shader_setup_from_ray(kg, sd, &tracedata->ray, &tracedata->isect);
tracedata->setup = true;
}
diff --git a/intern/cycles/kernel/osl/osl_services.h b/intern/cycles/kernel/osl/osl_services.h
index 891b9172dd4..58accb46e7d 100644
--- a/intern/cycles/kernel/osl/osl_services.h
+++ b/intern/cycles/kernel/osl/osl_services.h
@@ -250,10 +250,18 @@ class OSLRenderServices : public OSL::RendererServices {
void *data) override;
#endif
- static bool get_background_attribute(
- KernelGlobals *kg, ShaderData *sd, ustring name, TypeDesc type, bool derivatives, void *val);
- static bool get_object_standard_attribute(
- KernelGlobals *kg, ShaderData *sd, ustring name, TypeDesc type, bool derivatives, void *val);
+ static bool get_background_attribute(const KernelGlobals *kg,
+ ShaderData *sd,
+ ustring name,
+ TypeDesc type,
+ bool derivatives,
+ void *val);
+ static bool get_object_standard_attribute(const KernelGlobals *kg,
+ ShaderData *sd,
+ ustring name,
+ TypeDesc type,
+ bool derivatives,
+ void *val);
static ustring u_distance;
static ustring u_index;
diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp
index 389c854c495..880ef635c76 100644
--- a/intern/cycles/kernel/osl/osl_shader.cpp
+++ b/intern/cycles/kernel/osl/osl_shader.cpp
@@ -17,14 +17,16 @@
#include <OSL/oslexec.h>
// clang-format off
-#include "kernel/kernel_compat_cpu.h"
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/globals.h"
+
#include "kernel/kernel_montecarlo.h"
#include "kernel/kernel_types.h"
-#include "kernel/split/kernel_split_data_types.h"
-#include "kernel/kernel_globals.h"
#include "kernel/geom/geom_object.h"
+#include "kernel/integrator/integrator_state.h"
+
#include "kernel/osl/osl_closures.h"
#include "kernel/osl/osl_globals.h"
#include "kernel/osl/osl_services.h"
@@ -39,9 +41,7 @@ CCL_NAMESPACE_BEGIN
/* Threads */
-void OSLShader::thread_init(KernelGlobals *kg,
- KernelGlobals *kernel_globals,
- OSLGlobals *osl_globals)
+void OSLShader::thread_init(KernelGlobals *kg, OSLGlobals *osl_globals)
{
/* no osl used? */
if (!osl_globals->use) {
@@ -87,8 +87,11 @@ void OSLShader::thread_free(KernelGlobals *kg)
/* Globals */
-static void shaderdata_to_shaderglobals(
- KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, OSLThreadData *tdata)
+static void shaderdata_to_shaderglobals(const KernelGlobals *kg,
+ ShaderData *sd,
+ const IntegratorStateCPU *state,
+ int path_flag,
+ OSLThreadData *tdata)
{
OSL::ShaderGlobals *globals = &tdata->globals;
@@ -171,7 +174,10 @@ static void flatten_surface_closure_tree(ShaderData *sd,
}
}
-void OSLShader::eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag)
+void OSLShader::eval_surface(const KernelGlobals *kg,
+ const IntegratorStateCPU *state,
+ ShaderData *sd,
+ int path_flag)
{
/* setup shader globals from shader data */
OSLThreadData *tdata = kg->osl_tdata;
@@ -276,7 +282,10 @@ static void flatten_background_closure_tree(ShaderData *sd,
}
}
-void OSLShader::eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag)
+void OSLShader::eval_background(const KernelGlobals *kg,
+ const IntegratorStateCPU *state,
+ ShaderData *sd,
+ int path_flag)
{
/* setup shader globals from shader data */
OSLThreadData *tdata = kg->osl_tdata;
@@ -331,7 +340,10 @@ static void flatten_volume_closure_tree(ShaderData *sd,
}
}
-void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag)
+void OSLShader::eval_volume(const KernelGlobals *kg,
+ const IntegratorStateCPU *state,
+ ShaderData *sd,
+ int path_flag)
{
/* setup shader globals from shader data */
OSLThreadData *tdata = kg->osl_tdata;
@@ -354,7 +366,9 @@ void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state,
/* Displacement */
-void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd, PathState *state)
+void OSLShader::eval_displacement(const KernelGlobals *kg,
+ const IntegratorStateCPU *state,
+ ShaderData *sd)
{
/* setup shader globals from shader data */
OSLThreadData *tdata = kg->osl_tdata;
@@ -377,7 +391,7 @@ void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd, PathState *
/* Attributes */
-int OSLShader::find_attribute(KernelGlobals *kg,
+int OSLShader::find_attribute(const KernelGlobals *kg,
const ShaderData *sd,
uint id,
AttributeDescriptor *desc)
diff --git a/intern/cycles/kernel/osl/osl_shader.h b/intern/cycles/kernel/osl/osl_shader.h
index a4fa24d0a90..f1f17b141eb 100644
--- a/intern/cycles/kernel/osl/osl_shader.h
+++ b/intern/cycles/kernel/osl/osl_shader.h
@@ -37,6 +37,7 @@ class Scene;
struct ShaderClosure;
struct ShaderData;
+struct IntegratorStateCPU;
struct differential3;
struct KernelGlobals;
@@ -49,19 +50,28 @@ class OSLShader {
static void register_closures(OSLShadingSystem *ss);
/* per thread data */
- static void thread_init(KernelGlobals *kg,
- KernelGlobals *kernel_globals,
- OSLGlobals *osl_globals);
+ static void thread_init(KernelGlobals *kg, OSLGlobals *osl_globals);
static void thread_free(KernelGlobals *kg);
/* eval */
- static void eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag);
- static void eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag);
- static void eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag);
- static void eval_displacement(KernelGlobals *kg, ShaderData *sd, PathState *state);
+ static void eval_surface(const KernelGlobals *kg,
+ const IntegratorStateCPU *state,
+ ShaderData *sd,
+ int path_flag);
+ static void eval_background(const KernelGlobals *kg,
+ const IntegratorStateCPU *state,
+ ShaderData *sd,
+ int path_flag);
+ static void eval_volume(const KernelGlobals *kg,
+ const IntegratorStateCPU *state,
+ ShaderData *sd,
+ int path_flag);
+ static void eval_displacement(const KernelGlobals *kg,
+ const IntegratorStateCPU *state,
+ ShaderData *sd);
/* attributes */
- static int find_attribute(KernelGlobals *kg,
+ static int find_attribute(const KernelGlobals *kg,
const ShaderData *sd,
uint id,
AttributeDescriptor *desc);
diff --git a/intern/cycles/kernel/shaders/node_principled_bsdf.osl b/intern/cycles/kernel/shaders/node_principled_bsdf.osl
index 23949f406c7..55afb892d36 100644
--- a/intern/cycles/kernel/shaders/node_principled_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_principled_bsdf.osl
@@ -18,11 +18,13 @@
#include "stdcycles.h"
shader node_principled_bsdf(string distribution = "Multiscatter GGX",
- string subsurface_method = "burley",
+ string subsurface_method = "random_walk",
color BaseColor = color(0.8, 0.8, 0.8),
float Subsurface = 0.0,
vector SubsurfaceRadius = vector(1.0, 1.0, 1.0),
color SubsurfaceColor = color(0.7, 0.1, 0.1),
+ float SubsurfaceIOR = 1.4,
+ float SubsurfaceAnisotropy = 0.0,
float Metallic = 0.0,
float Specular = 0.5,
float SpecularTint = 0.0,
@@ -59,22 +61,17 @@ shader node_principled_bsdf(string distribution = "Multiscatter GGX",
if (diffuse_weight > 1e-5) {
if (Subsurface > 1e-5) {
color mixed_ss_base_color = SubsurfaceColor * Subsurface + BaseColor * (1.0 - Subsurface);
- if (subsurface_method == "burley") {
- BSDF = mixed_ss_base_color * bssrdf("principled",
- Normal,
- Subsurface * SubsurfaceRadius,
- SubsurfaceColor,
- "roughness",
- Roughness);
- }
- else {
- BSDF = mixed_ss_base_color * bssrdf("principled_random_walk",
- Normal,
- Subsurface * SubsurfaceRadius,
- mixed_ss_base_color,
- "roughness",
- Roughness);
- }
+
+ BSDF = mixed_ss_base_color * bssrdf(subsurface_method,
+ Normal,
+ Subsurface * SubsurfaceRadius,
+ mixed_ss_base_color,
+ "roughness",
+ Roughness,
+ "ior",
+ SubsurfaceIOR,
+ "anisotropy",
+ SubsurfaceAnisotropy);
}
else {
BSDF = BaseColor * principled_diffuse(Normal, Roughness);
diff --git a/intern/cycles/kernel/shaders/node_subsurface_scattering.osl b/intern/cycles/kernel/shaders/node_subsurface_scattering.osl
index b1e854150ab..f55e38c54ff 100644
--- a/intern/cycles/kernel/shaders/node_subsurface_scattering.osl
+++ b/intern/cycles/kernel/shaders/node_subsurface_scattering.osl
@@ -19,27 +19,12 @@
shader node_subsurface_scattering(color Color = 0.8,
float Scale = 1.0,
vector Radius = vector(0.1, 0.1, 0.1),
- float TextureBlur = 0.0,
- float Sharpness = 0.0,
- string falloff = "cubic",
+ float IOR = 1.4,
+ float Anisotropy = 0.0,
+ string method = "random_walk",
normal Normal = N,
output closure color BSSRDF = 0)
{
- if (falloff == "gaussian")
- BSSRDF = Color *
- bssrdf("gaussian", Normal, Scale * Radius, Color, "texture_blur", TextureBlur);
- else if (falloff == "cubic")
- BSSRDF = Color * bssrdf("cubic",
- Normal,
- Scale * Radius,
- Color,
- "texture_blur",
- TextureBlur,
- "sharpness",
- Sharpness);
- else if (falloff == "burley")
- BSSRDF = Color * bssrdf("burley", Normal, Scale * Radius, Color, "texture_blur", TextureBlur);
- else
- BSSRDF = Color *
- bssrdf("random_walk", Normal, Scale * Radius, Color, "texture_blur", TextureBlur);
+ BSSRDF = Color *
+ bssrdf(method, Normal, Scale * Radius, Color, "ior", IOR, "anisotropy", Anisotropy);
}
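
Since both remaining methods share the same keyword set, an OSL script that still passes one of the removed falloff names ("cubic", "gaussian", "burley") would, per the closure setup earlier in this diff, allocate no BSSRDF at all. Below is a compatibility sketch, assuming a script wants legacy falloff names to keep producing subsurface scattering; the fallback chosen here is illustrative only and not necessarily the mapping Blender's own versioning code applies.

#include "stdcycles.h"

shader sss_method_guard(string method = "burley",
                        color Color = color(0.8, 0.8, 0.8),
                        vector Radius = vector(0.1, 0.1, 0.1),
                        normal Normal = N,
                        output closure color BSSRDF = 0)
{
  /* Remap removed falloff names onto one of the remaining methods (illustrative choice). */
  string m = method;
  if (m == "cubic" || m == "gaussian" || m == "burley")
    m = "random_walk_fixed_radius";

  BSSRDF = Color * bssrdf(m, Normal, Radius, Color, "ior", 1.4, "anisotropy", 0.0);
}
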
diff --git a/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h b/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h
deleted file mode 100644
index 437a5c9581b..00000000000
--- a/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright 2019 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_adaptive_adjust_samples(KernelGlobals *kg)
-{
- int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (pixel_index < kernel_split_params.tile.w * kernel_split_params.tile.h) {
- int x = kernel_split_params.tile.x + pixel_index % kernel_split_params.tile.w;
- int y = kernel_split_params.tile.y + pixel_index / kernel_split_params.tile.w;
- int buffer_offset = (kernel_split_params.tile.offset + x +
- y * kernel_split_params.tile.stride) *
- kernel_data.film.pass_stride;
- ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
- int sample = kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples;
- if (buffer[kernel_data.film.pass_sample_count] < 0.0f) {
- buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
- float sample_multiplier = sample / buffer[kernel_data.film.pass_sample_count];
- if (sample_multiplier != 1.0f) {
- kernel_adaptive_post_adjust(kg, buffer, sample_multiplier);
- }
- }
- else {
- kernel_adaptive_post_adjust(kg, buffer, sample / (sample - 1.0f));
- }
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_adaptive_filter_x.h b/intern/cycles/kernel/split/kernel_adaptive_filter_x.h
deleted file mode 100644
index 93f41f7ced4..00000000000
--- a/intern/cycles/kernel/split/kernel_adaptive_filter_x.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Copyright 2019 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_adaptive_filter_x(KernelGlobals *kg)
-{
- int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (pixel_index < kernel_split_params.tile.h &&
- kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples >=
- kernel_data.integrator.adaptive_min_samples) {
- int y = kernel_split_params.tile.y + pixel_index;
- kernel_do_adaptive_filter_x(kg, y, &kernel_split_params.tile);
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_adaptive_filter_y.h b/intern/cycles/kernel/split/kernel_adaptive_filter_y.h
deleted file mode 100644
index eca53d079ec..00000000000
--- a/intern/cycles/kernel/split/kernel_adaptive_filter_y.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Copyright 2019 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_adaptive_filter_y(KernelGlobals *kg)
-{
- int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (pixel_index < kernel_split_params.tile.w &&
- kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples >=
- kernel_data.integrator.adaptive_min_samples) {
- int x = kernel_split_params.tile.x + pixel_index;
- kernel_do_adaptive_filter_y(kg, x, &kernel_split_params.tile);
- }
-}
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_adaptive_stopping.h b/intern/cycles/kernel/split/kernel_adaptive_stopping.h
deleted file mode 100644
index c8eb1ebd705..00000000000
--- a/intern/cycles/kernel/split/kernel_adaptive_stopping.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright 2019 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_adaptive_stopping(KernelGlobals *kg)
-{
- int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (pixel_index < kernel_split_params.tile.w * kernel_split_params.tile.h &&
- kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples >=
- kernel_data.integrator.adaptive_min_samples) {
- int x = kernel_split_params.tile.x + pixel_index % kernel_split_params.tile.w;
- int y = kernel_split_params.tile.y + pixel_index / kernel_split_params.tile.w;
- int buffer_offset = (kernel_split_params.tile.offset + x +
- y * kernel_split_params.tile.stride) *
- kernel_data.film.pass_stride;
- ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
- kernel_do_adaptive_stopping(kg,
- buffer,
- kernel_split_params.tile.start_sample +
- kernel_split_params.tile.num_samples - 1);
- }
-}
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_branched.h b/intern/cycles/kernel/split/kernel_branched.h
deleted file mode 100644
index 45f5037d321..00000000000
--- a/intern/cycles/kernel/split/kernel_branched.h
+++ /dev/null
@@ -1,231 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#ifdef __BRANCHED_PATH__
-
-/* sets up the various state needed to do an indirect loop */
-ccl_device_inline void kernel_split_branched_path_indirect_loop_init(KernelGlobals *kg,
- int ray_index)
-{
- SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
-
- /* save a copy of the state to restore later */
-# define BRANCHED_STORE(name) branched_state->name = kernel_split_state.name[ray_index];
-
- BRANCHED_STORE(path_state);
- BRANCHED_STORE(throughput);
- BRANCHED_STORE(ray);
- BRANCHED_STORE(isect);
- BRANCHED_STORE(ray_state);
-
- *kernel_split_sd(branched_state_sd, ray_index) = *kernel_split_sd(sd, ray_index);
- for (int i = 0; i < kernel_split_sd(branched_state_sd, ray_index)->num_closure; i++) {
- kernel_split_sd(branched_state_sd, ray_index)->closure[i] =
- kernel_split_sd(sd, ray_index)->closure[i];
- }
-
-# undef BRANCHED_STORE
-
- /* Set loop counters to initial position. */
- branched_state->next_closure = 0;
- branched_state->next_sample = 0;
-}
-
-/* ends an indirect loop and restores the previous state */
-ccl_device_inline void kernel_split_branched_path_indirect_loop_end(KernelGlobals *kg,
- int ray_index)
-{
- SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
-
- /* restore state */
-# define BRANCHED_RESTORE(name) kernel_split_state.name[ray_index] = branched_state->name;
-
- BRANCHED_RESTORE(path_state);
- BRANCHED_RESTORE(throughput);
- BRANCHED_RESTORE(ray);
- BRANCHED_RESTORE(isect);
- BRANCHED_RESTORE(ray_state);
-
- *kernel_split_sd(sd, ray_index) = *kernel_split_sd(branched_state_sd, ray_index);
- for (int i = 0; i < kernel_split_sd(branched_state_sd, ray_index)->num_closure; i++) {
- kernel_split_sd(sd, ray_index)->closure[i] =
- kernel_split_sd(branched_state_sd, ray_index)->closure[i];
- }
-
-# undef BRANCHED_RESTORE
-
- /* leave indirect loop */
- REMOVE_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT);
-}
-
-ccl_device_inline bool kernel_split_branched_indirect_start_shared(KernelGlobals *kg,
- int ray_index)
-{
- ccl_global char *ray_state = kernel_split_state.ray_state;
-
- int inactive_ray = dequeue_ray_index(QUEUE_INACTIVE_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- kernel_split_params.queue_index);
-
- if (!IS_STATE(ray_state, inactive_ray, RAY_INACTIVE)) {
- return false;
- }
-
-# define SPLIT_DATA_ENTRY(type, name, num) \
- if (num) { \
- kernel_split_state.name[inactive_ray] = kernel_split_state.name[ray_index]; \
- }
- SPLIT_DATA_ENTRIES_BRANCHED_SHARED
-# undef SPLIT_DATA_ENTRY
-
- *kernel_split_sd(sd, inactive_ray) = *kernel_split_sd(sd, ray_index);
- for (int i = 0; i < kernel_split_sd(sd, ray_index)->num_closure; i++) {
- kernel_split_sd(sd, inactive_ray)->closure[i] = kernel_split_sd(sd, ray_index)->closure[i];
- }
-
- kernel_split_state.branched_state[inactive_ray].shared_sample_count = 0;
- kernel_split_state.branched_state[inactive_ray].original_ray = ray_index;
- kernel_split_state.branched_state[inactive_ray].waiting_on_shared_samples = false;
-
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- PathRadiance *inactive_L = &kernel_split_state.path_radiance[inactive_ray];
-
- path_radiance_init(kg, inactive_L);
- path_radiance_copy_indirect(inactive_L, L);
-
- ray_state[inactive_ray] = RAY_REGENERATED;
- ADD_RAY_FLAG(ray_state, inactive_ray, RAY_BRANCHED_INDIRECT_SHARED);
- ADD_RAY_FLAG(ray_state, inactive_ray, IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT));
-
- atomic_fetch_and_inc_uint32(
- (ccl_global uint *)&kernel_split_state.branched_state[ray_index].shared_sample_count);
-
- return true;
-}
-
-/* bounce off surface and integrate indirect light */
-ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(
- KernelGlobals *kg,
- int ray_index,
- float num_samples_adjust,
- ShaderData *saved_sd,
- bool reset_path_state,
- bool wait_for_shared)
-{
- SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
-
- ShaderData *sd = saved_sd;
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- float3 throughput = branched_state->throughput;
- ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
-
- float sum_sample_weight = 0.0f;
-# ifdef __DENOISING_FEATURES__
- if (ps->denoising_feature_weight > 0.0f) {
- for (int i = 0; i < sd->num_closure; i++) {
- const ShaderClosure *sc = &sd->closure[i];
-
- /* transparency is not handled here, but in outer loop */
- if (!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
- continue;
- }
-
- sum_sample_weight += sc->sample_weight;
- }
- }
- else {
- sum_sample_weight = 1.0f;
- }
-# endif /* __DENOISING_FEATURES__ */
-
- for (int i = branched_state->next_closure; i < sd->num_closure; i++) {
- const ShaderClosure *sc = &sd->closure[i];
-
- if (!CLOSURE_IS_BSDF(sc->type))
- continue;
- /* transparency is not handled here, but in outer loop */
- if (sc->type == CLOSURE_BSDF_TRANSPARENT_ID)
- continue;
-
- int num_samples;
-
- if (CLOSURE_IS_BSDF_DIFFUSE(sc->type))
- num_samples = kernel_data.integrator.diffuse_samples;
- else if (CLOSURE_IS_BSDF_BSSRDF(sc->type))
- num_samples = 1;
- else if (CLOSURE_IS_BSDF_GLOSSY(sc->type))
- num_samples = kernel_data.integrator.glossy_samples;
- else
- num_samples = kernel_data.integrator.transmission_samples;
-
- num_samples = ceil_to_int(num_samples_adjust * num_samples);
-
- float num_samples_inv = num_samples_adjust / num_samples;
-
- for (int j = branched_state->next_sample; j < num_samples; j++) {
- if (reset_path_state) {
- *ps = branched_state->path_state;
- }
-
- ps->rng_hash = cmj_hash(branched_state->path_state.rng_hash, i);
-
- ccl_global float3 *tp = &kernel_split_state.throughput[ray_index];
- *tp = throughput;
-
- ccl_global Ray *bsdf_ray = &kernel_split_state.ray[ray_index];
-
- if (!kernel_branched_path_surface_bounce(
- kg, sd, sc, j, num_samples, tp, ps, &L->state, bsdf_ray, sum_sample_weight)) {
- continue;
- }
-
- ps->rng_hash = branched_state->path_state.rng_hash;
-
- /* update state for next iteration */
- branched_state->next_closure = i;
- branched_state->next_sample = j + 1;
-
- /* start the indirect path */
- *tp *= num_samples_inv;
-
- if (kernel_split_branched_indirect_start_shared(kg, ray_index)) {
- continue;
- }
-
- return true;
- }
-
- branched_state->next_sample = 0;
- }
-
- branched_state->next_closure = sd->num_closure;
-
- if (wait_for_shared) {
- branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0);
- if (branched_state->waiting_on_shared_samples) {
- return true;
- }
- }
-
- return false;
-}
-
-#endif /* __BRANCHED_PATH__ */
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h
deleted file mode 100644
index b96feca582f..00000000000
--- a/intern/cycles/kernel/split/kernel_buffer_update.h
+++ /dev/null
@@ -1,154 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel takes care of rays that hit the background (sceneintersect
- * kernel), and for the rays of state RAY_UPDATE_BUFFER it updates the ray's
- * accumulated radiance in the output buffer. This kernel also takes care of
- * rays that have been determined to-be-regenerated.
- *
- * We will empty QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue in this kernel.
- *
- * Typically all rays that are in state RAY_HIT_BACKGROUND, RAY_UPDATE_BUFFER
- * will be eventually set to RAY_TO_REGENERATE state in this kernel.
- * Finally all rays of ray_state RAY_TO_REGENERATE will be regenerated and put
- * in queue QUEUE_ACTIVE_AND_REGENERATED_RAYS.
- *
- * State of queues when this kernel is called:
- * At entry,
- * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays.
- * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
- * RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND, RAY_TO_REGENERATE rays.
- * At exit,
- * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and
- * RAY_REGENERATED rays.
- * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
- */
-ccl_device void kernel_buffer_update(KernelGlobals *kg,
- ccl_local_param unsigned int *local_queue_atomics)
-{
- if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- *local_queue_atomics = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (ray_index == 0) {
- /* We will empty this queue in this kernel. */
- kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
- }
- char enqueue_flag = 0;
- ray_index = get_ray_index(kg,
- ray_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
-
- if (ray_index != QUEUE_EMPTY_SLOT) {
- ccl_global char *ray_state = kernel_split_state.ray_state;
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
- bool ray_was_updated = false;
-
- if (IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
- ray_was_updated = true;
- uint sample = state->sample;
- uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
- ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
-
- /* accumulate result in output buffer */
- kernel_write_result(kg, buffer, sample, L);
-
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
- }
-
- if (kernel_data.film.cryptomatte_passes) {
- /* Make sure no thread is writing to the buffers. */
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
- if (ray_was_updated && state->sample - 1 == kernel_data.integrator.aa_samples) {
- uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
- ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
- ccl_global float *cryptomatte_buffer = buffer + kernel_data.film.pass_cryptomatte;
- kernel_sort_id_slots(cryptomatte_buffer, 2 * kernel_data.film.cryptomatte_depth);
- }
- }
-
- if (IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
- /* We have completed the current work, so get the next work. */
- ccl_global uint *work_pools = kernel_split_params.work_pools;
- uint total_work_size = kernel_split_params.total_work_size;
- uint work_index;
-
- if (!get_next_work(kg, work_pools, total_work_size, ray_index, &work_index)) {
- /* If work is invalid, this means no more work is available and the thread may exit */
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
- }
-
- if (IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
- ccl_global WorkTile *tile = &kernel_split_params.tile;
- uint x, y, sample;
- get_work_pixel(tile, work_index, &x, &y, &sample);
-
- /* Store buffer offset for writing to passes. */
- uint buffer_offset = (tile->offset + x + y * tile->stride) * kernel_data.film.pass_stride;
- kernel_split_state.buffer_offset[ray_index] = buffer_offset;
-
- /* Initialize random numbers and ray. */
- uint rng_hash;
- kernel_path_trace_setup(kg, sample, x, y, &rng_hash, ray);
-
- if (ray->t != 0.0f) {
- /* Initialize throughput, path radiance, Ray, PathState;
- * These rays proceed with path-iteration.
- */
- *throughput = make_float3(1.0f, 1.0f, 1.0f);
- path_radiance_init(kg, L);
- path_state_init(kg,
- AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]),
- state,
- rng_hash,
- sample,
- ray);
-#ifdef __SUBSURFACE__
- kernel_path_subsurface_init_indirect(&kernel_split_state.ss_rays[ray_index]);
-#endif
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- enqueue_flag = 1;
- }
- else {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
- }
- }
- }
- }
-
- /* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS;
- * These rays will be made active during the next scene-intersect kernel.
- */
- enqueue_ray_index_local(ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- enqueue_flag,
- kernel_split_params.queue_size,
- local_queue_atomics,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h
deleted file mode 100644
index 2f83a10316d..00000000000
--- a/intern/cycles/kernel/split/kernel_data_init.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel initializes structures needed in path-iteration kernels.
- *
- * Note on Queues:
- * All slots in queues are initialized to queue empty slot;
- * The number of elements in the queues is initialized to 0;
- */
-
-#ifndef __KERNEL_CPU__
-ccl_device void kernel_data_init(
-#else
-void KERNEL_FUNCTION_FULL_NAME(data_init)(
-#endif
- KernelGlobals *kg,
- ccl_constant KernelData *data,
- ccl_global void *split_data_buffer,
- int num_elements,
- ccl_global char *ray_state,
-
-#ifdef __KERNEL_OPENCL__
- KERNEL_BUFFER_PARAMS,
-#endif
-
- int start_sample,
- int end_sample,
- int sx,
- int sy,
- int sw,
- int sh,
- int offset,
- int stride,
- ccl_global int *Queue_index, /* Tracks the number of elements in queues */
- int queuesize, /* size (capacity) of the queue */
- ccl_global char *use_queues_flag, /* flag to decide if scene-intersect kernel should use queues
- to fetch ray index */
- ccl_global unsigned int *work_pools, /* Work pool for each work group */
- unsigned int num_samples,
- ccl_global float *buffer)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, data_init);
-#else
-
-# ifdef __KERNEL_OPENCL__
- kg->data = data;
-# endif
-
- kernel_split_params.tile.x = sx;
- kernel_split_params.tile.y = sy;
- kernel_split_params.tile.w = sw;
- kernel_split_params.tile.h = sh;
-
- kernel_split_params.tile.start_sample = start_sample;
- kernel_split_params.tile.num_samples = num_samples;
-
- kernel_split_params.tile.offset = offset;
- kernel_split_params.tile.stride = stride;
-
- kernel_split_params.tile.buffer = buffer;
-
- kernel_split_params.total_work_size = sw * sh * num_samples;
-
- kernel_split_params.work_pools = work_pools;
-
- kernel_split_params.queue_index = Queue_index;
- kernel_split_params.queue_size = queuesize;
- kernel_split_params.use_queues_flag = use_queues_flag;
-
- split_data_init(kg, &kernel_split_state, num_elements, split_data_buffer, ray_state);
-
-# ifdef __KERNEL_OPENCL__
- kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
- kernel_set_buffer_info(kg);
-# endif
-
- int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
-
- /* Initialize queue data and queue index. */
- if (thread_index < queuesize) {
- for (int i = 0; i < NUM_QUEUES; i++) {
- kernel_split_state.queue_data[i * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
- }
- }
-
- if (thread_index == 0) {
- for (int i = 0; i < NUM_QUEUES; i++) {
- Queue_index[i] = 0;
- }
-
- /* The scene-intersect kernel should not use the queues the very first time,
- * since the queue would be empty.
- */
- *use_queues_flag = 0;
- }
-#endif /* KERNEL_STUB */
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h
deleted file mode 100644
index 3be2b35812f..00000000000
--- a/intern/cycles/kernel/split/kernel_direct_lighting.h
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel takes care of direct lighting logic.
- * However, the "shadow ray cast" part of direct lighting is handled
- * in the next kernel.
- *
- * This kernel determines the rays for which a shadow_blocked() function
- * associated with direct lighting should be executed. Those rays for which
- * a shadow_blocked() function for direct-lighting must be executed, are
- * marked with flag RAY_SHADOW_RAY_CAST_DL and enqueued into the queue
- * QUEUE_SHADOW_RAY_CAST_DL_RAYS
- *
- * Note on Queues:
- * This kernel reads only from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue
- * and processes only rays of state RAY_ACTIVE. If a ray needs to execute
- * the corresponding shadow_blocked part after direct lighting, the ray is
- * marked with the RAY_SHADOW_RAY_CAST_DL flag.
- *
- * State of queues when this kernel is called:
- * - The state of the queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be the same before and after
- * this kernel call.
- * - QUEUE_SHADOW_RAY_CAST_DL_RAYS is empty before this kernel call and,
- * after it, is filled with the rays for which a shadow_blocked() function
- * must be executed.
- */
-ccl_device void kernel_direct_lighting(KernelGlobals *kg,
- ccl_local_param unsigned int *local_queue_atomics)
-{
- if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- *local_queue_atomics = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- char enqueue_flag = 0;
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- ray_index = get_ray_index(kg,
- ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 0);
-
- if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- ShaderData *sd = kernel_split_sd(sd, ray_index);
-
- /* direct lighting */
-#ifdef __EMISSION__
- bool flag = (kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL));
-
-# ifdef __BRANCHED_PATH__
- if (flag && kernel_data.integrator.branched) {
- flag = false;
- enqueue_flag = 1;
- }
-# endif /* __BRANCHED_PATH__ */
-
-# ifdef __SHADOW_TRICKS__
- if (flag && state->flag & PATH_RAY_SHADOW_CATCHER) {
- flag = false;
- enqueue_flag = 1;
- }
-# endif /* __SHADOW_TRICKS__ */
-
- if (flag) {
- /* Sample illumination from lights to find path contribution. */
- float light_u, light_v;
- path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v);
- float terminate = path_state_rng_light_termination(kg, state);
-
- LightSample ls;
- if (light_sample(kg, -1, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
- Ray light_ray;
- light_ray.time = sd->time;
-
- BsdfEval L_light;
- bool is_lamp;
- if (direct_emission(kg,
- sd,
- AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]),
- &ls,
- state,
- &light_ray,
- &L_light,
- &is_lamp,
- terminate)) {
- /* Write intermediate data to global memory to access from
- * the next kernel.
- */
- kernel_split_state.light_ray[ray_index] = light_ray;
- kernel_split_state.bsdf_eval[ray_index] = L_light;
- kernel_split_state.is_lamp[ray_index] = is_lamp;
- /* Mark ray state for next shadow kernel. */
- enqueue_flag = 1;
- }
- }
- }
-#endif /* __EMISSION__ */
- }
-
-#ifdef __EMISSION__
- /* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */
- enqueue_ray_index_local(ray_index,
- QUEUE_SHADOW_RAY_CAST_DL_RAYS,
- enqueue_flag,
- kernel_split_params.queue_size,
- local_queue_atomics,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-#endif
-
-#ifdef __BRANCHED_PATH__
- /* Enqueue RAY_LIGHT_INDIRECT_NEXT_ITER rays
- * this is the last kernel before next_iteration_setup that uses local atomics so we do this here
- */
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
- if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- *local_queue_atomics = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- enqueue_ray_index_local(
- ray_index,
- QUEUE_LIGHT_INDIRECT_ITER,
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER),
- kernel_split_params.queue_size,
- local_queue_atomics,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-
-#endif /* __BRANCHED_PATH__ */
-}
-
-CCL_NAMESPACE_END
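The kernels above repeatedly fetch their work through get_ray_index(), which treats the flat queue_data array as one fixed-size band per queue. The following standalone C++ sketch illustrates only that access pattern; it is not the kernel code, and fetch_ray_index is a simplified, hypothetical stand-in (the sentinel mirrors the kernel's QUEUE_EMPTY_SLOT).

#include <cstdio>
#include <vector>

enum { QUEUE_EMPTY_SLOT = -1 }; /* sentinel, mirroring the kernel's constant */

/* Simplified stand-in for get_ray_index(): each queue occupies a fixed-size
 * band of the flat queue_data array, and a thread reads the slot that matches
 * its own index, optionally clearing it so the queue can be refilled later. */
static int fetch_ray_index(std::vector<int> &queue_data,
                           int queue_number, int queue_size,
                           int thread_index, bool clear_slot)
{
  const int slot = queue_number * queue_size + thread_index;
  const int ray_index = queue_data[slot];
  if (clear_slot) {
    queue_data[slot] = QUEUE_EMPTY_SLOT;
  }
  return ray_index;
}

int main()
{
  const int queue_size = 4;
  /* Two queues packed back to back; queue 0 holds rays 5 and 9. */
  std::vector<int> queue_data = {5, 9, QUEUE_EMPTY_SLOT, QUEUE_EMPTY_SLOT,
                                 QUEUE_EMPTY_SLOT, QUEUE_EMPTY_SLOT,
                                 QUEUE_EMPTY_SLOT, QUEUE_EMPTY_SLOT};
  printf("thread 1 fetched ray %d\n",
         fetch_ray_index(queue_data, 0, queue_size, 1, true));
  return 0;
}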
diff --git a/intern/cycles/kernel/split/kernel_do_volume.h b/intern/cycles/kernel/split/kernel_do_volume.h
deleted file mode 100644
index 1775e870f07..00000000000
--- a/intern/cycles/kernel/split/kernel_do_volume.h
+++ /dev/null
@@ -1,227 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#if defined(__BRANCHED_PATH__) && defined(__VOLUME__)
-
-ccl_device_inline void kernel_split_branched_path_volume_indirect_light_init(KernelGlobals *kg,
- int ray_index)
-{
- kernel_split_branched_path_indirect_loop_init(kg, ray_index);
-
- ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT);
-}
-
-ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(KernelGlobals *kg,
- int ray_index)
-{
- SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
-
- ShaderData *sd = kernel_split_sd(sd, ray_index);
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
-
- /* GPU: no decoupled ray marching, scatter probabilistically. */
- int num_samples = kernel_data.integrator.volume_samples;
- float num_samples_inv = 1.0f / num_samples;
-
- Ray volume_ray = branched_state->ray;
- volume_ray.t = (!IS_STATE(&branched_state->ray_state, 0, RAY_HIT_BACKGROUND)) ?
- branched_state->isect.t :
- FLT_MAX;
-
- float step_size = volume_stack_step_size(kg, branched_state->path_state.volume_stack);
-
- for (int j = branched_state->next_sample; j < num_samples; j++) {
- ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
- *ps = branched_state->path_state;
-
- ccl_global Ray *pray = &kernel_split_state.ray[ray_index];
- *pray = branched_state->ray;
-
- ccl_global float3 *tp = &kernel_split_state.throughput[ray_index];
- *tp = branched_state->throughput * num_samples_inv;
-
- /* branch RNG state */
- path_state_branch(ps, j, num_samples);
-
- /* integrate along volume segment with distance sampling */
- VolumeIntegrateResult result = kernel_volume_integrate(
- kg, ps, sd, &volume_ray, L, tp, step_size);
-
-# ifdef __VOLUME_SCATTER__
- if (result == VOLUME_PATH_SCATTERED) {
- /* direct lighting */
- kernel_path_volume_connect_light(kg, sd, emission_sd, *tp, &branched_state->path_state, L);
-
- /* indirect light bounce */
- if (!kernel_path_volume_bounce(kg, sd, tp, ps, &L->state, pray)) {
- continue;
- }
-
- /* start the indirect path */
- branched_state->next_closure = 0;
- branched_state->next_sample = j + 1;
-
- /* Attempting to share too many samples is slow for volumes as it causes us to
- * loop here more and have many calls to kernel_volume_integrate which evaluates
- * shaders. The many expensive shader evaluations cause the work load to become
- * unbalanced and many threads to become idle in this kernel. Limiting the
- * number of shared samples here helps quite a lot.
- */
- if (branched_state->shared_sample_count < 2) {
- if (kernel_split_branched_indirect_start_shared(kg, ray_index)) {
- continue;
- }
- }
-
- return true;
- }
-# endif
- }
-
- branched_state->next_sample = num_samples;
-
- branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0);
- if (branched_state->waiting_on_shared_samples) {
- return true;
- }
-
- kernel_split_branched_path_indirect_loop_end(kg, ray_index);
-
- /* todo: avoid this calculation using decoupled ray marching */
- float3 throughput = kernel_split_state.throughput[ray_index];
- kernel_volume_shadow(
- kg, emission_sd, &kernel_split_state.path_state[ray_index], &volume_ray, &throughput);
- kernel_split_state.throughput[ray_index] = throughput;
-
- return false;
-}
-
-#endif /* __BRANCHED_PATH__ && __VOLUME__ */
-
-ccl_device void kernel_do_volume(KernelGlobals *kg)
-{
-#ifdef __VOLUME__
- /* We will empty this queue in this kernel. */
- if (ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
- kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
-# ifdef __BRANCHED_PATH__
- kernel_split_params.queue_index[QUEUE_VOLUME_INDIRECT_ITER] = 0;
-# endif /* __BRANCHED_PATH__ */
- }
-
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
-
- if (*kernel_split_params.use_queues_flag) {
- ray_index = get_ray_index(kg,
- ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
- }
-
- ccl_global char *ray_state = kernel_split_state.ray_state;
-
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
-
- if (IS_STATE(ray_state, ray_index, RAY_ACTIVE) ||
- IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
- ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- ccl_global Intersection *isect = &kernel_split_state.isect[ray_index];
- ShaderData *sd = kernel_split_sd(sd, ray_index);
- ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
-
- bool hit = !IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND);
-
- /* Sanitize volume stack. */
- if (!hit) {
- kernel_volume_clean_stack(kg, state->volume_stack);
- }
- /* volume attenuation, emission, scatter */
- if (state->volume_stack[0].shader != SHADER_NONE) {
- Ray volume_ray = *ray;
- volume_ray.t = (hit) ? isect->t : FLT_MAX;
-
-# ifdef __BRANCHED_PATH__
- if (!kernel_data.integrator.branched ||
- IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
-# endif /* __BRANCHED_PATH__ */
- float step_size = volume_stack_step_size(kg, state->volume_stack);
-
- {
- /* integrate along volume segment with distance sampling */
- VolumeIntegrateResult result = kernel_volume_integrate(
- kg, state, sd, &volume_ray, L, throughput, step_size);
-
-# ifdef __VOLUME_SCATTER__
- if (result == VOLUME_PATH_SCATTERED) {
- /* direct lighting */
- kernel_path_volume_connect_light(kg, sd, emission_sd, *throughput, state, L);
-
- /* indirect light bounce */
- if (kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- }
- else {
- kernel_split_path_end(kg, ray_index);
- }
- }
-# endif /* __VOLUME_SCATTER__ */
- }
-
-# ifdef __BRANCHED_PATH__
- }
- else {
- kernel_split_branched_path_volume_indirect_light_init(kg, ray_index);
-
- if (kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- }
- }
-# endif /* __BRANCHED_PATH__ */
- }
- }
-
-# ifdef __BRANCHED_PATH__
- /* iter loop */
- ray_index = get_ray_index(kg,
- ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
- QUEUE_VOLUME_INDIRECT_ITER,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
-
- if (IS_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER)) {
- /* for render passes, sum and reset indirect light pass variables
- * for the next samples */
- path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]);
- path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]);
-
- if (kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- }
- }
-# endif /* __BRANCHED_PATH__ */
-
-#endif /* __VOLUME__ */
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_enqueue_inactive.h b/intern/cycles/kernel/split/kernel_enqueue_inactive.h
deleted file mode 100644
index 745313f89f1..00000000000
--- a/intern/cycles/kernel/split/kernel_enqueue_inactive.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_enqueue_inactive(KernelGlobals *kg,
- ccl_local_param unsigned int *local_queue_atomics)
-{
-#ifdef __BRANCHED_PATH__
- /* Enqueue RAY_INACTIVE rays into QUEUE_INACTIVE_RAYS queue. */
- if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- *local_queue_atomics = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
-
- char enqueue_flag = 0;
- if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_INACTIVE)) {
- enqueue_flag = 1;
- }
-
- enqueue_ray_index_local(ray_index,
- QUEUE_INACTIVE_RAYS,
- enqueue_flag,
- kernel_split_params.queue_size,
- local_queue_atomics,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-#endif /* __BRANCHED_PATH__ */
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
deleted file mode 100644
index 61722840b0b..00000000000
--- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel takes care of the logic to process "material of type holdout",
- * indirect primitive emission, bsdf blurring, probabilistic path termination
- * and AO.
- *
- * This kernel determines the rays for which a shadow_blocked() function
- * associated with AO should be executed. Rays for which such a
- * shadow_blocked() function must be executed are marked with the flag
- * RAY_SHADOW_RAY_CAST_AO and enqueued into the queue
- * QUEUE_SHADOW_RAY_CAST_AO_RAYS.
- *
- * The ray state of rays that are terminated in this kernel is changed to RAY_UPDATE_BUFFER.
- *
- * Note on Queues:
- * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS
- * and processes only the rays of state RAY_ACTIVE.
- * There are different points in this kernel where a ray may terminate and
- * reach RAY_UPDATE_BUFFER state. These rays are enqueued into
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will still be present
- * in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has
- * been changed to RAY_UPDATE_BUFFER, there is no problem.
- *
- * State of queues when this kernel is called:
- * At entry,
- * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and
- * RAY_REGENERATED rays
- * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
- * RAY_TO_REGENERATE rays.
- * - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be empty.
- * At exit,
- * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE,
- * RAY_REGENERATED and RAY_UPDATE_BUFFER rays.
- * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
- * RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
- * - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with
- * flag RAY_SHADOW_RAY_CAST_AO
- */
-
-ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
- KernelGlobals *kg, ccl_local_param BackgroundAOLocals *locals)
-{
- if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- locals->queue_atomics_bg = 0;
- locals->queue_atomics_ao = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
-#ifdef __AO__
- char enqueue_flag = 0;
-#endif
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- ray_index = get_ray_index(kg,
- ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 0);
-
- if (ray_index != QUEUE_EMPTY_SLOT) {
- ccl_global PathState *state = 0x0;
- float3 throughput;
-
- ccl_global char *ray_state = kernel_split_state.ray_state;
- ShaderData *sd = kernel_split_sd(sd, ray_index);
-
- if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
- ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
-
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
-
- throughput = kernel_split_state.throughput[ray_index];
- state = &kernel_split_state.path_state[ray_index];
-
- if (!kernel_path_shader_apply(kg, sd, state, ray, throughput, emission_sd, L, buffer)) {
- kernel_split_path_end(kg, ray_index);
- }
- }
-
- if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- /* Path termination. This is a strange place to put the termination; it is
- * mainly due to the mixed-in MIS that we use. It gives too many unneeded
- * shader evaluations; we only need emission if we are going to terminate.
- */
- float probability = path_state_continuation_probability(kg, state, throughput);
-
- if (probability == 0.0f) {
- kernel_split_path_end(kg, ray_index);
- }
- else if (probability < 1.0f) {
- float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE);
- if (terminate >= probability) {
- kernel_split_path_end(kg, ray_index);
- }
- else {
- kernel_split_state.throughput[ray_index] = throughput / probability;
- }
- }
-
-#ifdef __DENOISING_FEATURES__
- if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- kernel_update_denoising_features(kg, sd, state, L);
- }
-#endif
- }
-
-#ifdef __AO__
- if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- /* ambient occlusion */
- if (kernel_data.integrator.use_ambient_occlusion) {
- enqueue_flag = 1;
- }
- }
-#endif /* __AO__ */
- }
-
-#ifdef __AO__
- /* Enqueue to-shadow-ray-cast rays. */
- enqueue_ray_index_local(ray_index,
- QUEUE_SHADOW_RAY_CAST_AO_RAYS,
- enqueue_flag,
- kernel_split_params.queue_size,
- &locals->queue_atomics_ao,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-#endif
-}
-
-CCL_NAMESPACE_END
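The probabilistic termination branch above follows the usual Russian-roulette scheme: a continuation probability of zero ends the path, and a probability below one either terminates the path or boosts the surviving throughput by 1/probability so the estimator stays unbiased. Below is a minimal standalone sketch of just that decision, not the kernel code; russian_roulette is a hypothetical name and throughput is reduced to a single float for brevity.

#include <cstdio>
#include <random>

/* Returns false if the path is terminated, true if it survives with its
 * throughput boosted so the estimator stays unbiased. */
static bool russian_roulette(float &throughput, float probability, float rng_value)
{
  if (probability == 0.0f) {
    return false;               /* kernel_split_path_end() in the kernel */
  }
  if (probability < 1.0f) {
    if (rng_value >= probability) {
      return false;             /* terminated by the roulette */
    }
    throughput /= probability;  /* survivor compensates for the killed paths */
  }
  return true;
}

int main()
{
  std::mt19937 rng(42);
  std::uniform_real_distribution<float> uniform(0.0f, 1.0f);

  float throughput = 0.25f;
  const bool alive = russian_roulette(throughput, 0.5f, uniform(rng));
  printf("alive=%d throughput=%.3f\n", alive ? 1 : 0, throughput);
  return 0;
}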
diff --git a/intern/cycles/kernel/split/kernel_indirect_background.h b/intern/cycles/kernel/split/kernel_indirect_background.h
deleted file mode 100644
index 6d500650cc0..00000000000
--- a/intern/cycles/kernel/split/kernel_indirect_background.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_indirect_background(KernelGlobals *kg)
-{
- ccl_global char *ray_state = kernel_split_state.ray_state;
-
- int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- int ray_index;
-
- if (kernel_data.integrator.ao_bounces != INT_MAX) {
- ray_index = get_ray_index(kg,
- thread_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 0);
-
- if (ray_index != QUEUE_EMPTY_SLOT) {
- if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- if (path_state_ao_bounce(kg, state)) {
- kernel_split_path_end(kg, ray_index);
- }
- }
- }
- }
-
- ray_index = get_ray_index(kg,
- thread_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 0);
-
- if (ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
-
- if (IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- float3 throughput = kernel_split_state.throughput[ray_index];
- ShaderData *sd = kernel_split_sd(sd, ray_index);
- uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
- ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
-
- kernel_path_background(kg, state, ray, throughput, sd, buffer, L);
- kernel_split_path_end(kg, ray_index);
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_indirect_subsurface.h b/intern/cycles/kernel/split/kernel_indirect_subsurface.h
deleted file mode 100644
index 3f48f8d6f56..00000000000
--- a/intern/cycles/kernel/split/kernel_indirect_subsurface.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_indirect_subsurface(KernelGlobals *kg)
-{
- int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (thread_index == 0) {
- /* We will empty both queues in this kernel. */
- kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
- kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
- }
-
- int ray_index;
- get_ray_index(kg,
- thread_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
- ray_index = get_ray_index(kg,
- thread_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
-
-#ifdef __SUBSURFACE__
- if (ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
-
- ccl_global char *ray_state = kernel_split_state.ray_state;
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
-
- if (IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
- ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
-
- /* Trace indirect subsurface rays by restarting the loop. This uses less
- * stack memory than invoking kernel_path_indirect.
- */
- if (ss_indirect->num_rays) {
- kernel_path_subsurface_setup_indirect(kg, ss_indirect, state, ray, L, throughput);
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- }
- }
-#endif /* __SUBSURFACE__ */
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_lamp_emission.h b/intern/cycles/kernel/split/kernel_lamp_emission.h
deleted file mode 100644
index 7ecb099208d..00000000000
--- a/intern/cycles/kernel/split/kernel_lamp_emission.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel operates on QUEUE_ACTIVE_AND_REGENERATED_RAYS.
- * It processes rays of state RAY_ACTIVE and RAY_HIT_BACKGROUND.
- * We will empty QUEUE_ACTIVE_AND_REGENERATED_RAYS queue in this kernel.
- */
-ccl_device void kernel_lamp_emission(KernelGlobals *kg)
-{
-#ifndef __VOLUME__
- /* We will empty this queue in this kernel. */
- if (ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
- kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
- }
-#endif
- /* Fetch use_queues_flag. */
- char local_use_queues_flag = *kernel_split_params.use_queues_flag;
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (local_use_queues_flag) {
- ray_index = get_ray_index(kg,
- ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
-#ifndef __VOLUME__
- 1
-#else
- 0
-#endif
- );
- if (ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
- }
-
- if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) ||
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND)) {
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
-
- float3 throughput = kernel_split_state.throughput[ray_index];
- Ray ray = kernel_split_state.ray[ray_index];
- ccl_global Intersection *isect = &kernel_split_state.isect[ray_index];
- ShaderData *sd = kernel_split_sd(sd, ray_index);
-
- kernel_path_lamp_emission(kg, state, &ray, throughput, isect, sd, L);
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
deleted file mode 100644
index 320f6a414bf..00000000000
--- a/intern/cycles/kernel/split/kernel_next_iteration_setup.h
+++ /dev/null
@@ -1,258 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel takes care of setting up the ray for the next iteration of
- * path-iteration and of accumulating the radiance corresponding to AO and
- * direct lighting.
- *
- * The ray state of rays that are terminated in this kernel is changed
- * to RAY_UPDATE_BUFFER.
- *
- * Note on queues:
- * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS
- * and processes only the rays of state RAY_ACTIVE.
- * There are different points in this kernel where a ray may terminate and
- * reach the RAY_UPDATE_BUFFER state. These rays are enqueued into
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will still be present
- * in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has
- * been changed to RAY_UPDATE_BUFFER, there is no problem.
- *
- * State of queues when this kernel is called:
- * At entry,
- * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE,
- * RAY_REGENERATED, RAY_UPDATE_BUFFER rays.
- * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
- * RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
- * At exit,
- * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE,
- * RAY_REGENERATED and more RAY_UPDATE_BUFFER rays.
- * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
- * RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays.
- */
-
-#ifdef __BRANCHED_PATH__
-ccl_device_inline void kernel_split_branched_indirect_light_init(KernelGlobals *kg, int ray_index)
-{
- kernel_split_branched_path_indirect_loop_init(kg, ray_index);
-
- ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT);
-}
-
-ccl_device void kernel_split_branched_transparent_bounce(KernelGlobals *kg, int ray_index)
-{
- ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
- ShaderData *sd = kernel_split_sd(sd, ray_index);
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
-
-# ifdef __VOLUME__
- if (!(sd->flag & SD_HAS_ONLY_VOLUME)) {
-# endif
- /* continue in case of transparency */
- *throughput *= shader_bsdf_transparency(kg, sd);
-
- if (is_zero(*throughput)) {
- kernel_split_path_end(kg, ray_index);
- return;
- }
-
- /* Update Path State */
- path_state_next(kg, state, LABEL_TRANSPARENT);
-# ifdef __VOLUME__
- }
- else {
- if (!path_state_volume_next(kg, state)) {
- kernel_split_path_end(kg, ray_index);
- return;
- }
- }
-# endif
-
- ray->P = ray_offset(sd->P, -sd->Ng);
- ray->t -= sd->ray_length; /* clipping works through transparent */
-
-# ifdef __RAY_DIFFERENTIALS__
- ray->dP = sd->dP;
- ray->dD.dx = -sd->dI.dx;
- ray->dD.dy = -sd->dI.dy;
-# endif /* __RAY_DIFFERENTIALS__ */
-
-# ifdef __VOLUME__
- /* enter/exit volume */
- kernel_volume_stack_enter_exit(kg, sd, state->volume_stack);
-# endif /* __VOLUME__ */
-}
-#endif /* __BRANCHED_PATH__ */
-
-ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
- ccl_local_param unsigned int *local_queue_atomics)
-{
- if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- *local_queue_atomics = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- if (ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
- /* If we are here, the scene-intersect kernel has already been executed
- * at least once. From now on, the scene-intersect kernel may operate on
- * queues to fetch the ray index.
- */
- *kernel_split_params.use_queues_flag = 1;
-
- /* Reset the queue indices of the QUEUE_SHADOW_RAY_CAST_AO_RAYS and
- * QUEUE_SHADOW_RAY_CAST_DL_RAYS queues, which were emptied during the
- * previous kernel.
- */
- kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
- kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
- }
-
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- ray_index = get_ray_index(kg,
- ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 0);
-
- ccl_global char *ray_state = kernel_split_state.ray_state;
-
-#ifdef __VOLUME__
- /* Reactivate only volume rays here, most surface work was skipped. */
- if (IS_STATE(ray_state, ray_index, RAY_HAS_ONLY_VOLUME)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_ACTIVE);
- }
-#endif
-
- bool active = IS_STATE(ray_state, ray_index, RAY_ACTIVE);
- if (active) {
- ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- ShaderData *sd = kernel_split_sd(sd, ray_index);
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
-
-#ifdef __BRANCHED_PATH__
- if (!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
-#endif
- /* Compute direct lighting and next bounce. */
- if (!kernel_path_surface_bounce(kg, sd, throughput, state, &L->state, ray)) {
- kernel_split_path_end(kg, ray_index);
- }
-#ifdef __BRANCHED_PATH__
- }
- else if (sd->flag & SD_HAS_ONLY_VOLUME) {
- kernel_split_branched_transparent_bounce(kg, ray_index);
- }
- else {
- kernel_split_branched_indirect_light_init(kg, ray_index);
-
- if (kernel_split_branched_path_surface_indirect_light_iter(
- kg, ray_index, 1.0f, kernel_split_sd(branched_state_sd, ray_index), true, true)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- }
- else {
- kernel_split_branched_path_indirect_loop_end(kg, ray_index);
- kernel_split_branched_transparent_bounce(kg, ray_index);
- }
- }
-#endif /* __BRANCHED_PATH__ */
- }
-
- /* Enqueue RAY_UPDATE_BUFFER rays. */
- enqueue_ray_index_local(ray_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER) && active,
- kernel_split_params.queue_size,
- local_queue_atomics,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-
-#ifdef __BRANCHED_PATH__
- /* iter loop */
- if (ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
- kernel_split_params.queue_index[QUEUE_LIGHT_INDIRECT_ITER] = 0;
- }
-
- ray_index = get_ray_index(kg,
- ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
- QUEUE_LIGHT_INDIRECT_ITER,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
-
- if (IS_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER)) {
- /* for render passes, sum and reset indirect light pass variables
- * for the next samples */
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
-
- path_radiance_sum_indirect(L);
- path_radiance_reset_indirect(L);
-
- if (kernel_split_branched_path_surface_indirect_light_iter(
- kg, ray_index, 1.0f, kernel_split_sd(branched_state_sd, ray_index), true, true)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- }
- else {
- kernel_split_branched_path_indirect_loop_end(kg, ray_index);
- kernel_split_branched_transparent_bounce(kg, ray_index);
- }
- }
-
-# ifdef __VOLUME__
- /* Enqueue RAY_VOLUME_INDIRECT_NEXT_ITER rays */
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
- if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- *local_queue_atomics = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- enqueue_ray_index_local(
- ray_index,
- QUEUE_VOLUME_INDIRECT_ITER,
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER),
- kernel_split_params.queue_size,
- local_queue_atomics,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-
-# endif /* __VOLUME__ */
-
-# ifdef __SUBSURFACE__
- /* Enqueue RAY_SUBSURFACE_INDIRECT_NEXT_ITER rays */
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
- if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- *local_queue_atomics = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- enqueue_ray_index_local(
- ray_index,
- QUEUE_SUBSURFACE_INDIRECT_ITER,
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER),
- kernel_split_params.queue_size,
- local_queue_atomics,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-# endif /* __SUBSURFACE__ */
-#endif /* __BRANCHED_PATH__ */
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_path_init.h b/intern/cycles/kernel/split/kernel_path_init.h
deleted file mode 100644
index c686f46a0cd..00000000000
--- a/intern/cycles/kernel/split/kernel_path_init.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel initializes structures needed in path-iteration kernels.
- * This is the first kernel in ray-tracing logic.
- *
- * The ray state of rays outside the tile boundary will be marked RAY_INACTIVE.
- */
-ccl_device void kernel_path_init(KernelGlobals *kg)
-{
- int ray_index = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0);
-
- /* This is the first assignment to ray_state,
- * so we don't use the ASSIGN_RAY_STATE macro.
- */
- kernel_split_state.ray_state[ray_index] = RAY_ACTIVE;
-
- /* Get work. */
- ccl_global uint *work_pools = kernel_split_params.work_pools;
- uint total_work_size = kernel_split_params.total_work_size;
- uint work_index;
-
- if (!get_next_work(kg, work_pools, total_work_size, ray_index, &work_index)) {
- /* No more work, mark ray as inactive */
- kernel_split_state.ray_state[ray_index] = RAY_INACTIVE;
-
- return;
- }
-
- ccl_global WorkTile *tile = &kernel_split_params.tile;
- uint x, y, sample;
- get_work_pixel(tile, work_index, &x, &y, &sample);
-
- /* Store buffer offset for writing to passes. */
- uint buffer_offset = (tile->offset + x + y * tile->stride) * kernel_data.film.pass_stride;
- kernel_split_state.buffer_offset[ray_index] = buffer_offset;
-
- /* Initialize random numbers and ray. */
- uint rng_hash;
- kernel_path_trace_setup(kg, sample, x, y, &rng_hash, &kernel_split_state.ray[ray_index]);
-
- if (kernel_split_state.ray[ray_index].t != 0.0f) {
- /* Initialize throughput, path radiance, Ray, PathState;
- * These rays proceed with path-iteration.
- */
- kernel_split_state.throughput[ray_index] = make_float3(1.0f, 1.0f, 1.0f);
- path_radiance_init(kg, &kernel_split_state.path_radiance[ray_index]);
- path_state_init(kg,
- AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]),
- &kernel_split_state.path_state[ray_index],
- rng_hash,
- sample,
- &kernel_split_state.ray[ray_index]);
-#ifdef __SUBSURFACE__
- kernel_path_subsurface_init_indirect(&kernel_split_state.ss_rays[ray_index]);
-#endif
- }
- else {
- ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE);
- }
-}
-
-CCL_NAMESPACE_END
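kernel_path_init() above maps a flat work index to a pixel and sample via get_work_pixel() and then derives the pass-buffer offset. The sketch below shows one plausible decomposition under the assumption of sample-major ordering; the exact ordering used by Cycles may differ, and Tile/work_pixel are trimmed-down, illustrative stand-ins for the real WorkTile and helper.

#include <cstdio>

/* Trimmed-down stand-in for the real WorkTile. */
struct Tile {
  unsigned x, y, w, h;     /* tile rectangle inside the full frame */
  unsigned offset, stride; /* addressing of the tile inside the render buffer */
};

/* One plausible decomposition of a flat work index into (x, y, sample),
 * assuming sample-major ordering; the actual get_work_pixel() may order
 * the work differently. */
static void work_pixel(const Tile &tile, unsigned work_index,
                       unsigned *x, unsigned *y, unsigned *sample)
{
  const unsigned pixels_in_tile = tile.w * tile.h;
  *sample = work_index / pixels_in_tile;
  const unsigned pixel = work_index % pixels_in_tile;
  *y = tile.y + pixel / tile.w;
  *x = tile.x + pixel % tile.w;
}

int main()
{
  const Tile tile = {0, 0, 4, 4, 0, 4};
  const unsigned pass_stride = 8; /* illustrative number of floats per pixel */

  unsigned x, y, sample;
  work_pixel(tile, 21, &x, &y, &sample); /* 21 -> sample 1, pixel (1, 1) */

  /* Same buffer-offset formula as in kernel_path_init(). */
  const unsigned buffer_offset = (tile.offset + x + y * tile.stride) * pass_stride;
  printf("x=%u y=%u sample=%u buffer_offset=%u\n", x, y, sample, buffer_offset);
  return 0;
}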
diff --git a/intern/cycles/kernel/split/kernel_queue_enqueue.h b/intern/cycles/kernel/split/kernel_queue_enqueue.h
deleted file mode 100644
index 2db87f7a671..00000000000
--- a/intern/cycles/kernel/split/kernel_queue_enqueue.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright 2011-2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel enqueues rays of different ray state into their
- * appropriate queues:
- *
- * 1. Rays that have been determined to hit the background from the
- * "kernel_scene_intersect" kernel are enqueued in
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
- * 2. Rays that have been determined to be actively participating in
- * path-iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS.
- *
- * State of the queues at other times this kernel is called:
- * At entry,
- * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty.
- * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE
- * and RAY_UPDATE_BUFFER rays.
- * At exit,
- * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays.
- * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
- * RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays.
- */
-ccl_device void kernel_queue_enqueue(KernelGlobals *kg, ccl_local_param QueueEnqueueLocals *locals)
-{
- /* We have only 2 cases (Hit/Not-Hit) */
- int lidx = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0);
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
-
- if (lidx == 0) {
- locals->queue_atomics[0] = 0;
- locals->queue_atomics[1] = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- int queue_number = -1;
-
- if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND) ||
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER) ||
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) {
- queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
- }
- else if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) ||
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HAS_ONLY_VOLUME) ||
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) {
- queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS;
- }
-
- unsigned int my_lqidx;
- if (queue_number != -1) {
- my_lqidx = get_local_queue_index(queue_number, locals->queue_atomics);
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- if (lidx == 0) {
- locals->queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = get_global_per_queue_offset(
- QUEUE_ACTIVE_AND_REGENERATED_RAYS, locals->queue_atomics, kernel_split_params.queue_index);
- locals->queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = get_global_per_queue_offset(
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- locals->queue_atomics,
- kernel_split_params.queue_index);
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- unsigned int my_gqidx;
- if (queue_number != -1) {
- my_gqidx = get_global_queue_index(
- queue_number, kernel_split_params.queue_size, my_lqidx, locals->queue_atomics);
- kernel_split_state.queue_data[my_gqidx] = ray_index;
- }
-}
-
-CCL_NAMESPACE_END
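The local/global atomics dance above (get_local_queue_index, get_global_per_queue_offset, get_global_queue_index) implements a standard two-level enqueue: a work-group first counts its entries with cheap local atomics, the group leader reserves a contiguous block in the global queue with a single global atomic, and each thread then writes its ray index into its reserved slot. Below is a minimal host-side C++ sketch of that pattern; Workgroup and enqueue_group are illustrative names, not the kernel code.

#include <atomic>
#include <cstdio>
#include <vector>

static std::atomic<int> global_queue_index{0};     /* one counter per queue in the real kernels */
static std::vector<int> global_queue_data(64, -1); /* flat global queue storage */

struct Workgroup {
  std::vector<int> ray_indices; /* rays this work-group wants to enqueue */
};

static void enqueue_group(const Workgroup &wg)
{
  /* "Local" pass: each thread grabs a compact slot within its work-group
   * (a local atomic increment on the GPU; a plain counter here). */
  int local_counter = 0;
  std::vector<int> local_slot(wg.ray_indices.size());
  for (size_t t = 0; t < wg.ray_indices.size(); t++) {
    local_slot[t] = local_counter++;
  }

  /* "Leader" pass: one global atomic reserves a contiguous block for the group. */
  const int group_offset = global_queue_index.fetch_add(local_counter);

  /* Every thread writes its ray index into its reserved global slot. */
  for (size_t t = 0; t < wg.ray_indices.size(); t++) {
    global_queue_data[group_offset + local_slot[t]] = wg.ray_indices[t];
  }
}

int main()
{
  enqueue_group({{3, 7, 11}});
  enqueue_group({{2, 5}});
  for (int i = 0; i < global_queue_index.load(); i++) {
    printf("queue[%d] = %d\n", i, global_queue_data[i]);
  }
  return 0;
}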
diff --git a/intern/cycles/kernel/split/kernel_scene_intersect.h b/intern/cycles/kernel/split/kernel_scene_intersect.h
deleted file mode 100644
index 9ac95aafd2f..00000000000
--- a/intern/cycles/kernel/split/kernel_scene_intersect.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel takes care of the scene_intersect() function.
- *
- * This kernel changes the ray_state of RAY_REGENERATED rays to RAY_ACTIVE.
- * This kernel processes rays of ray state RAY_ACTIVE.
- * This kernel determines the rays that have hit the background and changes
- * their ray state to RAY_HIT_BACKGROUND.
- */
-ccl_device void kernel_scene_intersect(KernelGlobals *kg)
-{
- /* Fetch use_queues_flag */
- char local_use_queues_flag = *kernel_split_params.use_queues_flag;
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (local_use_queues_flag) {
- ray_index = get_ray_index(kg,
- ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 0);
-
- if (ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
- }
-
- /* All regenerated rays become active here */
- if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) {
-#ifdef __BRANCHED_PATH__
- if (kernel_split_state.branched_state[ray_index].waiting_on_shared_samples) {
- kernel_split_path_end(kg, ray_index);
- }
- else
-#endif /* __BRANCHED_PATH__ */
- {
- ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE);
- }
- }
-
- if (!IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
- return;
- }
-
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- Ray ray = kernel_split_state.ray[ray_index];
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
-
- Intersection isect;
- const int last_object = state->bounce > 0 ?
- intersection_get_object(kg, &kernel_split_state.isect[ray_index]) :
- OBJECT_NONE;
- bool hit = kernel_path_scene_intersect(kg, state, &ray, &isect, L, last_object);
- kernel_split_state.isect[ray_index] = isect;
-
- if (!hit) {
- /* Change the state of rays that hit the background;
- * These rays undergo special processing in the
- * background_bufferUpdate kernel.
- */
- ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND);
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h
deleted file mode 100644
index c760a2b2049..00000000000
--- a/intern/cycles/kernel/split/kernel_shader_eval.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel evaluates ShaderData structure from the values computed
- * by the previous kernels.
- */
-ccl_device void kernel_shader_eval(KernelGlobals *kg)
-{
-
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- /* Sorting on the CUDA split kernel is not implemented. */
-#ifdef __KERNEL_CUDA__
- int queue_index = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS];
-#else
- int queue_index = kernel_split_params.queue_index[QUEUE_SHADER_SORTED_RAYS];
-#endif
- if (ray_index >= queue_index) {
- return;
- }
- ray_index = get_ray_index(kg,
- ray_index,
-#ifdef __KERNEL_CUDA__
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
-#else
- QUEUE_SHADER_SORTED_RAYS,
-#endif
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 0);
-
- if (ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
-
- ccl_global char *ray_state = kernel_split_state.ray_state;
- if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
- ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
-
- shader_eval_surface(kg, kernel_split_sd(sd, ray_index), state, buffer, state->flag);
-#ifdef __BRANCHED_PATH__
- if (kernel_data.integrator.branched) {
- shader_merge_closures(kernel_split_sd(sd, ray_index));
- }
- else
-#endif
- {
- shader_prepare_closures(kernel_split_sd(sd, ray_index), state);
- }
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shader_setup.h b/intern/cycles/kernel/split/kernel_shader_setup.h
deleted file mode 100644
index 551836d1653..00000000000
--- a/intern/cycles/kernel/split/kernel_shader_setup.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel sets up the ShaderData structure from the values computed
- * by the previous kernels.
- *
- * It also identifies the rays of state RAY_TO_REGENERATE and enqueues them
- * in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
- */
-ccl_device void kernel_shader_setup(KernelGlobals *kg,
- ccl_local_param unsigned int *local_queue_atomics)
-{
- /* Enqueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */
- if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- *local_queue_atomics = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- int queue_index = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS];
- if (ray_index < queue_index) {
- ray_index = get_ray_index(kg,
- ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 0);
- }
- else {
- ray_index = QUEUE_EMPTY_SLOT;
- }
-
- char enqueue_flag = (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 :
- 0;
- enqueue_ray_index_local(ray_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- enqueue_flag,
- kernel_split_params.queue_size,
- local_queue_atomics,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-
- /* Continue on with shader evaluation. */
- if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
- Intersection isect = kernel_split_state.isect[ray_index];
- Ray ray = kernel_split_state.ray[ray_index];
- ShaderData *sd = kernel_split_sd(sd, ray_index);
-
- shader_setup_from_ray(kg, sd, &isect, &ray);
-
-#ifdef __VOLUME__
- if (sd->flag & SD_HAS_ONLY_VOLUME) {
- ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_HAS_ONLY_VOLUME);
- }
-#endif
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shader_sort.h b/intern/cycles/kernel/split/kernel_shader_sort.h
deleted file mode 100644
index 95d33a42014..00000000000
--- a/intern/cycles/kernel/split/kernel_shader_sort.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_shader_sort(KernelGlobals *kg, ccl_local_param ShaderSortLocals *locals)
-{
-#ifndef __KERNEL_CUDA__
- int tid = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- uint qsize = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS];
- if (tid == 0) {
- kernel_split_params.queue_index[QUEUE_SHADER_SORTED_RAYS] = qsize;
- }
-
- uint offset = (tid / SHADER_SORT_LOCAL_SIZE) * SHADER_SORT_BLOCK_SIZE;
- if (offset >= qsize) {
- return;
- }
-
- int lid = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0);
- uint input = QUEUE_ACTIVE_AND_REGENERATED_RAYS * (kernel_split_params.queue_size);
- uint output = QUEUE_SHADER_SORTED_RAYS * (kernel_split_params.queue_size);
- ccl_local uint *local_value = &locals->local_value[0];
- ccl_local ushort *local_index = &locals->local_index[0];
-
- /* copy to local memory */
- for (uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) {
- uint idx = offset + i + lid;
- uint add = input + idx;
- uint value = (~0);
- if (idx < qsize) {
- int ray_index = kernel_split_state.queue_data[add];
- bool valid = (ray_index != QUEUE_EMPTY_SLOT) &&
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE);
- if (valid) {
- value = kernel_split_sd(sd, ray_index)->shader & SHADER_MASK;
- }
- }
- local_value[i + lid] = value;
- local_index[i + lid] = i + lid;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- /* Skip sorting for the CPU split kernel. */
-# ifdef __KERNEL_OPENCL__
-
- /* bitonic sort */
- for (uint length = 1; length < SHADER_SORT_BLOCK_SIZE; length <<= 1) {
- for (uint inc = length; inc > 0; inc >>= 1) {
- for (uint ii = 0; ii < SHADER_SORT_BLOCK_SIZE; ii += SHADER_SORT_LOCAL_SIZE) {
- uint i = lid + ii;
- bool direction = ((i & (length << 1)) != 0);
- uint j = i ^ inc;
- ushort ioff = local_index[i];
- ushort joff = local_index[j];
- uint iKey = local_value[ioff];
- uint jKey = local_value[joff];
- bool smaller = (jKey < iKey) || (jKey == iKey && j < i);
- bool swap = smaller ^ (j < i) ^ direction;
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
- local_index[i] = (swap) ? joff : ioff;
- local_index[j] = (swap) ? ioff : joff;
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
- }
- }
- }
-# endif /* __KERNEL_OPENCL__ */
-
- /* copy to destination */
- for (uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) {
- uint idx = offset + i + lid;
- uint lidx = local_index[i + lid];
- uint outi = output + idx;
- uint ini = input + offset + lidx;
- uint value = local_value[lidx];
- if (idx < qsize) {
- kernel_split_state.queue_data[outi] = (value == (~0)) ? QUEUE_EMPTY_SLOT :
- kernel_split_state.queue_data[ini];
- }
- }
-#endif /* __KERNEL_CUDA__ */
-}
-
-CCL_NAMESPACE_END
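The sort above is a textbook in-place bitonic sorting network run block-wise over shader keys in local memory (and skipped entirely on CUDA). Below is a minimal standalone version of the same network that sorts plain keys instead of (key, ray-index) pairs; it assumes a power-of-two input size and is an illustration, not the kernel code.

#include <algorithm>
#include <cstdio>
#include <vector>

/* In-place bitonic sort; keys.size() must be a power of two. The kernel
 * version additionally carries a ray-index permutation alongside the keys. */
static void bitonic_sort(std::vector<unsigned> &keys)
{
  const size_t n = keys.size();
  for (size_t block = 2; block <= n; block <<= 1) {        /* size of the bitonic sequences */
    for (size_t dist = block >> 1; dist > 0; dist >>= 1) { /* compare distance */
      for (size_t i = 0; i < n; i++) {
        const size_t j = i ^ dist;
        if (j > i) {
          const bool ascending = ((i & block) == 0);
          if ((keys[i] > keys[j]) == ascending) {
            std::swap(keys[i], keys[j]);
          }
        }
      }
    }
  }
}

int main()
{
  std::vector<unsigned> shader_keys = {6, 1, 4, 7, 0, 3, 5, 2};
  bitonic_sort(shader_keys);
  for (unsigned key : shader_keys) {
    printf("%u ", key);
  }
  printf("\n");
  return 0;
}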
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
deleted file mode 100644
index 5d772fc597b..00000000000
--- a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* Shadow ray cast for AO. */
-ccl_device void kernel_shadow_blocked_ao(KernelGlobals *kg)
-{
- unsigned int ao_queue_length = kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS];
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- int ray_index = QUEUE_EMPTY_SLOT;
- int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (thread_index < ao_queue_length) {
- ray_index = get_ray_index(kg,
- thread_index,
- QUEUE_SHADOW_RAY_CAST_AO_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
- }
-
- if (ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
-
- ShaderData *sd = kernel_split_sd(sd, ray_index);
- ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- float3 throughput = kernel_split_state.throughput[ray_index];
-
-#ifdef __BRANCHED_PATH__
- if (!kernel_data.integrator.branched ||
- IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
-#endif
- kernel_path_ao(kg, sd, emission_sd, L, state, throughput, shader_bsdf_alpha(kg, sd));
-#ifdef __BRANCHED_PATH__
- }
- else {
- kernel_branched_path_ao(kg, sd, emission_sd, L, state, throughput);
- }
-#endif
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
deleted file mode 100644
index 5e46d300bca..00000000000
--- a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* Shadow ray cast for direct visible light. */
-ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
-{
- unsigned int dl_queue_length = kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS];
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- int ray_index = QUEUE_EMPTY_SLOT;
- int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (thread_index < dl_queue_length) {
- ray_index = get_ray_index(kg,
- thread_index,
- QUEUE_SHADOW_RAY_CAST_DL_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
- }
-
-#ifdef __BRANCHED_PATH__
- /* TODO(mai): move this somewhere else? */
- if (thread_index == 0) {
- /* Clear QUEUE_INACTIVE_RAYS before next kernel. */
- kernel_split_params.queue_index[QUEUE_INACTIVE_RAYS] = 0;
- }
-#endif /* __BRANCHED_PATH__ */
-
- if (ray_index == QUEUE_EMPTY_SLOT)
- return;
-
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- Ray ray = kernel_split_state.light_ray[ray_index];
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ShaderData *sd = kernel_split_sd(sd, ray_index);
- float3 throughput = kernel_split_state.throughput[ray_index];
-
- BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index];
- ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
- bool is_lamp = kernel_split_state.is_lamp[ray_index];
-
-#if defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__)
- bool use_branched = false;
- int all = 0;
-
- if (state->flag & PATH_RAY_SHADOW_CATCHER) {
- use_branched = true;
- all = 1;
- }
-# if defined(__BRANCHED_PATH__)
- else if (kernel_data.integrator.branched) {
- use_branched = true;
-
- if (IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
- all = (kernel_data.integrator.sample_all_lights_indirect);
- }
- else {
- all = (kernel_data.integrator.sample_all_lights_direct);
- }
- }
-# endif /* __BRANCHED_PATH__ */
-
- if (use_branched) {
- kernel_branched_path_surface_connect_light(
- kg, sd, emission_sd, state, throughput, 1.0f, L, all);
- }
- else
-#endif /* defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__)*/
- {
- /* trace shadow ray */
- float3 shadow;
-
- if (!shadow_blocked(kg, sd, emission_sd, state, &ray, &shadow)) {
- /* accumulate */
- path_radiance_accum_light(kg, L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
- }
- else {
- path_radiance_accum_total_light(L, state, throughput, &L_light);
- }
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h
deleted file mode 100644
index 5114f2b03e5..00000000000
--- a/intern/cycles/kernel/split/kernel_split_common.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __KERNEL_SPLIT_H__
-#define __KERNEL_SPLIT_H__
-
-// clang-format off
-#include "kernel/kernel_math.h"
-#include "kernel/kernel_types.h"
-
-#include "kernel/split/kernel_split_data.h"
-
-#include "kernel/kernel_globals.h"
-#include "kernel/kernel_color.h"
-
-#ifdef __OSL__
-# include "kernel/osl/osl_shader.h"
-#endif
-
-#ifdef __KERNEL_OPENCL__
-# include "kernel/kernels/opencl/kernel_opencl_image.h"
-#endif
-#ifdef __KERNEL_CUDA__
-# include "kernel/kernels/cuda/kernel_cuda_image.h"
-#endif
-#ifdef __KERNEL_CPU__
-# include "kernel/kernels/cpu/kernel_cpu_image.h"
-#endif
-
-#include "util/util_atomic.h"
-
-#include "kernel/kernel_path.h"
-#ifdef __BRANCHED_PATH__
-# include "kernel/kernel_path_branched.h"
-#endif
-
-#include "kernel/kernel_queues.h"
-#include "kernel/kernel_work_stealing.h"
-
-#ifdef __BRANCHED_PATH__
-# include "kernel/split/kernel_branched.h"
-#endif
-// clang-format on
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device_inline void kernel_split_path_end(KernelGlobals *kg, int ray_index)
-{
- ccl_global char *ray_state = kernel_split_state.ray_state;
-
-#ifdef __BRANCHED_PATH__
-# ifdef __SUBSURFACE__
- ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
-
- if (ss_indirect->num_rays) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
- }
- else
-# endif /* __SUBSURFACE__ */
- if (IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT_SHARED)) {
- int orig_ray = kernel_split_state.branched_state[ray_index].original_ray;
-
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- PathRadiance *orig_ray_L = &kernel_split_state.path_radiance[orig_ray];
-
- path_radiance_sum_indirect(L);
- path_radiance_accum_sample(orig_ray_L, L);
-
- atomic_fetch_and_dec_uint32(
- (ccl_global uint *)&kernel_split_state.branched_state[orig_ray].shared_sample_count);
-
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
- }
- else if (IS_FLAG(ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER);
- }
- else if (IS_FLAG(ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER);
- }
- else if (IS_FLAG(ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER);
- }
- else {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
- }
-#else
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-#endif
-}
-
-CCL_NAMESPACE_END
-
-#endif /* __KERNEL_SPLIT_H__ */
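kernel_split_path_end above shows how a branched indirect sample that was farmed out to another thread is folded back in: the helper sums its indirect radiance into the original ray's PathRadiance and atomically decrements that ray's shared_sample_count, while the owning ray keeps iterating until the count reaches zero. A small sketch of that bookkeeping, with std::atomic standing in for the device atomics and a single float standing in for PathRadiance (if helpers ran concurrently, the accumulation would also need to be atomic; it is kept plain for brevity):

// Sketch of the shared-sample bookkeeping in kernel_split_path_end.
#include <atomic>

struct SharedSampleState {
  std::atomic<int> shared_sample_count{0}; /* samples handed off to other threads */
  float radiance = 0.0f;                   /* accumulated result of the original ray */
};

// Called by a helper when it finishes a sample it took over from `owner`.
void finish_shared_sample(SharedSampleState &owner, float sample_radiance)
{
  owner.radiance += sample_radiance;      /* fold the result back into the original ray */
  owner.shared_sample_count.fetch_sub(1); /* signal completion */
}

// The owning ray only terminates once every shared sample has completed.
bool owner_can_terminate(const SharedSampleState &owner)
{
  return owner.shared_sample_count.load() == 0;
}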
diff --git a/intern/cycles/kernel/split/kernel_split_data.h b/intern/cycles/kernel/split/kernel_split_data.h
deleted file mode 100644
index decc537b39b..00000000000
--- a/intern/cycles/kernel/split/kernel_split_data.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright 2011-2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __KERNEL_SPLIT_DATA_H__
-#define __KERNEL_SPLIT_DATA_H__
-
-#include "kernel/split/kernel_split_data_types.h"
-
-#include "kernel/kernel_globals.h"
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device_inline uint64_t split_data_buffer_size(KernelGlobals *kg, size_t num_elements)
-{
- (void)kg; /* Unused on CPU. */
-
- uint64_t size = 0;
-#define SPLIT_DATA_ENTRY(type, name, num) +align_up(num_elements *num * sizeof(type), 16)
- size = size SPLIT_DATA_ENTRIES;
-#undef SPLIT_DATA_ENTRY
-
- uint64_t closure_size = sizeof(ShaderClosure) * (kernel_data.integrator.max_closures - 1);
-
-#ifdef __BRANCHED_PATH__
- size += align_up(num_elements * (sizeof(ShaderData) + closure_size), 16);
-#endif
-
- size += align_up(num_elements * (sizeof(ShaderData) + closure_size), 16);
-
- return size;
-}
-
-ccl_device_inline void split_data_init(KernelGlobals *kg,
- ccl_global SplitData *split_data,
- size_t num_elements,
- ccl_global void *data,
- ccl_global char *ray_state)
-{
- (void)kg; /* Unused on CPU. */
-
- ccl_global char *p = (ccl_global char *)data;
-
-#define SPLIT_DATA_ENTRY(type, name, num) \
- split_data->name = (type *)p; \
- p += align_up(num_elements * num * sizeof(type), 16);
- SPLIT_DATA_ENTRIES;
-#undef SPLIT_DATA_ENTRY
-
- uint64_t closure_size = sizeof(ShaderClosure) * (kernel_data.integrator.max_closures - 1);
-
-#ifdef __BRANCHED_PATH__
- split_data->_branched_state_sd = (ShaderData *)p;
- p += align_up(num_elements * (sizeof(ShaderData) + closure_size), 16);
-#endif
-
- split_data->_sd = (ShaderData *)p;
- p += align_up(num_elements * (sizeof(ShaderData) + closure_size), 16);
-
- split_data->ray_state = ray_state;
-}
-
-CCL_NAMESPACE_END
-
-#endif /* __KERNEL_SPLIT_DATA_H__ */
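split_data_buffer_size and split_data_init above expand the same SPLIT_DATA_ENTRY X-macro list twice: once to add up 16-byte-aligned per-entry sizes, and once to carve typed per-ray arrays out of a single device allocation. A standalone sketch of the trick with two made-up fields (throughput, shader_index):

// Standalone sketch of the X-macro layout used by split_data_buffer_size and
// split_data_init: one entry list, expanded once to size a single buffer and
// once to slice it into typed per-ray arrays.
#include <cstddef>

inline size_t align_up_16(size_t x) { return (x + 15) & ~size_t(15); }

#define MY_DATA_ENTRIES \
  MY_DATA_ENTRY(float, throughput, 1) \
  MY_DATA_ENTRY(int, shader_index, 1)

struct MySplitData {
#define MY_DATA_ENTRY(type, name, num) type *name;
  MY_DATA_ENTRIES
#undef MY_DATA_ENTRY
};

size_t my_buffer_size(size_t num_elements)
{
  size_t size = 0;
#define MY_DATA_ENTRY(type, name, num) size += align_up_16(num_elements * (num) * sizeof(type));
  MY_DATA_ENTRIES
#undef MY_DATA_ENTRY
  return size;
}

void my_data_init(MySplitData *data, size_t num_elements, void *buffer)
{
  char *p = static_cast<char *>(buffer);
#define MY_DATA_ENTRY(type, name, num) \
  data->name = reinterpret_cast<type *>(p); \
  p += align_up_16(num_elements * (num) * sizeof(type));
  MY_DATA_ENTRIES
#undef MY_DATA_ENTRY
}

A single device allocation of my_buffer_size(n) bytes then backs every per-ray array handed out by my_data_init, which is what let the split kernels keep all path state resident between launches.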
diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h
deleted file mode 100644
index 06bdce9947d..00000000000
--- a/intern/cycles/kernel/split/kernel_split_data_types.h
+++ /dev/null
@@ -1,180 +0,0 @@
-/*
- * Copyright 2011-2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __KERNEL_SPLIT_DATA_TYPES_H__
-#define __KERNEL_SPLIT_DATA_TYPES_H__
-
-CCL_NAMESPACE_BEGIN
-
-/* parameters used by the split kernels, we use a single struct to avoid passing these to each
- * kernel */
-
-typedef struct SplitParams {
- WorkTile tile;
- uint total_work_size;
-
- ccl_global unsigned int *work_pools;
-
- ccl_global int *queue_index;
- int queue_size;
- ccl_global char *use_queues_flag;
-
- /* Place for storing sd->flag. AMD GPU OpenCL compiler workaround */
- int dummy_sd_flag;
-} SplitParams;
-
-/* Global memory variables [porting]; this memory is used for
- * cooperation between different kernels: data written by one
- * kernel is made available to other kernels through this global
- * memory.
- */
-
-/* SPLIT_DATA_ENTRY(type, name, num) */
-
-#ifdef __BRANCHED_PATH__
-
-typedef ccl_global struct SplitBranchedState {
- /* various state that must be kept and restored after an indirect loop */
- PathState path_state;
- float3 throughput;
- Ray ray;
-
- Intersection isect;
-
- char ray_state;
-
- /* indirect loop state */
- int next_closure;
- int next_sample;
-
-# ifdef __SUBSURFACE__
- int ss_next_closure;
- int ss_next_sample;
- int next_hit;
- int num_hits;
-
- uint lcg_state;
- LocalIntersection ss_isect;
-# endif /* __SUBSURFACE__ */
-
- int shared_sample_count; /* number of branched samples shared with other threads */
- int original_ray; /* index of original ray when sharing branched samples */
- bool waiting_on_shared_samples;
-} SplitBranchedState;
-
-# define SPLIT_DATA_BRANCHED_ENTRIES \
- SPLIT_DATA_ENTRY(SplitBranchedState, branched_state, 1) \
- SPLIT_DATA_ENTRY(ShaderData, _branched_state_sd, 0)
-#else
-# define SPLIT_DATA_BRANCHED_ENTRIES
-#endif /* __BRANCHED_PATH__ */
-
-#ifdef __SUBSURFACE__
-# define SPLIT_DATA_SUBSURFACE_ENTRIES \
- SPLIT_DATA_ENTRY(ccl_global SubsurfaceIndirectRays, ss_rays, 1)
-#else
-# define SPLIT_DATA_SUBSURFACE_ENTRIES
-#endif /* __SUBSURFACE__ */
-
-#ifdef __VOLUME__
-# define SPLIT_DATA_VOLUME_ENTRIES SPLIT_DATA_ENTRY(ccl_global PathState, state_shadow, 1)
-#else
-# define SPLIT_DATA_VOLUME_ENTRIES
-#endif /* __VOLUME__ */
-
-#define SPLIT_DATA_ENTRIES \
- SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \
- SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \
- SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \
- SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \
- SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \
- SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \
- SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \
- SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \
- SPLIT_DATA_ENTRY( \
- ccl_global int, queue_data, (NUM_QUEUES * 2)) /* TODO(mai): this is too large? */ \
- SPLIT_DATA_ENTRY(ccl_global uint, buffer_offset, 1) \
- SPLIT_DATA_ENTRY(ShaderDataTinyStorage, sd_DL_shadow, 1) \
- SPLIT_DATA_SUBSURFACE_ENTRIES \
- SPLIT_DATA_VOLUME_ENTRIES \
- SPLIT_DATA_BRANCHED_ENTRIES \
- SPLIT_DATA_ENTRY(ShaderData, _sd, 0)
-
-/* Entries to be copied to inactive rays when sharing branched samples
- * (TODO: which are actually needed?) */
-#define SPLIT_DATA_ENTRIES_BRANCHED_SHARED \
- SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \
- SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \
- SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \
- SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \
- SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \
- SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \
- SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \
- SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \
- SPLIT_DATA_ENTRY(ShaderDataTinyStorage, sd_DL_shadow, 1) \
- SPLIT_DATA_SUBSURFACE_ENTRIES \
- SPLIT_DATA_VOLUME_ENTRIES \
- SPLIT_DATA_BRANCHED_ENTRIES \
- SPLIT_DATA_ENTRY(ShaderData, _sd, 0)
-
-/* struct that holds pointers to data in the shared state buffer */
-typedef struct SplitData {
-#define SPLIT_DATA_ENTRY(type, name, num) type *name;
- SPLIT_DATA_ENTRIES
-#undef SPLIT_DATA_ENTRY
-
-  /* This is actually stored in a separate buffer from the rest of the split state data
-   * (so it can be read back from the host easily), but it is used the same way as the
-   * other entries, so it is kept in this struct as well.
-   */
- ccl_global char *ray_state;
-} SplitData;
-
-#ifndef __KERNEL_CUDA__
-# define kernel_split_state (kg->split_data)
-# define kernel_split_params (kg->split_param_data)
-#else
-__device__ SplitData __split_data;
-# define kernel_split_state (__split_data)
-__device__ SplitParams __split_param_data;
-# define kernel_split_params (__split_param_data)
-#endif /* __KERNEL_CUDA__ */
-
-#define kernel_split_sd(sd, ray_index) \
- ((ShaderData *)(((ccl_global char *)kernel_split_state._##sd) + \
- (sizeof(ShaderData) + \
- sizeof(ShaderClosure) * (kernel_data.integrator.max_closures - 1)) * \
- (ray_index)))
-
-/* Local storage for queue_enqueue kernel. */
-typedef struct QueueEnqueueLocals {
- uint queue_atomics[2];
-} QueueEnqueueLocals;
-
-/* Local storage for holdout_emission_blurring_pathtermination_ao kernel. */
-typedef struct BackgroundAOLocals {
- uint queue_atomics_bg;
- uint queue_atomics_ao;
-} BackgroundAOLocals;
-
-typedef struct ShaderSortLocals {
- uint local_value[SHADER_SORT_BLOCK_SIZE];
- ushort local_index[SHADER_SORT_BLOCK_SIZE];
-} ShaderSortLocals;
-
-CCL_NAMESPACE_END
-
-#endif /* __KERNEL_SPLIT_DATA_TYPES_H__ */
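The kernel_split_sd macro above indexes an array whose element size is only known at runtime: each record is a ShaderData header followed by max_closures - 1 extra ShaderClosure slots, so the stride has to be computed in bytes rather than relying on ShaderData pointer arithmetic. A sketch with stand-in types for ShaderData/ShaderClosure:

// Sketch of the byte-stride indexing behind the kernel_split_sd macro.
#include <cstddef>

struct Closure {
  float weight[4];
};

struct ShaderDataHeader {
  int num_closure;
  Closure closure[1]; /* in memory, max_closures entries follow */
};

inline ShaderDataHeader *indexed_sd(void *base, int max_closures, int ray_index)
{
  const size_t stride = sizeof(ShaderDataHeader) + sizeof(Closure) * (max_closures - 1);
  return reinterpret_cast<ShaderDataHeader *>(static_cast<char *>(base) + stride * ray_index);
}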
diff --git a/intern/cycles/kernel/split/kernel_subsurface_scatter.h b/intern/cycles/kernel/split/kernel_subsurface_scatter.h
deleted file mode 100644
index ba06ae3bc53..00000000000
--- a/intern/cycles/kernel/split/kernel_subsurface_scatter.h
+++ /dev/null
@@ -1,264 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#if defined(__BRANCHED_PATH__) && defined(__SUBSURFACE__)
-
-ccl_device_inline void kernel_split_branched_path_subsurface_indirect_light_init(KernelGlobals *kg,
- int ray_index)
-{
- kernel_split_branched_path_indirect_loop_init(kg, ray_index);
-
- SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
-
- branched_state->ss_next_closure = 0;
- branched_state->ss_next_sample = 0;
-
- branched_state->num_hits = 0;
- branched_state->next_hit = 0;
-
- ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT);
-}
-
-ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_iter(
- KernelGlobals *kg, int ray_index)
-{
- SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
-
- ShaderData *sd = kernel_split_sd(branched_state_sd, ray_index);
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
-
- for (int i = branched_state->ss_next_closure; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
-
- if (!CLOSURE_IS_BSSRDF(sc->type))
- continue;
-
- /* Closure memory will be overwritten, so read required variables now. */
- Bssrdf *bssrdf = (Bssrdf *)sc;
- ClosureType bssrdf_type = sc->type;
- float bssrdf_roughness = bssrdf->roughness;
-
- /* set up random number generator */
- if (branched_state->ss_next_sample == 0 && branched_state->next_hit == 0 &&
- branched_state->next_closure == 0 && branched_state->next_sample == 0) {
- branched_state->lcg_state = lcg_state_init_addrspace(&branched_state->path_state,
- 0x68bc21eb);
- }
- int num_samples = kernel_data.integrator.subsurface_samples * 3;
- float num_samples_inv = 1.0f / num_samples;
- uint bssrdf_rng_hash = cmj_hash(branched_state->path_state.rng_hash, i);
-
- /* do subsurface scatter step with copy of shader data, this will
- * replace the BSSRDF with a diffuse BSDF closure */
- for (int j = branched_state->ss_next_sample; j < num_samples; j++) {
- ccl_global PathState *hit_state = &kernel_split_state.path_state[ray_index];
- *hit_state = branched_state->path_state;
- hit_state->rng_hash = bssrdf_rng_hash;
- path_state_branch(hit_state, j, num_samples);
-
- ccl_global LocalIntersection *ss_isect = &branched_state->ss_isect;
- float bssrdf_u, bssrdf_v;
- path_branched_rng_2D(
- kg, bssrdf_rng_hash, hit_state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
-
- /* intersection is expensive so avoid doing multiple times for the same input */
- if (branched_state->next_hit == 0 && branched_state->next_closure == 0 &&
- branched_state->next_sample == 0) {
- uint lcg_state = branched_state->lcg_state;
- LocalIntersection ss_isect_private;
-
- branched_state->num_hits = subsurface_scatter_multi_intersect(
- kg, &ss_isect_private, sd, hit_state, sc, &lcg_state, bssrdf_u, bssrdf_v, true);
-
- branched_state->lcg_state = lcg_state;
- *ss_isect = ss_isect_private;
- }
-
- hit_state->rng_offset += PRNG_BOUNCE_NUM;
-
-# ifdef __VOLUME__
- Ray volume_ray = branched_state->ray;
- bool need_update_volume_stack = kernel_data.integrator.use_volumes &&
- sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME;
-# endif /* __VOLUME__ */
-
- /* compute lighting with the BSDF closure */
- for (int hit = branched_state->next_hit; hit < branched_state->num_hits; hit++) {
- ShaderData *bssrdf_sd = kernel_split_sd(sd, ray_index);
- *bssrdf_sd = *sd; /* note: copy happens each iteration of inner loop, this is
- * important as the indirect path will write into bssrdf_sd */
-
- LocalIntersection ss_isect_private = *ss_isect;
- subsurface_scatter_multi_setup(
- kg, &ss_isect_private, hit, bssrdf_sd, hit_state, bssrdf_type, bssrdf_roughness);
- *ss_isect = ss_isect_private;
-
-# ifdef __VOLUME__
- if (need_update_volume_stack) {
- /* Setup ray from previous surface point to the new one. */
- float3 P = ray_offset(bssrdf_sd->P, -bssrdf_sd->Ng);
- volume_ray.D = normalize_len(P - volume_ray.P, &volume_ray.t);
-
- for (int k = 0; k < VOLUME_STACK_SIZE; k++) {
- hit_state->volume_stack[k] = branched_state->path_state.volume_stack[k];
- }
-
- kernel_volume_stack_update_for_subsurface(
- kg, emission_sd, &volume_ray, hit_state->volume_stack);
- }
-# endif /* __VOLUME__ */
-
-# ifdef __EMISSION__
- if (branched_state->next_closure == 0 && branched_state->next_sample == 0) {
- /* direct light */
- if (kernel_data.integrator.use_direct_light) {
- int all = (kernel_data.integrator.sample_all_lights_direct) ||
- (hit_state->flag & PATH_RAY_SHADOW_CATCHER);
- kernel_branched_path_surface_connect_light(kg,
- bssrdf_sd,
- emission_sd,
- hit_state,
- branched_state->throughput,
- num_samples_inv,
- L,
- all);
- }
- }
-# endif /* __EMISSION__ */
-
- /* indirect light */
- if (kernel_split_branched_path_surface_indirect_light_iter(
- kg, ray_index, num_samples_inv, bssrdf_sd, false, false)) {
- branched_state->ss_next_closure = i;
- branched_state->ss_next_sample = j;
- branched_state->next_hit = hit;
-
- return true;
- }
-
- branched_state->next_closure = 0;
- }
-
- branched_state->next_hit = 0;
- }
-
- branched_state->ss_next_sample = 0;
- }
-
- branched_state->ss_next_closure = sd->num_closure;
-
- branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0);
- if (branched_state->waiting_on_shared_samples) {
- return true;
- }
-
- kernel_split_branched_path_indirect_loop_end(kg, ray_index);
-
- return false;
-}
-
-#endif /* __BRANCHED_PATH__ && __SUBSURFACE__ */
-
-ccl_device void kernel_subsurface_scatter(KernelGlobals *kg)
-{
- int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (thread_index == 0) {
- /* We will empty both queues in this kernel. */
- kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
- kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
- }
-
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- ray_index = get_ray_index(kg,
- ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
- get_ray_index(kg,
- thread_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
-
-#ifdef __SUBSURFACE__
- ccl_global char *ray_state = kernel_split_state.ray_state;
-
- if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
- ccl_global SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
- ShaderData *sd = kernel_split_sd(sd, ray_index);
- ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
-
- if (sd->flag & SD_BSSRDF) {
-
-# ifdef __BRANCHED_PATH__
- if (!kernel_data.integrator.branched ||
- IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
-# endif
- if (kernel_path_subsurface_scatter(
- kg, sd, emission_sd, L, state, ray, throughput, ss_indirect)) {
- kernel_split_path_end(kg, ray_index);
- }
-# ifdef __BRANCHED_PATH__
- }
- else {
- kernel_split_branched_path_subsurface_indirect_light_init(kg, ray_index);
-
- if (kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- }
- }
-# endif
- }
- }
-
-# ifdef __BRANCHED_PATH__
- if (ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
- kernel_split_params.queue_index[QUEUE_SUBSURFACE_INDIRECT_ITER] = 0;
- }
-
- /* iter loop */
- ray_index = get_ray_index(kg,
- ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
- QUEUE_SUBSURFACE_INDIRECT_ITER,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
-
- if (IS_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER)) {
- /* for render passes, sum and reset indirect light pass variables
- * for the next samples */
- path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]);
- path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]);
-
- if (kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- }
- }
-# endif /* __BRANCHED_PATH__ */
-
-#endif /* __SUBSURFACE__ */
-}
-
-CCL_NAMESPACE_END
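kernel_split_branched_path_subsurface_indirect_light_iter above resumes a triply nested loop (closure, sample, BSSRDF hit) across kernel launches by storing ss_next_closure, ss_next_sample and next_hit in SplitBranchedState and returning true whenever indirect work was spawned. A reduced two-level sketch of that resumable-loop pattern; the yield condition is a caller-supplied placeholder, not the actual branching logic:

// Reduced sketch of the resumable nested loop used by the subsurface
// indirect-light iteration: loop counters live in persistent state so a
// later launch can continue where the previous one yielded.
struct ResumeState {
  int next_closure = 0;
  int next_sample = 0;
};

// Returns true when it yielded (more work pending), false when fully done.
bool iterate(ResumeState &st, int num_closures, int num_samples,
             bool (*spawns_indirect_work)(int closure, int sample))
{
  for (int i = st.next_closure; i < num_closures; i++) {
    for (int j = st.next_sample; j < num_samples; j++) {
      if (spawns_indirect_work(i, j)) {
        st.next_closure = i; /* remember where to resume next launch */
        st.next_sample = j + 1;
        return true;         /* yield back to the scheduler */
      }
    }
    st.next_sample = 0; /* inner counter restarts when the outer loop advances */
  }
  return false;
}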
diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h
index 000da1fa615..4aee1ef11b3 100644
--- a/intern/cycles/kernel/svm/svm.h
+++ b/intern/cycles/kernel/svm/svm.h
@@ -48,16 +48,18 @@ ccl_device_inline float3 stack_load_float3(float *stack, uint a)
{
kernel_assert(a + 2 < SVM_STACK_SIZE);
- return make_float3(stack[a + 0], stack[a + 1], stack[a + 2]);
+ float *stack_a = stack + a;
+ return make_float3(stack_a[0], stack_a[1], stack_a[2]);
}
ccl_device_inline void stack_store_float3(float *stack, uint a, float3 f)
{
kernel_assert(a + 2 < SVM_STACK_SIZE);
- stack[a + 0] = f.x;
- stack[a + 1] = f.y;
- stack[a + 2] = f.z;
+ float *stack_a = stack + a;
+ stack_a[0] = f.x;
+ stack_a[1] = f.y;
+ stack_a[2] = f.z;
}
ccl_device_inline float stack_load_float(float *stack, uint a)
@@ -105,14 +107,14 @@ ccl_device_inline bool stack_valid(uint a)
/* Reading Nodes */
-ccl_device_inline uint4 read_node(KernelGlobals *kg, int *offset)
+ccl_device_inline uint4 read_node(const KernelGlobals *kg, int *offset)
{
uint4 node = kernel_tex_fetch(__svm_nodes, *offset);
(*offset)++;
return node;
}
-ccl_device_inline float4 read_node_float(KernelGlobals *kg, int *offset)
+ccl_device_inline float4 read_node_float(const KernelGlobals *kg, int *offset)
{
uint4 node = kernel_tex_fetch(__svm_nodes, *offset);
float4 f = make_float4(__uint_as_float(node.x),
@@ -123,7 +125,7 @@ ccl_device_inline float4 read_node_float(KernelGlobals *kg, int *offset)
return f;
}
-ccl_device_inline float4 fetch_node_float(KernelGlobals *kg, int offset)
+ccl_device_inline float4 fetch_node_float(const KernelGlobals *kg, int offset)
{
uint4 node = kernel_tex_fetch(__svm_nodes, offset);
return make_float4(__uint_as_float(node.x),
@@ -217,26 +219,11 @@ CCL_NAMESPACE_END
CCL_NAMESPACE_BEGIN
/* Main Interpreter Loop */
-#if defined(__KERNEL_OPTIX__) && defined(__SHADER_RAYTRACE__)
-ccl_device_inline void svm_eval_nodes(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- ccl_global float *buffer,
- ShaderType type,
- int path_flag)
-{
- optixDirectCall<void>(0, kg, sd, state, buffer, type, path_flag);
-}
-extern "C" __device__ void __direct_callable__svm_eval_nodes(
-#else
-ccl_device_noinline void svm_eval_nodes(
-#endif
- KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- ccl_global float *buffer,
- ShaderType type,
- int path_flag)
+template<uint node_feature_mask, ShaderType type>
+ccl_device void svm_eval_nodes(INTEGRATOR_STATE_CONST_ARGS,
+ ShaderData *sd,
+ ccl_global float *render_buffer,
+ int path_flag)
{
float stack[SVM_STACK_SIZE];
int offset = sd->shader & SHADER_MASK;
@@ -247,7 +234,6 @@ ccl_device_noinline void svm_eval_nodes(
switch (node.x) {
case NODE_END:
return;
-#if NODES_GROUP(NODE_GROUP_LEVEL_0)
case NODE_SHADER_JUMP: {
if (type == SHADER_TYPE_SURFACE)
offset = node.y;
@@ -260,13 +246,18 @@ ccl_device_noinline void svm_eval_nodes(
break;
}
case NODE_CLOSURE_BSDF:
- svm_node_closure_bsdf(kg, sd, stack, node, type, path_flag, &offset);
+ offset = svm_node_closure_bsdf<node_feature_mask, type>(
+ kg, sd, stack, node, path_flag, offset);
break;
case NODE_CLOSURE_EMISSION:
- svm_node_closure_emission(sd, stack, node);
+ if (KERNEL_NODES_FEATURE(EMISSION)) {
+ svm_node_closure_emission(sd, stack, node);
+ }
break;
case NODE_CLOSURE_BACKGROUND:
- svm_node_closure_background(sd, stack, node);
+ if (KERNEL_NODES_FEATURE(EMISSION)) {
+ svm_node_closure_background(sd, stack, node);
+ }
break;
case NODE_CLOSURE_SET_WEIGHT:
svm_node_closure_set_weight(sd, node.y, node.z, node.w);
@@ -275,7 +266,9 @@ ccl_device_noinline void svm_eval_nodes(
svm_node_closure_weight(sd, stack, node.y);
break;
case NODE_EMISSION_WEIGHT:
- svm_node_emission_weight(kg, sd, stack, node);
+ if (KERNEL_NODES_FEATURE(EMISSION)) {
+ svm_node_emission_weight(kg, sd, stack, node);
+ }
break;
case NODE_MIX_CLOSURE:
svm_node_mix_closure(sd, stack, node);
@@ -295,86 +288,108 @@ ccl_device_noinline void svm_eval_nodes(
svm_node_convert(kg, sd, stack, node.y, node.z, node.w);
break;
case NODE_TEX_COORD:
- svm_node_tex_coord(kg, sd, path_flag, stack, node, &offset);
+ offset = svm_node_tex_coord(kg, sd, path_flag, stack, node, offset);
break;
case NODE_VALUE_F:
svm_node_value_f(kg, sd, stack, node.y, node.z);
break;
case NODE_VALUE_V:
- svm_node_value_v(kg, sd, stack, node.y, &offset);
+ offset = svm_node_value_v(kg, sd, stack, node.y, offset);
break;
case NODE_ATTR:
- svm_node_attr(kg, sd, stack, node);
+ svm_node_attr<node_feature_mask>(kg, sd, stack, node);
break;
case NODE_VERTEX_COLOR:
svm_node_vertex_color(kg, sd, stack, node.y, node.z, node.w);
break;
-# if NODES_FEATURE(NODE_FEATURE_BUMP)
case NODE_GEOMETRY_BUMP_DX:
- svm_node_geometry_bump_dx(kg, sd, stack, node.y, node.z);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_geometry_bump_dx(kg, sd, stack, node.y, node.z);
+ }
break;
case NODE_GEOMETRY_BUMP_DY:
- svm_node_geometry_bump_dy(kg, sd, stack, node.y, node.z);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_geometry_bump_dy(kg, sd, stack, node.y, node.z);
+ }
break;
case NODE_SET_DISPLACEMENT:
- svm_node_set_displacement(kg, sd, stack, node.y);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_set_displacement(kg, sd, stack, node.y);
+ }
break;
case NODE_DISPLACEMENT:
- svm_node_displacement(kg, sd, stack, node);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_displacement(kg, sd, stack, node);
+ }
break;
case NODE_VECTOR_DISPLACEMENT:
- svm_node_vector_displacement(kg, sd, stack, node, &offset);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ offset = svm_node_vector_displacement(kg, sd, stack, node, offset);
+ }
break;
-# endif /* NODES_FEATURE(NODE_FEATURE_BUMP) */
case NODE_TEX_IMAGE:
- svm_node_tex_image(kg, sd, stack, node, &offset);
+ offset = svm_node_tex_image(kg, sd, stack, node, offset);
break;
case NODE_TEX_IMAGE_BOX:
svm_node_tex_image_box(kg, sd, stack, node);
break;
case NODE_TEX_NOISE:
- svm_node_tex_noise(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_tex_noise(kg, sd, stack, node.y, node.z, node.w, offset);
break;
-# if NODES_FEATURE(NODE_FEATURE_BUMP)
case NODE_SET_BUMP:
- svm_node_set_bump(kg, sd, stack, node);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_set_bump(kg, sd, stack, node);
+ }
break;
case NODE_ATTR_BUMP_DX:
- svm_node_attr_bump_dx(kg, sd, stack, node);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_attr_bump_dx(kg, sd, stack, node);
+ }
break;
case NODE_ATTR_BUMP_DY:
- svm_node_attr_bump_dy(kg, sd, stack, node);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_attr_bump_dy(kg, sd, stack, node);
+ }
break;
case NODE_VERTEX_COLOR_BUMP_DX:
- svm_node_vertex_color_bump_dx(kg, sd, stack, node.y, node.z, node.w);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_vertex_color_bump_dx(kg, sd, stack, node.y, node.z, node.w);
+ }
break;
case NODE_VERTEX_COLOR_BUMP_DY:
- svm_node_vertex_color_bump_dy(kg, sd, stack, node.y, node.z, node.w);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_vertex_color_bump_dy(kg, sd, stack, node.y, node.z, node.w);
+ }
break;
case NODE_TEX_COORD_BUMP_DX:
- svm_node_tex_coord_bump_dx(kg, sd, path_flag, stack, node, &offset);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ offset = svm_node_tex_coord_bump_dx(kg, sd, path_flag, stack, node, offset);
+ }
break;
case NODE_TEX_COORD_BUMP_DY:
- svm_node_tex_coord_bump_dy(kg, sd, path_flag, stack, node, &offset);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ offset = svm_node_tex_coord_bump_dy(kg, sd, path_flag, stack, node, offset);
+ }
break;
case NODE_CLOSURE_SET_NORMAL:
- svm_node_set_normal(kg, sd, stack, node.y, node.z);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_set_normal(kg, sd, stack, node.y, node.z);
+ }
break;
-# if NODES_FEATURE(NODE_FEATURE_BUMP_STATE)
case NODE_ENTER_BUMP_EVAL:
- svm_node_enter_bump_eval(kg, sd, stack, node.y);
+ if (KERNEL_NODES_FEATURE(BUMP_STATE)) {
+ svm_node_enter_bump_eval(kg, sd, stack, node.y);
+ }
break;
case NODE_LEAVE_BUMP_EVAL:
- svm_node_leave_bump_eval(kg, sd, stack, node.y);
+ if (KERNEL_NODES_FEATURE(BUMP_STATE)) {
+ svm_node_leave_bump_eval(kg, sd, stack, node.y);
+ }
break;
-# endif /* NODES_FEATURE(NODE_FEATURE_BUMP_STATE) */
-# endif /* NODES_FEATURE(NODE_FEATURE_BUMP) */
case NODE_HSV:
- svm_node_hsv(kg, sd, stack, node, &offset);
+ svm_node_hsv(kg, sd, stack, node);
break;
-#endif /* NODES_GROUP(NODE_GROUP_LEVEL_0) */
-#if NODES_GROUP(NODE_GROUP_LEVEL_1)
case NODE_CLOSURE_HOLDOUT:
svm_node_closure_holdout(sd, stack, node);
break;
@@ -384,22 +399,24 @@ ccl_device_noinline void svm_eval_nodes(
case NODE_LAYER_WEIGHT:
svm_node_layer_weight(sd, stack, node);
break;
-# if NODES_FEATURE(NODE_FEATURE_VOLUME)
case NODE_CLOSURE_VOLUME:
- svm_node_closure_volume(kg, sd, stack, node, type);
+ if (KERNEL_NODES_FEATURE(VOLUME)) {
+ svm_node_closure_volume<type>(kg, sd, stack, node);
+ }
break;
case NODE_PRINCIPLED_VOLUME:
- svm_node_principled_volume(kg, sd, stack, node, type, path_flag, &offset);
+ if (KERNEL_NODES_FEATURE(VOLUME)) {
+ offset = svm_node_principled_volume<type>(kg, sd, stack, node, path_flag, offset);
+ }
break;
-# endif /* NODES_FEATURE(NODE_FEATURE_VOLUME) */
case NODE_MATH:
- svm_node_math(kg, sd, stack, node.y, node.z, node.w, &offset);
+ svm_node_math(kg, sd, stack, node.y, node.z, node.w);
break;
case NODE_VECTOR_MATH:
- svm_node_vector_math(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_vector_math(kg, sd, stack, node.y, node.z, node.w, offset);
break;
case NODE_RGB_RAMP:
- svm_node_rgb_ramp(kg, sd, stack, node, &offset);
+ offset = svm_node_rgb_ramp(kg, sd, stack, node, offset);
break;
case NODE_GAMMA:
svm_node_gamma(sd, stack, node.y, node.z, node.w);
@@ -408,7 +425,7 @@ ccl_device_noinline void svm_eval_nodes(
svm_node_brightness(sd, stack, node.y, node.z, node.w);
break;
case NODE_LIGHT_PATH:
- svm_node_light_path(sd, state, stack, node.y, node.z, path_flag);
+ svm_node_light_path(INTEGRATOR_STATE_PASS, sd, stack, node.y, node.z, path_flag);
break;
case NODE_OBJECT_INFO:
svm_node_object_info(kg, sd, stack, node.y, node.z);
@@ -416,22 +433,22 @@ ccl_device_noinline void svm_eval_nodes(
case NODE_PARTICLE_INFO:
svm_node_particle_info(kg, sd, stack, node.y, node.z);
break;
-# if defined(__HAIR__) && NODES_FEATURE(NODE_FEATURE_HAIR)
+#if defined(__HAIR__)
case NODE_HAIR_INFO:
- svm_node_hair_info(kg, sd, stack, node.y, node.z);
+ if (KERNEL_NODES_FEATURE(HAIR)) {
+ svm_node_hair_info(kg, sd, stack, node.y, node.z);
+ }
break;
-# endif /* NODES_FEATURE(NODE_FEATURE_HAIR) */
-#endif /* NODES_GROUP(NODE_GROUP_LEVEL_1) */
+#endif
-#if NODES_GROUP(NODE_GROUP_LEVEL_2)
case NODE_TEXTURE_MAPPING:
- svm_node_texture_mapping(kg, sd, stack, node.y, node.z, &offset);
+ offset = svm_node_texture_mapping(kg, sd, stack, node.y, node.z, offset);
break;
case NODE_MAPPING:
- svm_node_mapping(kg, sd, stack, node.y, node.z, node.w, &offset);
+ svm_node_mapping(kg, sd, stack, node.y, node.z, node.w);
break;
case NODE_MIN_MAX:
- svm_node_min_max(kg, sd, stack, node.y, node.z, &offset);
+ offset = svm_node_min_max(kg, sd, stack, node.y, node.z, offset);
break;
case NODE_CAMERA:
svm_node_camera(kg, sd, stack, node.y, node.z, node.w);
@@ -440,47 +457,46 @@ ccl_device_noinline void svm_eval_nodes(
svm_node_tex_environment(kg, sd, stack, node);
break;
case NODE_TEX_SKY:
- svm_node_tex_sky(kg, sd, stack, node, &offset);
+ offset = svm_node_tex_sky(kg, sd, stack, node, offset);
break;
case NODE_TEX_GRADIENT:
svm_node_tex_gradient(sd, stack, node);
break;
case NODE_TEX_VORONOI:
- svm_node_tex_voronoi(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_tex_voronoi<node_feature_mask>(
+ kg, sd, stack, node.y, node.z, node.w, offset);
break;
case NODE_TEX_MUSGRAVE:
- svm_node_tex_musgrave(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_tex_musgrave(kg, sd, stack, node.y, node.z, node.w, offset);
break;
case NODE_TEX_WAVE:
- svm_node_tex_wave(kg, sd, stack, node, &offset);
+ offset = svm_node_tex_wave(kg, sd, stack, node, offset);
break;
case NODE_TEX_MAGIC:
- svm_node_tex_magic(kg, sd, stack, node, &offset);
+ offset = svm_node_tex_magic(kg, sd, stack, node, offset);
break;
case NODE_TEX_CHECKER:
svm_node_tex_checker(kg, sd, stack, node);
break;
case NODE_TEX_BRICK:
- svm_node_tex_brick(kg, sd, stack, node, &offset);
+ offset = svm_node_tex_brick(kg, sd, stack, node, offset);
break;
case NODE_TEX_WHITE_NOISE:
- svm_node_tex_white_noise(kg, sd, stack, node.y, node.z, node.w, &offset);
+ svm_node_tex_white_noise(kg, sd, stack, node.y, node.z, node.w);
break;
case NODE_NORMAL:
- svm_node_normal(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_normal(kg, sd, stack, node.y, node.z, node.w, offset);
break;
case NODE_LIGHT_FALLOFF:
svm_node_light_falloff(sd, stack, node);
break;
case NODE_IES:
- svm_node_ies(kg, sd, stack, node, &offset);
+ svm_node_ies(kg, sd, stack, node);
break;
-#endif /* NODES_GROUP(NODE_GROUP_LEVEL_2) */
-#if NODES_GROUP(NODE_GROUP_LEVEL_3)
case NODE_RGB_CURVES:
case NODE_VECTOR_CURVES:
- svm_node_curves(kg, sd, stack, node, &offset);
+ offset = svm_node_curves(kg, sd, stack, node, offset);
break;
case NODE_TANGENT:
svm_node_tangent(kg, sd, stack, node);
@@ -492,7 +508,7 @@ ccl_device_noinline void svm_eval_nodes(
svm_node_invert(sd, stack, node.y, node.z, node.w);
break;
case NODE_MIX:
- svm_node_mix(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_mix(kg, sd, stack, node.y, node.z, node.w, offset);
break;
case NODE_SEPARATE_VECTOR:
svm_node_separate_vector(sd, stack, node.y, node.z, node.w);
@@ -501,10 +517,10 @@ ccl_device_noinline void svm_eval_nodes(
svm_node_combine_vector(sd, stack, node.y, node.z, node.w);
break;
case NODE_SEPARATE_HSV:
- svm_node_separate_hsv(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_separate_hsv(kg, sd, stack, node.y, node.z, node.w, offset);
break;
case NODE_COMBINE_HSV:
- svm_node_combine_hsv(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_combine_hsv(kg, sd, stack, node.y, node.z, node.w, offset);
break;
case NODE_VECTOR_ROTATE:
svm_node_vector_rotate(sd, stack, node.y, node.z, node.w);
@@ -522,39 +538,36 @@ ccl_device_noinline void svm_eval_nodes(
svm_node_blackbody(kg, sd, stack, node.y, node.z);
break;
case NODE_MAP_RANGE:
- svm_node_map_range(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_map_range(kg, sd, stack, node.y, node.z, node.w, offset);
break;
case NODE_CLAMP:
- svm_node_clamp(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_clamp(kg, sd, stack, node.y, node.z, node.w, offset);
break;
-# ifdef __SHADER_RAYTRACE__
+#ifdef __SHADER_RAYTRACE__
case NODE_BEVEL:
- svm_node_bevel(kg, sd, state, stack, node);
+ svm_node_bevel<node_feature_mask>(INTEGRATOR_STATE_PASS, sd, stack, node);
break;
case NODE_AMBIENT_OCCLUSION:
- svm_node_ao(kg, sd, state, stack, node);
+ svm_node_ao<node_feature_mask>(INTEGRATOR_STATE_PASS, sd, stack, node);
break;
-# endif /* __SHADER_RAYTRACE__ */
-#endif /* NODES_GROUP(NODE_GROUP_LEVEL_3) */
+#endif
-#if NODES_GROUP(NODE_GROUP_LEVEL_4)
-# if NODES_FEATURE(NODE_FEATURE_VOLUME)
case NODE_TEX_VOXEL:
- svm_node_tex_voxel(kg, sd, stack, node, &offset);
+ if (KERNEL_NODES_FEATURE(VOLUME)) {
+ offset = svm_node_tex_voxel(kg, sd, stack, node, offset);
+ }
break;
-# endif /* NODES_FEATURE(NODE_FEATURE_VOLUME) */
case NODE_AOV_START:
- if (!svm_node_aov_check(state, buffer)) {
+ if (!svm_node_aov_check(path_flag, render_buffer)) {
return;
}
break;
case NODE_AOV_COLOR:
- svm_node_aov_color(kg, sd, stack, node, buffer);
+ svm_node_aov_color(INTEGRATOR_STATE_PASS, sd, stack, node, render_buffer);
break;
case NODE_AOV_VALUE:
- svm_node_aov_value(kg, sd, stack, node, buffer);
+ svm_node_aov_value(INTEGRATOR_STATE_PASS, sd, stack, node, render_buffer);
break;
-#endif /* NODES_GROUP(NODE_GROUP_LEVEL_4) */
default:
kernel_assert(!"Unknown node type was passed to the SVM machine");
return;
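This svm.h hunk replaces the NODES_GROUP / NODES_FEATURE preprocessor gating with a node_feature_mask template parameter and KERNEL_NODES_FEATURE checks, so each kernel variant still compiles only the node handlers it can reach, but through ordinary template instantiation instead of separately preprocessed sources. A toy sketch of the technique; the feature enum and the interpreter are invented, not the Cycles definitions:

// Toy sketch of compile-time feature gating with a template mask, in the
// spirit of the KERNEL_NODES_FEATURE checks introduced above.
#include <cstdint>

enum NodeFeature : uint32_t {
  FEATURE_BUMP = 1u << 0,
  FEATURE_VOLUME = 1u << 1,
};

template<uint32_t feature_mask> constexpr bool has_feature(uint32_t f)
{
  return (feature_mask & f) != 0;
}

template<uint32_t feature_mask> float eval_node(int node_type, float input)
{
  switch (node_type) {
    case 0: /* always-available node */
      return input * 2.0f;
    case 1: /* feature-gated node: dead code when the mask lacks FEATURE_BUMP */
      if (has_feature<feature_mask>(FEATURE_BUMP)) {
        return input + 1.0f;
      }
      return input;
    default:
      return input;
  }
}

/* A background kernel might instantiate eval_node<0>, a full surface kernel
 * eval_node<FEATURE_BUMP | FEATURE_VOLUME>; the compiler drops the unreachable
 * branches per instantiation, much like the old preprocessor groups did. */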
diff --git a/intern/cycles/kernel/svm/svm_ao.h b/intern/cycles/kernel/svm/svm_ao.h
index 4cb986b897a..34ac2cb8fbf 100644
--- a/intern/cycles/kernel/svm/svm_ao.h
+++ b/intern/cycles/kernel/svm/svm_ao.h
@@ -14,20 +14,25 @@
* limitations under the License.
*/
+#include "kernel/bvh/bvh.h"
+
CCL_NAMESPACE_BEGIN
#ifdef __SHADER_RAYTRACE__
-ccl_device_noinline float svm_ao(KernelGlobals *kg,
- ShaderData *sd,
- float3 N,
- ccl_addr_space PathState *state,
- float max_dist,
- int num_samples,
- int flags)
+# ifdef __KERNEL_OPTIX__
+extern "C" __device__ float __direct_callable__svm_node_ao(INTEGRATOR_STATE_CONST_ARGS,
+# else
+ccl_device float svm_ao(INTEGRATOR_STATE_CONST_ARGS,
+# endif
+ ShaderData *sd,
+ float3 N,
+ float max_dist,
+ int num_samples,
+ int flags)
{
if (flags & NODE_AO_GLOBAL_RADIUS) {
- max_dist = kernel_data.background.ao_distance;
+ max_dist = kernel_data.integrator.ao_bounces_distance;
}
/* Early out if no sampling needed. */
@@ -47,11 +52,14 @@ ccl_device_noinline float svm_ao(KernelGlobals *kg,
float3 T, B;
make_orthonormals(N, &T, &B);
+ /* TODO: support ray-tracing in shadow shader evaluation? */
+ RNGState rng_state;
+ path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state);
+
int unoccluded = 0;
for (int sample = 0; sample < num_samples; sample++) {
float disk_u, disk_v;
- path_branched_rng_2D(
- kg, state->rng_hash, state, sample, num_samples, PRNG_BEVEL_U, &disk_u, &disk_v);
+ path_branched_rng_2D(kg, &rng_state, sample, num_samples, PRNG_BEVEL_U, &disk_u, &disk_v);
float2 d = concentric_sample_disk(disk_u, disk_v);
float3 D = make_float3(d.x, d.y, safe_sqrtf(1.0f - dot(d, d)));
@@ -62,8 +70,8 @@ ccl_device_noinline float svm_ao(KernelGlobals *kg,
ray.D = D.x * T + D.y * B + D.z * N;
ray.t = max_dist;
ray.time = sd->time;
- ray.dP = sd->dP;
- ray.dD = differential3_zero();
+ ray.dP = differential_zero_compact();
+ ray.dD = differential_zero_compact();
if (flags & NODE_AO_ONLY_LOCAL) {
if (!scene_intersect_local(kg, &ray, NULL, sd->object, NULL, 0)) {
@@ -81,8 +89,14 @@ ccl_device_noinline float svm_ao(KernelGlobals *kg,
return ((float)unoccluded) / num_samples;
}
-ccl_device void svm_node_ao(
- KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, float *stack, uint4 node)
+template<uint node_feature_mask>
+# if defined(__KERNEL_OPTIX__)
+ccl_device_inline
+# else
+ccl_device_noinline
+# endif
+ void
+ svm_node_ao(INTEGRATOR_STATE_CONST_ARGS, ShaderData *sd, float *stack, uint4 node)
{
uint flags, dist_offset, normal_offset, out_ao_offset;
svm_unpack_node_uchar4(node.y, &flags, &dist_offset, &normal_offset, &out_ao_offset);
@@ -92,7 +106,16 @@ ccl_device void svm_node_ao(
float dist = stack_load_float_default(stack, dist_offset, node.w);
float3 normal = stack_valid(normal_offset) ? stack_load_float3(stack, normal_offset) : sd->N;
- float ao = svm_ao(kg, sd, normal, state, dist, samples, flags);
+
+ float ao = 1.0f;
+
+ if (KERNEL_NODES_FEATURE(RAYTRACE)) {
+# ifdef __KERNEL_OPTIX__
+ ao = optixDirectCall<float>(0, INTEGRATOR_STATE_PASS, sd, normal, dist, samples, flags);
+# else
+ ao = svm_ao(INTEGRATOR_STATE_PASS, sd, normal, dist, samples, flags);
+# endif
+ }
if (stack_valid(out_ao_offset)) {
stack_store_float(stack, out_ao_offset, ao);
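After this change svm_ao still computes the same estimate: draw 2D samples, map them to cosine-weighted directions around the shading normal, trace short rays, and return unoccluded / num_samples. A host-side sketch of that estimator; the RNG, vector type and occlusion query are stand-ins for the kernel's own, and a polar disk mapping is used instead of the kernel's concentric mapping (same distribution, different stratification):

// Host-side sketch of the ambient-occlusion estimate computed by svm_ao:
// average visibility over cosine-weighted directions around the normal.
#include <algorithm>
#include <cmath>
#include <functional>
#include <random>

struct Vec3 {
  float x, y, z;
};

float ambient_occlusion(const Vec3 &N, const Vec3 &T, const Vec3 &B,
                        int num_samples, float max_dist,
                        const std::function<bool(const Vec3 &, float)> &occluded)
{
  const float PI_F = 3.14159265358979f;
  std::mt19937 rng(0x64c6a40e);
  std::uniform_real_distribution<float> uni(0.0f, 1.0f);
  int unoccluded = 0;

  for (int s = 0; s < num_samples; s++) {
    /* Cosine-weighted hemisphere sample via a polar disk mapping. */
    const float u = uni(rng), v = uni(rng);
    const float r = std::sqrt(u), phi = 2.0f * PI_F * v;
    const float dx = r * std::cos(phi), dy = r * std::sin(phi);
    const float dz = std::sqrt(std::max(0.0f, 1.0f - dx * dx - dy * dy));

    /* Transform from the local (T, B, N) frame to world space. */
    const Vec3 D = {dx * T.x + dy * B.x + dz * N.x,
                    dx * T.y + dy * B.y + dz * N.y,
                    dx * T.z + dy * B.z + dz * N.z};
    if (!occluded(D, max_dist)) {
      unoccluded++;
    }
  }
  return float(unoccluded) / float(num_samples);
}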
diff --git a/intern/cycles/kernel/svm/svm_aov.h b/intern/cycles/kernel/svm/svm_aov.h
index 899e466d099..26dec9717b3 100644
--- a/intern/cycles/kernel/svm/svm_aov.h
+++ b/intern/cycles/kernel/svm/svm_aov.h
@@ -14,36 +14,50 @@
* limitations under the License.
*/
+#include "kernel/kernel_write_passes.h"
+
CCL_NAMESPACE_BEGIN
-ccl_device_inline bool svm_node_aov_check(ccl_addr_space PathState *state,
- ccl_global float *buffer)
+ccl_device_inline bool svm_node_aov_check(const int path_flag, ccl_global float *render_buffer)
{
- int path_flag = state->flag;
-
bool is_primary = (path_flag & PATH_RAY_CAMERA) && (!(path_flag & PATH_RAY_SINGLE_PASS_DONE));
- return ((buffer != NULL) && is_primary);
+ return ((render_buffer != NULL) && is_primary);
}
-ccl_device void svm_node_aov_color(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, ccl_global float *buffer)
+ccl_device void svm_node_aov_color(INTEGRATOR_STATE_CONST_ARGS,
+ ShaderData *sd,
+ float *stack,
+ uint4 node,
+ ccl_global float *render_buffer)
{
float3 val = stack_load_float3(stack, node.y);
- if (buffer) {
- kernel_write_pass_float4(buffer + kernel_data.film.pass_aov_color + 4 * node.z,
- make_float4(val.x, val.y, val.z, 1.0f));
+ if (render_buffer && !INTEGRATOR_STATE_IS_NULL) {
+ const uint32_t render_pixel_index = INTEGRATOR_STATE(path, render_pixel_index);
+ const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
+ kernel_data.film.pass_stride;
+ ccl_global float *buffer = render_buffer + render_buffer_offset +
+ (kernel_data.film.pass_aov_color + node.z);
+ kernel_write_pass_float3(buffer, make_float3(val.x, val.y, val.z));
}
}
-ccl_device void svm_node_aov_value(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, ccl_global float *buffer)
+ccl_device void svm_node_aov_value(INTEGRATOR_STATE_CONST_ARGS,
+ ShaderData *sd,
+ float *stack,
+ uint4 node,
+ ccl_global float *render_buffer)
{
float val = stack_load_float(stack, node.y);
- if (buffer) {
- kernel_write_pass_float(buffer + kernel_data.film.pass_aov_value + node.z, val);
+ if (render_buffer && !INTEGRATOR_STATE_IS_NULL) {
+ const uint32_t render_pixel_index = INTEGRATOR_STATE(path, render_pixel_index);
+ const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
+ kernel_data.film.pass_stride;
+ ccl_global float *buffer = render_buffer + render_buffer_offset +
+ (kernel_data.film.pass_aov_value + node.z);
+ kernel_write_pass_float(buffer, val);
}
}
CCL_NAMESPACE_END
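The AOV nodes now address the render buffer explicitly: render_pixel_index * pass_stride selects the pixel's slice of the flat float buffer, and the film's pass offset plus the AOV index selects the slot inside it. A small sketch of that addressing with the surrounding types simplified:

// Sketch of the render-buffer addressing used by the AOV writes above: one
// flat float array, pass_stride floats per pixel, plus a per-pass offset.
#include <cstdint>

float *aov_slot(float *render_buffer,
                uint32_t render_pixel_index,
                uint32_t pass_stride,
                uint32_t pass_offset, /* e.g. the film's AOV color pass offset */
                uint32_t aov_index)   /* node.z in the diff */
{
  const uint64_t pixel_offset = uint64_t(render_pixel_index) * pass_stride;
  return render_buffer + pixel_offset + pass_offset + aov_index;
}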
diff --git a/intern/cycles/kernel/svm/svm_attribute.h b/intern/cycles/kernel/svm/svm_attribute.h
index 62740824ad1..5f94b20af73 100644
--- a/intern/cycles/kernel/svm/svm_attribute.h
+++ b/intern/cycles/kernel/svm/svm_attribute.h
@@ -18,8 +18,11 @@ CCL_NAMESPACE_BEGIN
/* Attribute Node */
-ccl_device AttributeDescriptor svm_node_attr_init(
- KernelGlobals *kg, ShaderData *sd, uint4 node, NodeAttributeOutputType *type, uint *out_offset)
+ccl_device AttributeDescriptor svm_node_attr_init(const KernelGlobals *kg,
+ ShaderData *sd,
+ uint4 node,
+ NodeAttributeOutputType *type,
+ uint *out_offset)
{
*out_offset = node.z;
*type = (NodeAttributeOutputType)node.w;
@@ -44,31 +47,37 @@ ccl_device AttributeDescriptor svm_node_attr_init(
return desc;
}
-ccl_device void svm_node_attr(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+template<uint node_feature_mask>
+ccl_device_noinline void svm_node_attr(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
NodeAttributeOutputType type = NODE_ATTR_OUTPUT_FLOAT;
uint out_offset = 0;
AttributeDescriptor desc = svm_node_attr_init(kg, sd, node, &type, &out_offset);
#ifdef __VOLUME__
- /* Volumes
- * NOTE: moving this into its own node type might help improve performance. */
- if (primitive_is_volume_attribute(sd, desc)) {
- const float4 value = volume_attribute_float4(kg, sd, desc);
+ if (KERNEL_NODES_FEATURE(VOLUME)) {
+ /* Volumes
+ * NOTE: moving this into its own node type might help improve performance. */
+ if (primitive_is_volume_attribute(sd, desc)) {
+ const float4 value = volume_attribute_float4(kg, sd, desc);
- if (type == NODE_ATTR_OUTPUT_FLOAT) {
- const float f = volume_attribute_value_to_float(value);
- stack_store_float(stack, out_offset, f);
- }
- else if (type == NODE_ATTR_OUTPUT_FLOAT3) {
- const float3 f = volume_attribute_value_to_float3(value);
- stack_store_float3(stack, out_offset, f);
+ if (type == NODE_ATTR_OUTPUT_FLOAT) {
+ const float f = volume_attribute_value_to_float(value);
+ stack_store_float(stack, out_offset, f);
+ }
+ else if (type == NODE_ATTR_OUTPUT_FLOAT3) {
+ const float3 f = volume_attribute_value_to_float3(value);
+ stack_store_float3(stack, out_offset, f);
+ }
+ else {
+ const float f = volume_attribute_value_to_alpha(value);
+ stack_store_float(stack, out_offset, f);
+ }
+ return;
}
- else {
- const float f = volume_attribute_value_to_alpha(value);
- stack_store_float(stack, out_offset, f);
- }
- return;
}
#endif
@@ -139,7 +148,10 @@ ccl_device void svm_node_attr(KernelGlobals *kg, ShaderData *sd, float *stack, u
}
}
-ccl_device void svm_node_attr_bump_dx(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_attr_bump_dx(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
NodeAttributeOutputType type = NODE_ATTR_OUTPUT_FLOAT;
uint out_offset = 0;
@@ -232,7 +244,10 @@ ccl_device void svm_node_attr_bump_dx(KernelGlobals *kg, ShaderData *sd, float *
}
}
-ccl_device void svm_node_attr_bump_dy(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_attr_bump_dy(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
NodeAttributeOutputType type = NODE_ATTR_OUTPUT_FLOAT;
uint out_offset = 0;
diff --git a/intern/cycles/kernel/svm/svm_bevel.h b/intern/cycles/kernel/svm/svm_bevel.h
index bf5957ec9e4..9d7ce202d49 100644
--- a/intern/cycles/kernel/svm/svm_bevel.h
+++ b/intern/cycles/kernel/svm/svm_bevel.h
@@ -14,21 +14,95 @@
* limitations under the License.
*/
+#include "kernel/bvh/bvh.h"
+#include "kernel/kernel_montecarlo.h"
+#include "kernel/kernel_random.h"
+
CCL_NAMESPACE_BEGIN
#ifdef __SHADER_RAYTRACE__
+/* Planar Cubic BSSRDF falloff, reused for bevel.
+ *
+ * This is basically (Rm - x)^3, with some factors to normalize it. For sampling
+ * we integrate 2*pi*x * (Rm - x)^3, which gives us a quintic equation that as
+ * far as I can tell has no closed form solution. So we get an iterative solution
+ * instead with newton-raphson. */
+
+ccl_device float svm_bevel_cubic_eval(const float radius, float r)
+{
+ const float Rm = radius;
+
+ if (r >= Rm)
+ return 0.0f;
+
+ /* integrate (2*pi*r * 10*(R - r)^3)/(pi * R^5) from 0 to R = 1 */
+ const float Rm5 = (Rm * Rm) * (Rm * Rm) * Rm;
+ const float f = Rm - r;
+ const float num = f * f * f;
+
+ return (10.0f * num) / (Rm5 * M_PI_F);
+}
+
+ccl_device float svm_bevel_cubic_pdf(const float radius, float r)
+{
+ return svm_bevel_cubic_eval(radius, r);
+}
+
+/* solve 10x^2 - 20x^3 + 15x^4 - 4x^5 - xi == 0 */
+ccl_device_forceinline float svm_bevel_cubic_quintic_root_find(float xi)
+{
+ /* newton-raphson iteration, usually succeeds in 2-4 iterations, except
+ * outside 0.02 ... 0.98 where it can go up to 10, so overall performance
+ * should not be too bad */
+ const float tolerance = 1e-6f;
+ const int max_iteration_count = 10;
+ float x = 0.25f;
+ int i;
+
+ for (i = 0; i < max_iteration_count; i++) {
+ float x2 = x * x;
+ float x3 = x2 * x;
+ float nx = (1.0f - x);
+
+ float f = 10.0f * x2 - 20.0f * x3 + 15.0f * x2 * x2 - 4.0f * x2 * x3 - xi;
+ float f_ = 20.0f * (x * nx) * (nx * nx);
+
+ if (fabsf(f) < tolerance || f_ == 0.0f)
+ break;
+
+ x = saturate(x - f / f_);
+ }
+
+ return x;
+}
+
+ccl_device void svm_bevel_cubic_sample(const float radius, float xi, float *r, float *h)
+{
+ float Rm = radius;
+ float r_ = svm_bevel_cubic_quintic_root_find(xi);
+
+ r_ *= Rm;
+ *r = r_;
+
+ /* h^2 + r^2 = Rm^2 */
+ *h = safe_sqrtf(Rm * Rm - r_ * r_);
+}
+
/* Bevel shader averaging normals from nearby surfaces.
*
* Sampling strategy from: BSSRDF Importance Sampling, SIGGRAPH 2013
* http://library.imageworks.com/pdfs/imageworks-library-BSSRDF-sampling.pdf
*/
-ccl_device_noinline float3 svm_bevel(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- float radius,
- int num_samples)
+# ifdef __KERNEL_OPTIX__
+extern "C" __device__ float3 __direct_callable__svm_node_bevel(INTEGRATOR_STATE_CONST_ARGS,
+# else
+ccl_device float3 svm_bevel(INTEGRATOR_STATE_CONST_ARGS,
+# endif
+ ShaderData *sd,
+ float radius,
+ int num_samples)
{
/* Early out if no sampling needed. */
if (radius <= 0.0f || num_samples < 1 || sd->object == OBJECT_NONE) {
@@ -41,21 +115,27 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg,
}
/* Don't bevel for blurry indirect rays. */
- if (state->min_ray_pdf < 8.0f) {
+ if (INTEGRATOR_STATE(path, min_ray_pdf) < 8.0f) {
return sd->N;
}
/* Setup for multi intersection. */
LocalIntersection isect;
- uint lcg_state = lcg_state_init_addrspace(state, 0x64c6a40e);
+ uint lcg_state = lcg_state_init(INTEGRATOR_STATE(path, rng_hash),
+ INTEGRATOR_STATE(path, rng_offset),
+ INTEGRATOR_STATE(path, sample),
+ 0x64c6a40e);
/* Sample normals from surrounding points on surface. */
float3 sum_N = make_float3(0.0f, 0.0f, 0.0f);
+ /* TODO: support ray-tracing in shadow shader evaluation? */
+ RNGState rng_state;
+ path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state);
+
for (int sample = 0; sample < num_samples; sample++) {
float disk_u, disk_v;
- path_branched_rng_2D(
- kg, state->rng_hash, state, sample, num_samples, PRNG_BEVEL_U, &disk_u, &disk_v);
+ path_branched_rng_2D(kg, &rng_state, sample, num_samples, PRNG_BEVEL_U, &disk_u, &disk_v);
/* Pick random axis in local frame and point on disk. */
float3 disk_N, disk_T, disk_B;
@@ -97,7 +177,7 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg,
float disk_height;
/* Perhaps find something better than Cubic BSSRDF, but happens to work well. */
- bssrdf_cubic_sample(radius, 0.0f, disk_r, &disk_r, &disk_height);
+ svm_bevel_cubic_sample(radius, disk_r, &disk_r, &disk_height);
float3 disk_P = (disk_r * cosf(phi)) * disk_T + (disk_r * sinf(phi)) * disk_B;
@@ -106,8 +186,8 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg,
ray->P = sd->P + disk_N * disk_height + disk_P;
ray->D = -disk_N;
ray->t = 2.0f * disk_height;
- ray->dP = sd->dP;
- ray->dD = differential3_zero();
+ ray->dP = differential_zero_compact();
+ ray->dD = differential_zero_compact();
ray->time = sd->time;
/* Intersect with the same object. if multiple intersections are found it
@@ -120,14 +200,16 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg,
/* Quickly retrieve P and Ng without setting up ShaderData. */
float3 hit_P;
if (sd->type & PRIMITIVE_TRIANGLE) {
- hit_P = triangle_refine_local(kg, sd, &isect.hits[hit], ray);
+ hit_P = triangle_refine_local(
+ kg, sd, ray->P, ray->D, ray->t, isect.hits[hit].object, isect.hits[hit].prim);
}
# ifdef __OBJECT_MOTION__
else if (sd->type & PRIMITIVE_MOTION_TRIANGLE) {
float3 verts[3];
motion_triangle_vertices(
kg, sd->object, kernel_tex_fetch(__prim_index, isect.hits[hit].prim), sd->time, verts);
- hit_P = motion_triangle_refine_local(kg, sd, &isect.hits[hit], ray, verts);
+ hit_P = motion_triangle_refine_local(
+ kg, sd, ray->P, ray->D, ray->t, isect.hits[hit].object, isect.hits[hit].prim, verts);
}
# endif /* __OBJECT_MOTION__ */
@@ -173,7 +255,7 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg,
/* Multiple importance sample between 3 axes, power heuristic
* found to be slightly better than balance heuristic. pdf_N
- * in the MIS weight and denominator cancelled out. */
+ * in the MIS weight and denominator canceled out. */
float w = pdf_N / (sqr(pdf_N) + sqr(pdf_T) + sqr(pdf_B));
if (isect.num_hits > LOCAL_MAX_HITS) {
w *= isect.num_hits / (float)LOCAL_MAX_HITS;
@@ -183,8 +265,8 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg,
float r = len(hit_P - sd->P);
/* Compute weight. */
- float pdf = bssrdf_cubic_pdf(radius, 0.0f, r);
- float disk_pdf = bssrdf_cubic_pdf(radius, 0.0f, disk_r);
+ float pdf = svm_bevel_cubic_pdf(radius, r);
+ float disk_pdf = svm_bevel_cubic_pdf(radius, disk_r);
w *= pdf / disk_pdf;
@@ -198,19 +280,34 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg,
return is_zero(N) ? sd->N : (sd->flag & SD_BACKFACING) ? -N : N;
}
-ccl_device void svm_node_bevel(
- KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, float *stack, uint4 node)
+template<uint node_feature_mask>
+# if defined(__KERNEL_OPTIX__)
+ccl_device_inline
+# else
+ccl_device_noinline
+# endif
+ void
+ svm_node_bevel(INTEGRATOR_STATE_CONST_ARGS, ShaderData *sd, float *stack, uint4 node)
{
uint num_samples, radius_offset, normal_offset, out_offset;
svm_unpack_node_uchar4(node.y, &num_samples, &radius_offset, &normal_offset, &out_offset);
float radius = stack_load_float(stack, radius_offset);
- float3 bevel_N = svm_bevel(kg, sd, state, radius, num_samples);
- if (stack_valid(normal_offset)) {
- /* Preserve input normal. */
- float3 ref_N = stack_load_float3(stack, normal_offset);
- bevel_N = normalize(ref_N + (bevel_N - sd->N));
+ float3 bevel_N = sd->N;
+
+ if (KERNEL_NODES_FEATURE(RAYTRACE)) {
+# ifdef __KERNEL_OPTIX__
+ bevel_N = optixDirectCall<float3>(1, INTEGRATOR_STATE_PASS, sd, radius, num_samples);
+# else
+ bevel_N = svm_bevel(INTEGRATOR_STATE_PASS, sd, radius, num_samples);
+# endif
+
+ if (stack_valid(normal_offset)) {
+ /* Preserve input normal. */
+ float3 ref_N = stack_load_float3(stack, normal_offset);
+ bevel_N = normalize(ref_N + (bevel_N - sd->N));
+ }
}
stack_store_float3(stack, out_offset, bevel_N);
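
The new MIS weight in the svm_bevel hunk above is terse; as a reference, here is a minimal standalone sketch (plain C++, function name hypothetical) of the same power-heuristic combination over the three sampling axes, including the correction applied when the local hit buffer overflows:

/* Disk-sampling MIS weight for the bevel node. pdf_n/pdf_t/pdf_b are the disk
 * pdfs of the hit point projected onto the N/T/B axes. pdf_n appears to the
 * first power because the power-heuristic weight pdf_n^2 / (sum of squares)
 * has already been divided by the sampling pdf pdf_n, as the comment in the
 * hunk notes. */
static inline float sqr(float x) { return x * x; }

float bevel_mis_weight(float pdf_n, float pdf_t, float pdf_b, int num_hits, int max_hits)
{
  float w = pdf_n / (sqr(pdf_n) + sqr(pdf_t) + sqr(pdf_b));
  if (num_hits > max_hits) {
    /* Hit buffer overflowed: scale up to account for the unrecorded hits. */
    w *= num_hits / (float)max_hits;
  }
  return w;
}
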
diff --git a/intern/cycles/kernel/svm/svm_blackbody.h b/intern/cycles/kernel/svm/svm_blackbody.h
index adfc50d961e..96b3703b954 100644
--- a/intern/cycles/kernel/svm/svm_blackbody.h
+++ b/intern/cycles/kernel/svm/svm_blackbody.h
@@ -34,8 +34,11 @@ CCL_NAMESPACE_BEGIN
/* Blackbody Node */
-ccl_device void svm_node_blackbody(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint temperature_offset, uint col_offset)
+ccl_device_noinline void svm_node_blackbody(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint temperature_offset,
+ uint col_offset)
{
/* Input */
float temperature = stack_load_float(stack, temperature_offset);
diff --git a/intern/cycles/kernel/svm/svm_brick.h b/intern/cycles/kernel/svm/svm_brick.h
index 6984afa30a5..dca1b220dd5 100644
--- a/intern/cycles/kernel/svm/svm_brick.h
+++ b/intern/cycles/kernel/svm/svm_brick.h
@@ -72,12 +72,12 @@ ccl_device_noinline_cpu float2 svm_brick(float3 p,
return make_float2(tint, mortar);
}
-ccl_device void svm_node_tex_brick(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_tex_brick(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset)
{
- uint4 node2 = read_node(kg, offset);
- uint4 node3 = read_node(kg, offset);
- uint4 node4 = read_node(kg, offset);
+ uint4 node2 = read_node(kg, &offset);
+ uint4 node3 = read_node(kg, &offset);
+ uint4 node4 = read_node(kg, &offset);
/* Input and Output Sockets */
uint co_offset, color1_offset, color2_offset, mortar_offset, scale_offset;
@@ -133,6 +133,7 @@ ccl_device void svm_node_tex_brick(
stack_store_float3(stack, color_offset, color1 * (1.0f - f) + mortar * f);
if (stack_valid(fac_offset))
stack_store_float(stack, fac_offset, f);
+ return offset;
}
CCL_NAMESPACE_END
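
svm_node_tex_brick above shows a pattern repeated throughout this patch: nodes that used to mutate an int *offset parameter now take the offset by value, advance a local copy through read_node(kg, &offset), and return the updated offset to the caller. A minimal standalone sketch (plain C++, simplified types and names) of that calling convention:

#include <cstdint>
#include <vector>

struct Uint4 { uint32_t x, y, z, w; };

/* Fetch one instruction word and advance the offset. */
static Uint4 read_node(const std::vector<Uint4> &program, int *offset)
{
  return program[(*offset)++];
}

/* A node that consumes one extra data word, then hands the new offset back. */
static int example_node(const std::vector<Uint4> &program, int offset)
{
  Uint4 extra = read_node(program, &offset);
  (void)extra;
  return offset;
}

/* Interpreter-style usage: offset = example_node(program, offset); */
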
diff --git a/intern/cycles/kernel/svm/svm_brightness.h b/intern/cycles/kernel/svm/svm_brightness.h
index 9554b5946fb..2ed812acd71 100644
--- a/intern/cycles/kernel/svm/svm_brightness.h
+++ b/intern/cycles/kernel/svm/svm_brightness.h
@@ -16,7 +16,7 @@
CCL_NAMESPACE_BEGIN
-ccl_device void svm_node_brightness(
+ccl_device_noinline void svm_node_brightness(
ShaderData *sd, float *stack, uint in_color, uint out_color, uint node)
{
uint bright_offset, contrast_offset;
diff --git a/intern/cycles/kernel/svm/svm_bump.h b/intern/cycles/kernel/svm/svm_bump.h
index c9d430a2bba..8672839dbab 100644
--- a/intern/cycles/kernel/svm/svm_bump.h
+++ b/intern/cycles/kernel/svm/svm_bump.h
@@ -18,10 +18,10 @@ CCL_NAMESPACE_BEGIN
/* Bump Eval Nodes */
-ccl_device void svm_node_enter_bump_eval(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint offset)
+ccl_device_noinline void svm_node_enter_bump_eval(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint offset)
{
/* save state */
stack_store_float3(stack, offset + 0, sd->P);
@@ -45,10 +45,10 @@ ccl_device void svm_node_enter_bump_eval(KernelGlobals *kg,
}
}
-ccl_device void svm_node_leave_bump_eval(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint offset)
+ccl_device_noinline void svm_node_leave_bump_eval(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint offset)
{
/* restore state */
sd->P = stack_load_float3(stack, offset + 0);
diff --git a/intern/cycles/kernel/svm/svm_camera.h b/intern/cycles/kernel/svm/svm_camera.h
index 21a17acf5f1..40c0edcdad0 100644
--- a/intern/cycles/kernel/svm/svm_camera.h
+++ b/intern/cycles/kernel/svm/svm_camera.h
@@ -16,12 +16,12 @@
CCL_NAMESPACE_BEGIN
-ccl_device void svm_node_camera(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint out_vector,
- uint out_zdepth,
- uint out_distance)
+ccl_device_noinline void svm_node_camera(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint out_vector,
+ uint out_zdepth,
+ uint out_distance)
{
float distance;
float zdepth;
diff --git a/intern/cycles/kernel/svm/svm_checker.h b/intern/cycles/kernel/svm/svm_checker.h
index d54cb73df91..a9919c9ddc9 100644
--- a/intern/cycles/kernel/svm/svm_checker.h
+++ b/intern/cycles/kernel/svm/svm_checker.h
@@ -32,7 +32,10 @@ ccl_device float svm_checker(float3 p)
return ((xi % 2 == yi % 2) == (zi % 2)) ? 1.0f : 0.0f;
}
-ccl_device void svm_node_tex_checker(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_tex_checker(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint co_offset, color1_offset, color2_offset, scale_offset;
uint color_offset, fac_offset;
diff --git a/intern/cycles/kernel/svm/svm_clamp.h b/intern/cycles/kernel/svm/svm_clamp.h
index a85fd82754e..656bd31c085 100644
--- a/intern/cycles/kernel/svm/svm_clamp.h
+++ b/intern/cycles/kernel/svm/svm_clamp.h
@@ -18,18 +18,18 @@ CCL_NAMESPACE_BEGIN
/* Clamp Node */
-ccl_device void svm_node_clamp(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint value_stack_offset,
- uint parameters_stack_offsets,
- uint result_stack_offset,
- int *offset)
+ccl_device_noinline int svm_node_clamp(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint value_stack_offset,
+ uint parameters_stack_offsets,
+ uint result_stack_offset,
+ int offset)
{
uint min_stack_offset, max_stack_offset, type;
svm_unpack_node_uchar3(parameters_stack_offsets, &min_stack_offset, &max_stack_offset, &type);
- uint4 defaults = read_node(kg, offset);
+ uint4 defaults = read_node(kg, &offset);
float value = stack_load_float(stack, value_stack_offset);
float min = stack_load_float_default(stack, min_stack_offset, defaults.x);
@@ -41,6 +41,7 @@ ccl_device void svm_node_clamp(KernelGlobals *kg,
else {
stack_store_float(stack, result_stack_offset, clamp(value, min, max));
}
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h
index bbe8d72edf0..e2f6dde4ace 100644
--- a/intern/cycles/kernel/svm/svm_closure.h
+++ b/intern/cycles/kernel/svm/svm_closure.h
@@ -57,13 +57,9 @@ ccl_device void svm_node_glass_setup(
}
}
-ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint4 node,
- ShaderType shader_type,
- int path_flag,
- int *offset)
+template<uint node_feature_mask, ShaderType shader_type>
+ccl_device_noinline int svm_node_closure_bsdf(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int path_flag, int offset)
{
uint type, param1_offset, param2_offset;
@@ -73,19 +69,19 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
1.0f);
/* note we read this extra node before weight check, so offset is added */
- uint4 data_node = read_node(kg, offset);
+ uint4 data_node = read_node(kg, &offset);
/* Only compute BSDF for surfaces, transparent variable is shared with volume extinction. */
- if (mix_weight == 0.0f || shader_type != SHADER_TYPE_SURFACE) {
+ if ((!KERNEL_NODES_FEATURE(BSDF) || shader_type != SHADER_TYPE_SURFACE) || mix_weight == 0.0f) {
if (type == CLOSURE_BSDF_PRINCIPLED_ID) {
/* Read all principled BSDF extra data to get the right offset. */
- read_node(kg, offset);
- read_node(kg, offset);
- read_node(kg, offset);
- read_node(kg, offset);
+ read_node(kg, &offset);
+ read_node(kg, &offset);
+ read_node(kg, &offset);
+ read_node(kg, &offset);
}
- return;
+ return offset;
}
float3 N = stack_valid(data_node.x) ? stack_load_float3(stack, data_node.x) : sd->N;
@@ -102,7 +98,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
sheen_offset, sheen_tint_offset, clearcoat_offset, clearcoat_roughness_offset,
eta_offset, transmission_offset, anisotropic_rotation_offset,
transmission_roughness_offset;
- uint4 data_node2 = read_node(kg, offset);
+ uint4 data_node2 = read_node(kg, &offset);
float3 T = stack_load_float3(stack, data_node.y);
svm_unpack_node_uchar4(data_node.z,
@@ -158,7 +154,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
float specular_weight = (1.0f - final_transmission);
// get the base color
- uint4 data_base_color = read_node(kg, offset);
+ uint4 data_base_color = read_node(kg, &offset);
float3 base_color = stack_valid(data_base_color.x) ?
stack_load_float3(stack, data_base_color.x) :
make_float3(__uint_as_float(data_base_color.y),
@@ -166,16 +162,21 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
__uint_as_float(data_base_color.w));
// get the additional clearcoat normal and subsurface scattering radius
- uint4 data_cn_ssr = read_node(kg, offset);
+ uint4 data_cn_ssr = read_node(kg, &offset);
float3 clearcoat_normal = stack_valid(data_cn_ssr.x) ?
stack_load_float3(stack, data_cn_ssr.x) :
sd->N;
float3 subsurface_radius = stack_valid(data_cn_ssr.y) ?
stack_load_float3(stack, data_cn_ssr.y) :
make_float3(1.0f, 1.0f, 1.0f);
+ float subsurface_ior = stack_valid(data_cn_ssr.z) ? stack_load_float(stack, data_cn_ssr.z) :
+ 1.4f;
+ float subsurface_anisotropy = stack_valid(data_cn_ssr.w) ?
+ stack_load_float(stack, data_cn_ssr.w) :
+ 0.0f;
// get the subsurface color
- uint4 data_subsurface_color = read_node(kg, offset);
+ uint4 data_subsurface_color = read_node(kg, &offset);
float3 subsurface_color = stack_valid(data_subsurface_color.x) ?
stack_load_float3(stack, data_subsurface_color.x) :
make_float3(__uint_as_float(data_subsurface_color.y),
@@ -222,16 +223,16 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
if (bssrdf) {
bssrdf->radius = subsurface_radius * subsurface;
- bssrdf->albedo = (subsurface_method == CLOSURE_BSSRDF_PRINCIPLED_ID) ?
- subsurface_color :
- mixed_ss_base_color;
- bssrdf->texture_blur = 0.0f;
- bssrdf->sharpness = 0.0f;
+ bssrdf->albedo = mixed_ss_base_color;
bssrdf->N = N;
bssrdf->roughness = roughness;
+ /* Clamps protecting against bad/extreme and non-physical values. */
+ subsurface_ior = clamp(subsurface_ior, 1.01f, 3.8f);
+ bssrdf->anisotropy = clamp(subsurface_anisotropy, 0.0f, 0.9f);
+
/* setup bsdf */
- sd->flag |= bssrdf_setup(sd, bssrdf, subsurface_method);
+ sd->flag |= bssrdf_setup(sd, bssrdf, subsurface_method, subsurface_ior);
}
}
}
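
The clamps introduced here (and again in the explicit subsurface closure further below) keep the new subsurface IOR and anisotropy inputs in a numerically safe range before bssrdf_setup() runs. A standalone sketch (plain C++, struct and function names hypothetical; thresholds taken from the hunks):

#include <algorithm>

struct SubsurfaceParams {
  float ior;
  float anisotropy;
};

/* Guard against non-physical or numerically fragile inputs:
 * IOR is kept in [1.01, 3.8], anisotropy in [0.0, 0.9]. */
static SubsurfaceParams clamp_subsurface_params(float ior, float anisotropy)
{
  SubsurfaceParams p;
  p.ior = std::min(std::max(ior, 1.01f), 3.8f);
  p.anisotropy = std::min(std::max(anisotropy, 0.0f), 0.9f);
  return p;
}
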
@@ -733,9 +734,9 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
}
#ifdef __HAIR__
case CLOSURE_BSDF_HAIR_PRINCIPLED_ID: {
- uint4 data_node2 = read_node(kg, offset);
- uint4 data_node3 = read_node(kg, offset);
- uint4 data_node4 = read_node(kg, offset);
+ uint4 data_node2 = read_node(kg, &offset);
+ uint4 data_node3 = read_node(kg, &offset);
+ uint4 data_node4 = read_node(kg, &offset);
float3 weight = sd->svm_closure_weight * mix_weight;
@@ -878,10 +879,8 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
#endif /* __HAIR__ */
#ifdef __SUBSURFACE__
- case CLOSURE_BSSRDF_CUBIC_ID:
- case CLOSURE_BSSRDF_GAUSSIAN_ID:
- case CLOSURE_BSSRDF_BURLEY_ID:
- case CLOSURE_BSSRDF_RANDOM_WALK_ID: {
+ case CLOSURE_BSSRDF_RANDOM_WALK_ID:
+ case CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID: {
float3 weight = sd->svm_closure_weight * mix_weight;
Bssrdf *bssrdf = bssrdf_alloc(sd, weight);
@@ -894,11 +893,14 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
bssrdf->radius = stack_load_float3(stack, data_node.z) * param1;
bssrdf->albedo = sd->svm_closure_weight;
- bssrdf->texture_blur = param2;
- bssrdf->sharpness = stack_load_float(stack, data_node.w);
bssrdf->N = N;
- bssrdf->roughness = 0.0f;
- sd->flag |= bssrdf_setup(sd, bssrdf, (ClosureType)type);
+ bssrdf->roughness = FLT_MAX;
+
+ const float subsurface_ior = clamp(param2, 1.01f, 3.8f);
+ const float subsurface_anisotropy = stack_load_float(stack, data_node.w);
+ bssrdf->anisotropy = clamp(subsurface_anisotropy, 0.0f, 0.9f);
+
+ sd->flag |= bssrdf_setup(sd, bssrdf, (ClosureType)type, subsurface_ior);
}
break;
@@ -907,10 +909,15 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
default:
break;
}
+
+ return offset;
}
-ccl_device void svm_node_closure_volume(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, ShaderType shader_type)
+template<ShaderType shader_type>
+ccl_device_noinline void svm_node_closure_volume(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
#ifdef __VOLUME__
/* Only sum extinction for volumes, variable is shared with surface transparency. */
@@ -961,21 +968,17 @@ ccl_device void svm_node_closure_volume(
#endif
}
-ccl_device void svm_node_principled_volume(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint4 node,
- ShaderType shader_type,
- int path_flag,
- int *offset)
+template<ShaderType shader_type>
+ccl_device_noinline int svm_node_principled_volume(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int path_flag, int offset)
{
#ifdef __VOLUME__
- uint4 value_node = read_node(kg, offset);
- uint4 attr_node = read_node(kg, offset);
+ uint4 value_node = read_node(kg, &offset);
+ uint4 attr_node = read_node(kg, &offset);
/* Only sum extinction for volumes, variable is shared with surface transparency. */
if (shader_type != SHADER_TYPE_VOLUME) {
- return;
+ return offset;
}
uint density_offset, anisotropy_offset, absorption_color_offset, mix_weight_offset;
@@ -985,7 +988,7 @@ ccl_device void svm_node_principled_volume(KernelGlobals *kg,
1.0f);
if (mix_weight == 0.0f) {
- return;
+ return offset;
}
/* Compute density. */
@@ -1034,7 +1037,7 @@ ccl_device void svm_node_principled_volume(KernelGlobals *kg,
/* Compute emission. */
if (path_flag & PATH_RAY_SHADOW) {
/* Don't need emission for shadows. */
- return;
+ return offset;
}
uint emission_offset, emission_color_offset, blackbody_offset, temperature_offset;
@@ -1074,9 +1077,10 @@ ccl_device void svm_node_principled_volume(KernelGlobals *kg,
}
}
#endif
+ return offset;
}
-ccl_device void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 node)
{
uint mix_weight_offset = node.y;
float3 weight = sd->svm_closure_weight;
@@ -1093,7 +1097,7 @@ ccl_device void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 no
emission_setup(sd, weight);
}
-ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_closure_background(ShaderData *sd, float *stack, uint4 node)
{
uint mix_weight_offset = node.y;
float3 weight = sd->svm_closure_weight;
@@ -1110,7 +1114,7 @@ ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4
background_setup(sd, weight);
}
-ccl_device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 node)
{
uint mix_weight_offset = node.y;
@@ -1145,14 +1149,13 @@ ccl_device void svm_node_closure_set_weight(ShaderData *sd, uint r, uint g, uint
ccl_device void svm_node_closure_weight(ShaderData *sd, float *stack, uint weight_offset)
{
float3 weight = stack_load_float3(stack, weight_offset);
-
svm_node_closure_store_weight(sd, weight);
}
-ccl_device void svm_node_emission_weight(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint4 node)
+ccl_device_noinline void svm_node_emission_weight(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint color_offset = node.y;
uint strength_offset = node.z;
@@ -1163,7 +1166,7 @@ ccl_device void svm_node_emission_weight(KernelGlobals *kg,
svm_node_closure_store_weight(sd, weight);
}
-ccl_device void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node)
{
/* fetch weight from blend input, previous mix closures,
* and write to stack to be used by closure nodes later */
@@ -1186,7 +1189,7 @@ ccl_device void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node)
/* (Bump) normal */
ccl_device void svm_node_set_normal(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint in_direction, uint out_normal)
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint in_direction, uint out_normal)
{
float3 normal = stack_load_float3(stack, in_direction);
sd->N = normal;
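
The template<uint node_feature_mask> parameters and KERNEL_NODES_FEATURE() checks added in this file replace the NODE_GROUP_LEVEL_* / NODE_FEATURE_* defines removed from svm_types.h later in this diff: feature selection moves from global preprocessor defines to a compile-time template argument per kernel variant. A standalone sketch (plain C++17, flag and function names simplified, not the actual kernel macros) of the idea:

#include <cstdint>

enum KernelFeatureFlag : uint32_t {
  KERNEL_FEATURE_NODE_BSDF = 1u << 0,
  KERNEL_FEATURE_NODE_RAYTRACE = 1u << 1,
};

template<uint32_t node_feature_mask>
int closure_bsdf_node(int offset, float mix_weight)
{
  if constexpr ((node_feature_mask & KERNEL_FEATURE_NODE_BSDF) == 0) {
    /* Whole BSDF body is compiled out for kernel variants without this bit. */
    return offset;
  }
  if (mix_weight == 0.0f) {
    return offset;
  }
  /* ... evaluate closures ... */
  return offset;
}

/* Usage: a full shading kernel instantiates
 * closure_bsdf_node<KERNEL_FEATURE_NODE_BSDF>(...), while a variant that never
 * needs BSDF evaluation instantiates closure_bsdf_node<0>(...). */
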
diff --git a/intern/cycles/kernel/svm/svm_convert.h b/intern/cycles/kernel/svm/svm_convert.h
index 5df6c9fb755..37d40167ccc 100644
--- a/intern/cycles/kernel/svm/svm_convert.h
+++ b/intern/cycles/kernel/svm/svm_convert.h
@@ -18,8 +18,8 @@ CCL_NAMESPACE_BEGIN
/* Conversion Nodes */
-ccl_device void svm_node_convert(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint from, uint to)
+ccl_device_noinline void svm_node_convert(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint from, uint to)
{
switch (type) {
case NODE_CONVERT_FI: {
diff --git a/intern/cycles/kernel/svm/svm_displace.h b/intern/cycles/kernel/svm/svm_displace.h
index 250fac6bcb8..a1d952173d8 100644
--- a/intern/cycles/kernel/svm/svm_displace.h
+++ b/intern/cycles/kernel/svm/svm_displace.h
@@ -14,11 +14,16 @@
* limitations under the License.
*/
+#include "kernel/kernel_montecarlo.h"
+
CCL_NAMESPACE_BEGIN
/* Bump Node */
-ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_set_bump(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
#ifdef __RAY_DIFFERENTIALS__
/* get normal input */
@@ -83,7 +88,7 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac
/* Displacement Node */
-ccl_device void svm_node_set_displacement(KernelGlobals *kg,
+ccl_device void svm_node_set_displacement(const KernelGlobals *kg,
ShaderData *sd,
float *stack,
uint fac_offset)
@@ -92,7 +97,10 @@ ccl_device void svm_node_set_displacement(KernelGlobals *kg,
sd->P += dP;
}
-ccl_device void svm_node_displacement(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_displacement(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint height_offset, midlevel_offset, scale_offset, normal_offset;
svm_unpack_node_uchar4(node.y, &height_offset, &midlevel_offset, &scale_offset, &normal_offset);
@@ -119,10 +127,10 @@ ccl_device void svm_node_displacement(KernelGlobals *kg, ShaderData *sd, float *
stack_store_float3(stack, node.z, dP);
}
-ccl_device void svm_node_vector_displacement(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_vector_displacement(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset)
{
- uint4 data_node = read_node(kg, offset);
+ uint4 data_node = read_node(kg, &offset);
uint space = data_node.x;
uint vector_offset, midlevel_offset, scale_offset, displacement_offset;
@@ -164,6 +172,7 @@ ccl_device void svm_node_vector_displacement(
}
stack_store_float3(stack, displacement_offset, dP);
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_fresnel.h b/intern/cycles/kernel/svm/svm_fresnel.h
index 96d602e35bf..b5ecdbe2abf 100644
--- a/intern/cycles/kernel/svm/svm_fresnel.h
+++ b/intern/cycles/kernel/svm/svm_fresnel.h
@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
/* Fresnel Node */
-ccl_device void svm_node_fresnel(
+ccl_device_noinline void svm_node_fresnel(
ShaderData *sd, float *stack, uint ior_offset, uint ior_value, uint node)
{
uint normal_offset, out_offset;
@@ -37,7 +37,7 @@ ccl_device void svm_node_fresnel(
/* Layer Weight Node */
-ccl_device void svm_node_layer_weight(ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_layer_weight(ShaderData *sd, float *stack, uint4 node)
{
uint blend_offset = node.y;
uint blend_value = node.z;
diff --git a/intern/cycles/kernel/svm/svm_gamma.h b/intern/cycles/kernel/svm/svm_gamma.h
index 65eb08eb0eb..f6fafdee941 100644
--- a/intern/cycles/kernel/svm/svm_gamma.h
+++ b/intern/cycles/kernel/svm/svm_gamma.h
@@ -16,7 +16,7 @@
CCL_NAMESPACE_BEGIN
-ccl_device void svm_node_gamma(
+ccl_device_noinline void svm_node_gamma(
ShaderData *sd, float *stack, uint in_gamma, uint in_color, uint out_color)
{
float3 color = stack_load_float3(stack, in_color);
diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h
index e48e96dcfa4..10e9f291d0e 100644
--- a/intern/cycles/kernel/svm/svm_geometry.h
+++ b/intern/cycles/kernel/svm/svm_geometry.h
@@ -18,8 +18,8 @@ CCL_NAMESPACE_BEGIN
/* Geometry Node */
-ccl_device_inline void svm_node_geometry(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
+ccl_device_noinline void svm_node_geometry(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
{
float3 data;
@@ -51,8 +51,8 @@ ccl_device_inline void svm_node_geometry(
stack_store_float3(stack, out_offset, data);
}
-ccl_device void svm_node_geometry_bump_dx(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
+ccl_device_noinline void svm_node_geometry_bump_dx(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
{
#ifdef __RAY_DIFFERENTIALS__
float3 data;
@@ -75,8 +75,8 @@ ccl_device void svm_node_geometry_bump_dx(
#endif
}
-ccl_device void svm_node_geometry_bump_dy(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
+ccl_device_noinline void svm_node_geometry_bump_dy(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
{
#ifdef __RAY_DIFFERENTIALS__
float3 data;
@@ -101,8 +101,8 @@ ccl_device void svm_node_geometry_bump_dy(
/* Object Info */
-ccl_device void svm_node_object_info(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
+ccl_device_noinline void svm_node_object_info(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
{
float data;
@@ -140,8 +140,8 @@ ccl_device void svm_node_object_info(
/* Particle Info */
-ccl_device void svm_node_particle_info(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
+ccl_device_noinline void svm_node_particle_info(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
{
switch (type) {
case NODE_INFO_PAR_INDEX: {
@@ -199,8 +199,8 @@ ccl_device void svm_node_particle_info(
/* Hair Info */
-ccl_device void svm_node_hair_info(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
+ccl_device_noinline void svm_node_hair_info(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
{
float data;
float3 data3;
diff --git a/intern/cycles/kernel/svm/svm_gradient.h b/intern/cycles/kernel/svm/svm_gradient.h
index 08304bc47e8..cd15f7097e7 100644
--- a/intern/cycles/kernel/svm/svm_gradient.h
+++ b/intern/cycles/kernel/svm/svm_gradient.h
@@ -60,7 +60,7 @@ ccl_device float svm_gradient(float3 p, NodeGradientType type)
return 0.0f;
}
-ccl_device void svm_node_tex_gradient(ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_tex_gradient(ShaderData *sd, float *stack, uint4 node)
{
uint type, co_offset, color_offset, fac_offset;
diff --git a/intern/cycles/kernel/svm/svm_hsv.h b/intern/cycles/kernel/svm/svm_hsv.h
index c299cf58c7f..6f49a8385aa 100644
--- a/intern/cycles/kernel/svm/svm_hsv.h
+++ b/intern/cycles/kernel/svm/svm_hsv.h
@@ -19,8 +19,10 @@
CCL_NAMESPACE_BEGIN
-ccl_device void svm_node_hsv(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline void svm_node_hsv(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint in_color_offset, fac_offset, out_color_offset;
uint hue_offset, sat_offset, val_offset;
diff --git a/intern/cycles/kernel/svm/svm_ies.h b/intern/cycles/kernel/svm/svm_ies.h
index 56c804b44d0..9c13734ecf0 100644
--- a/intern/cycles/kernel/svm/svm_ies.h
+++ b/intern/cycles/kernel/svm/svm_ies.h
@@ -19,7 +19,7 @@ CCL_NAMESPACE_BEGIN
/* IES Light */
ccl_device_inline float interpolate_ies_vertical(
- KernelGlobals *kg, int ofs, int v, int v_num, float v_frac, int h)
+ const KernelGlobals *kg, int ofs, int v, int v_num, float v_frac, int h)
{
/* Since lookups are performed in spherical coordinates, clamping the coordinates at the low end
* of v (corresponding to the north pole) would result in artifacts. The proper way of dealing
@@ -39,7 +39,7 @@ ccl_device_inline float interpolate_ies_vertical(
return cubic_interp(a, b, c, d, v_frac);
}
-ccl_device_inline float kernel_ies_interp(KernelGlobals *kg,
+ccl_device_inline float kernel_ies_interp(const KernelGlobals *kg,
int slot,
float h_angle,
float v_angle)
@@ -98,8 +98,10 @@ ccl_device_inline float kernel_ies_interp(KernelGlobals *kg,
return max(cubic_interp(a, b, c, d, h_frac), 0.0f);
}
-ccl_device void svm_node_ies(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline void svm_node_ies(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint vector_offset, strength_offset, fac_offset, slot = node.z;
svm_unpack_node_uchar3(node.y, &strength_offset, &vector_offset, &fac_offset);
diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h
index 9348ddabde5..a344f36977a 100644
--- a/intern/cycles/kernel/svm/svm_image.h
+++ b/intern/cycles/kernel/svm/svm_image.h
@@ -16,7 +16,7 @@
CCL_NAMESPACE_BEGIN
-ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint flags)
+ccl_device float4 svm_image_texture(const KernelGlobals *kg, int id, float x, float y, uint flags)
{
if (id == -1) {
return make_float4(
@@ -44,8 +44,8 @@ ccl_device_inline float3 texco_remap_square(float3 co)
return (co - make_float3(0.5f, 0.5f, 0.5f)) * 2.0f;
}
-ccl_device void svm_node_tex_image(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_tex_image(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset)
{
uint co_offset, out_offset, alpha_offset, flags;
@@ -71,7 +71,7 @@ ccl_device void svm_node_tex_image(
int num_nodes = (int)node.y;
if (num_nodes > 0) {
/* Remember the offset of the node following the tile nodes. */
- int next_offset = (*offset) + num_nodes;
+ int next_offset = offset + num_nodes;
/* Find the tile that the UV lies in. */
int tx = (int)tex_co.x;
@@ -83,7 +83,7 @@ ccl_device void svm_node_tex_image(
/* Find the index of the tile. */
for (int i = 0; i < num_nodes; i++) {
- uint4 tile_node = read_node(kg, offset);
+ uint4 tile_node = read_node(kg, &offset);
if (tile_node.x == tile) {
id = tile_node.y;
break;
@@ -102,7 +102,7 @@ ccl_device void svm_node_tex_image(
}
/* Skip over the remaining nodes. */
- *offset = next_offset;
+ offset = next_offset;
}
else {
id = -num_nodes;
@@ -114,9 +114,13 @@ ccl_device void svm_node_tex_image(
stack_store_float3(stack, out_offset, make_float3(f.x, f.y, f.z));
if (stack_valid(alpha_offset))
stack_store_float(stack, alpha_offset, f.w);
+ return offset;
}
-ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_tex_image_box(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
/* get object space normal */
float3 N = sd->N;
@@ -215,10 +219,10 @@ ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float
stack_store_float(stack, alpha_offset, f.w);
}
-ccl_device void svm_node_tex_environment(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint4 node)
+ccl_device_noinline void svm_node_tex_environment(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint id = node.y;
uint co_offset, out_offset, alpha_offset, flags;
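
The tile loop in svm_node_tex_image above records the number of tile nodes up front, scans them for the tile containing the UV coordinate, and then jumps the offset past the whole list whether or not a match was found. A standalone sketch (plain C++, simplified to a flat array of tile entries):

#include <cstdint>
#include <vector>

struct TileNode { uint32_t tile; uint32_t image_id; };

static int find_tile_image(const std::vector<TileNode> &nodes, int offset,
                           int num_nodes, uint32_t tile, int *image_id)
{
  const int next_offset = offset + num_nodes; /* first node after the tile list */
  *image_id = -1;
  for (int i = 0; i < num_nodes; i++) {
    const TileNode &n = nodes[offset + i];
    if (n.tile == tile) {
      *image_id = (int)n.image_id;
      break;
    }
  }
  return next_offset; /* always skip the whole tile list */
}
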
diff --git a/intern/cycles/kernel/svm/svm_invert.h b/intern/cycles/kernel/svm/svm_invert.h
index 02024742b13..27cdaaff473 100644
--- a/intern/cycles/kernel/svm/svm_invert.h
+++ b/intern/cycles/kernel/svm/svm_invert.h
@@ -21,7 +21,7 @@ ccl_device float invert(float color, float factor)
return factor * (1.0f - color) + (1.0f - factor) * color;
}
-ccl_device void svm_node_invert(
+ccl_device_noinline void svm_node_invert(
ShaderData *sd, float *stack, uint in_fac, uint in_color, uint out_color)
{
float factor = stack_load_float(stack, in_fac);
diff --git a/intern/cycles/kernel/svm/svm_light_path.h b/intern/cycles/kernel/svm/svm_light_path.h
index 768c65918cd..49fabad1cc5 100644
--- a/intern/cycles/kernel/svm/svm_light_path.h
+++ b/intern/cycles/kernel/svm/svm_light_path.h
@@ -18,12 +18,12 @@ CCL_NAMESPACE_BEGIN
/* Light Path Node */
-ccl_device void svm_node_light_path(ShaderData *sd,
- ccl_addr_space PathState *state,
- float *stack,
- uint type,
- uint out_offset,
- int path_flag)
+ccl_device_noinline void svm_node_light_path(INTEGRATOR_STATE_CONST_ARGS,
+ const ShaderData *sd,
+ float *stack,
+ uint type,
+ uint out_offset,
+ int path_flag)
{
float info = 0.0f;
@@ -58,21 +58,47 @@ ccl_device void svm_node_light_path(ShaderData *sd,
case NODE_LP_ray_length:
info = sd->ray_length;
break;
- case NODE_LP_ray_depth:
- info = (float)state->bounce;
+ case NODE_LP_ray_depth: {
+ /* Read the bounce from a different location depending on whether this is a
+ * shadow path. It's a bit dubious to have integrator state details leak into
+ * this function, but that is hard to avoid currently. */
+ int bounce = (INTEGRATOR_STATE_IS_NULL) ? 0 :
+ (path_flag & PATH_RAY_SHADOW) ? INTEGRATOR_STATE(shadow_path, bounce) :
+ INTEGRATOR_STATE(path, bounce);
+
+ /* For background, light emission and shadow evaluation from a surface or
+ * volume we are effectively one bounce further. */
+ if (path_flag & (PATH_RAY_SHADOW | PATH_RAY_EMISSION)) {
+ bounce++;
+ }
+
+ info = (float)bounce;
break;
+ }
+ /* TODO */
+ case NODE_LP_ray_transparent: {
+ const int bounce = (INTEGRATOR_STATE_IS_NULL) ?
+ 0 :
+ (path_flag & PATH_RAY_SHADOW) ?
+ INTEGRATOR_STATE(shadow_path, transparent_bounce) :
+ INTEGRATOR_STATE(path, transparent_bounce);
+
+ info = (float)bounce;
+ break;
+ }
+#if 0
case NODE_LP_ray_diffuse:
info = (float)state->diffuse_bounce;
break;
case NODE_LP_ray_glossy:
info = (float)state->glossy_bounce;
break;
- case NODE_LP_ray_transparent:
- info = (float)state->transparent_bounce;
- break;
+#endif
+#if 0
case NODE_LP_ray_transmission:
info = (float)state->transmission_bounce;
break;
+#endif
}
stack_store_float(stack, out_offset, info);
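
The NODE_LP_ray_depth branch above now pulls the bounce count from whichever state the ray carries, shadow path or main path, and bumps it by one for shadow and emission evaluations. A standalone sketch (plain C++; flag values and struct fields are simplified, not the real Cycles definitions) of that selection logic:

enum PathFlag { PATH_RAY_SHADOW = 1 << 0, PATH_RAY_EMISSION = 1 << 1 };

struct PathStateView {
  int bounce;        /* main path bounce */
  int shadow_bounce; /* shadow path bounce */
};

static float light_path_ray_depth(const PathStateView *state, int path_flag)
{
  if (state == nullptr) {
    return 0.0f; /* no integrator state available */
  }
  int bounce = (path_flag & PATH_RAY_SHADOW) ? state->shadow_bounce : state->bounce;
  if (path_flag & (PATH_RAY_SHADOW | PATH_RAY_EMISSION)) {
    bounce++; /* effectively one bounce further for these evaluations */
  }
  return (float)bounce;
}
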
@@ -80,7 +106,7 @@ ccl_device void svm_node_light_path(ShaderData *sd,
/* Light Falloff Node */
-ccl_device void svm_node_light_falloff(ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_light_falloff(ShaderData *sd, float *stack, uint4 node)
{
uint strength_offset, out_offset, smooth_offset;
diff --git a/intern/cycles/kernel/svm/svm_magic.h b/intern/cycles/kernel/svm/svm_magic.h
index 9c160e6d8cc..8784c760860 100644
--- a/intern/cycles/kernel/svm/svm_magic.h
+++ b/intern/cycles/kernel/svm/svm_magic.h
@@ -87,8 +87,8 @@ ccl_device_noinline_cpu float3 svm_magic(float3 p, int n, float distortion)
return make_float3(0.5f - x, 0.5f - y, 0.5f - z);
}
-ccl_device void svm_node_tex_magic(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_tex_magic(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset)
{
uint depth;
uint scale_offset, distortion_offset, co_offset, fac_offset, color_offset;
@@ -96,7 +96,7 @@ ccl_device void svm_node_tex_magic(
svm_unpack_node_uchar3(node.y, &depth, &color_offset, &fac_offset);
svm_unpack_node_uchar3(node.z, &co_offset, &scale_offset, &distortion_offset);
- uint4 node2 = read_node(kg, offset);
+ uint4 node2 = read_node(kg, &offset);
float3 co = stack_load_float3(stack, co_offset);
float scale = stack_load_float_default(stack, scale_offset, node2.x);
float distortion = stack_load_float_default(stack, distortion_offset, node2.y);
@@ -107,6 +107,7 @@ ccl_device void svm_node_tex_magic(
stack_store_float(stack, fac_offset, average(color));
if (stack_valid(color_offset))
stack_store_float3(stack, color_offset, color);
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_map_range.h b/intern/cycles/kernel/svm/svm_map_range.h
index 533a631c837..c8684981e31 100644
--- a/intern/cycles/kernel/svm/svm_map_range.h
+++ b/intern/cycles/kernel/svm/svm_map_range.h
@@ -24,13 +24,13 @@ ccl_device_inline float smootherstep(float edge0, float edge1, float x)
return x * x * x * (x * (x * 6.0f - 15.0f) + 10.0f);
}
-ccl_device void svm_node_map_range(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint value_stack_offset,
- uint parameters_stack_offsets,
- uint results_stack_offsets,
- int *offset)
+ccl_device_noinline int svm_node_map_range(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint value_stack_offset,
+ uint parameters_stack_offsets,
+ uint results_stack_offsets,
+ int offset)
{
uint from_min_stack_offset, from_max_stack_offset, to_min_stack_offset, to_max_stack_offset;
uint type_stack_offset, steps_stack_offset, result_stack_offset;
@@ -42,8 +42,8 @@ ccl_device void svm_node_map_range(KernelGlobals *kg,
svm_unpack_node_uchar3(
results_stack_offsets, &type_stack_offset, &steps_stack_offset, &result_stack_offset);
- uint4 defaults = read_node(kg, offset);
- uint4 defaults2 = read_node(kg, offset);
+ uint4 defaults = read_node(kg, &offset);
+ uint4 defaults2 = read_node(kg, &offset);
float value = stack_load_float(stack, value_stack_offset);
float from_min = stack_load_float_default(stack, from_min_stack_offset, defaults.x);
@@ -83,6 +83,7 @@ ccl_device void svm_node_map_range(KernelGlobals *kg,
result = 0.0f;
}
stack_store_float(stack, result_stack_offset, result);
+ return offset;
}
CCL_NAMESPACE_END
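
The smootherstep shown in context above is the quintic 6x^5 - 15x^4 + 10x^3 written in Horner form; its first and second derivatives vanish at both endpoints. A small standalone check (plain C++, assuming the usual clamp of the normalized x to [0, 1], which is not visible in the hunk):

#include <algorithm>
#include <cassert>
#include <cmath>

static float smootherstep(float edge0, float edge1, float x)
{
  x = std::min(std::max((x - edge0) / (edge1 - edge0), 0.0f), 1.0f);
  return x * x * x * (x * (x * 6.0f - 15.0f) + 10.0f);
}

int main()
{
  for (int i = 0; i <= 10; i++) {
    const float x = i / 10.0f;
    const float direct = 6.0f * std::pow(x, 5.0f) - 15.0f * std::pow(x, 4.0f) +
                         10.0f * std::pow(x, 3.0f);
    assert(std::fabs(smootherstep(0.0f, 1.0f, x) - direct) < 1e-5f);
  }
  return 0;
}
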
diff --git a/intern/cycles/kernel/svm/svm_mapping.h b/intern/cycles/kernel/svm/svm_mapping.h
index 6e19c859e19..fcc724405f5 100644
--- a/intern/cycles/kernel/svm/svm_mapping.h
+++ b/intern/cycles/kernel/svm/svm_mapping.h
@@ -18,13 +18,12 @@ CCL_NAMESPACE_BEGIN
/* Mapping Node */
-ccl_device void svm_node_mapping(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint type,
- uint inputs_stack_offsets,
- uint result_stack_offset,
- int *offset)
+ccl_device_noinline void svm_node_mapping(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint type,
+ uint inputs_stack_offsets,
+ uint result_stack_offset)
{
uint vector_stack_offset, location_stack_offset, rotation_stack_offset, scale_stack_offset;
svm_unpack_node_uchar4(inputs_stack_offsets,
@@ -44,30 +43,40 @@ ccl_device void svm_node_mapping(KernelGlobals *kg,
/* Texture Mapping */
-ccl_device void svm_node_texture_mapping(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint vec_offset, uint out_offset, int *offset)
+ccl_device_noinline int svm_node_texture_mapping(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint vec_offset,
+ uint out_offset,
+ int offset)
{
float3 v = stack_load_float3(stack, vec_offset);
Transform tfm;
- tfm.x = read_node_float(kg, offset);
- tfm.y = read_node_float(kg, offset);
- tfm.z = read_node_float(kg, offset);
+ tfm.x = read_node_float(kg, &offset);
+ tfm.y = read_node_float(kg, &offset);
+ tfm.z = read_node_float(kg, &offset);
float3 r = transform_point(&tfm, v);
stack_store_float3(stack, out_offset, r);
+ return offset;
}
-ccl_device void svm_node_min_max(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint vec_offset, uint out_offset, int *offset)
+ccl_device_noinline int svm_node_min_max(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint vec_offset,
+ uint out_offset,
+ int offset)
{
float3 v = stack_load_float3(stack, vec_offset);
- float3 mn = float4_to_float3(read_node_float(kg, offset));
- float3 mx = float4_to_float3(read_node_float(kg, offset));
+ float3 mn = float4_to_float3(read_node_float(kg, &offset));
+ float3 mx = float4_to_float3(read_node_float(kg, &offset));
float3 r = min(max(mn, v), mx);
stack_store_float3(stack, out_offset, r);
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_math.h b/intern/cycles/kernel/svm/svm_math.h
index 733ea28f9e5..99e7a8f2bda 100644
--- a/intern/cycles/kernel/svm/svm_math.h
+++ b/intern/cycles/kernel/svm/svm_math.h
@@ -16,13 +16,12 @@
CCL_NAMESPACE_BEGIN
-ccl_device void svm_node_math(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint type,
- uint inputs_stack_offsets,
- uint result_stack_offset,
- int *offset)
+ccl_device_noinline void svm_node_math(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint type,
+ uint inputs_stack_offsets,
+ uint result_stack_offset)
{
uint a_stack_offset, b_stack_offset, c_stack_offset;
svm_unpack_node_uchar3(inputs_stack_offsets, &a_stack_offset, &b_stack_offset, &c_stack_offset);
@@ -35,13 +34,13 @@ ccl_device void svm_node_math(KernelGlobals *kg,
stack_store_float(stack, result_stack_offset, result);
}
-ccl_device void svm_node_vector_math(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint type,
- uint inputs_stack_offsets,
- uint outputs_stack_offsets,
- int *offset)
+ccl_device_noinline int svm_node_vector_math(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint type,
+ uint inputs_stack_offsets,
+ uint outputs_stack_offsets,
+ int offset)
{
uint value_stack_offset, vector_stack_offset;
uint a_stack_offset, b_stack_offset, param1_stack_offset;
@@ -60,7 +59,7 @@ ccl_device void svm_node_vector_math(KernelGlobals *kg,
/* 3 Vector Operators */
if (type == NODE_VECTOR_MATH_WRAP || type == NODE_VECTOR_MATH_FACEFORWARD ||
type == NODE_VECTOR_MATH_MULTIPLY_ADD) {
- uint4 extra_node = read_node(kg, offset);
+ uint4 extra_node = read_node(kg, &offset);
c = stack_load_float3(stack, extra_node.x);
}
@@ -70,6 +69,7 @@ ccl_device void svm_node_vector_math(KernelGlobals *kg,
stack_store_float(stack, value_stack_offset, value);
if (stack_valid(vector_stack_offset))
stack_store_float3(stack, vector_stack_offset, vector);
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_mix.h b/intern/cycles/kernel/svm/svm_mix.h
index 15114bfd5e4..3e38080977f 100644
--- a/intern/cycles/kernel/svm/svm_mix.h
+++ b/intern/cycles/kernel/svm/svm_mix.h
@@ -18,16 +18,16 @@ CCL_NAMESPACE_BEGIN
/* Node */
-ccl_device void svm_node_mix(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint fac_offset,
- uint c1_offset,
- uint c2_offset,
- int *offset)
+ccl_device_noinline int svm_node_mix(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint fac_offset,
+ uint c1_offset,
+ uint c2_offset,
+ int offset)
{
/* read extra data */
- uint4 node1 = read_node(kg, offset);
+ uint4 node1 = read_node(kg, &offset);
float fac = stack_load_float(stack, fac_offset);
float3 c1 = stack_load_float3(stack, c1_offset);
@@ -35,6 +35,7 @@ ccl_device void svm_node_mix(KernelGlobals *kg,
float3 result = svm_mix((NodeMix)node1.y, fac, c1, c2);
stack_store_float3(stack, node1.z, result);
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_musgrave.h b/intern/cycles/kernel/svm/svm_musgrave.h
index 571f62fe27f..03a8b68b3ef 100644
--- a/intern/cycles/kernel/svm/svm_musgrave.h
+++ b/intern/cycles/kernel/svm/svm_musgrave.h
@@ -700,13 +700,13 @@ ccl_device_noinline_cpu float noise_musgrave_ridged_multi_fractal_4d(
return value;
}
-ccl_device void svm_node_tex_musgrave(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint offsets1,
- uint offsets2,
- uint offsets3,
- int *offset)
+ccl_device_noinline int svm_node_tex_musgrave(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint offsets1,
+ uint offsets2,
+ uint offsets3,
+ int offset)
{
uint type, dimensions, co_stack_offset, w_stack_offset;
uint scale_stack_offset, detail_stack_offset, dimension_stack_offset, lacunarity_stack_offset;
@@ -720,8 +720,8 @@ ccl_device void svm_node_tex_musgrave(KernelGlobals *kg,
&lacunarity_stack_offset);
svm_unpack_node_uchar3(offsets3, &offset_stack_offset, &gain_stack_offset, &fac_stack_offset);
- uint4 defaults1 = read_node(kg, offset);
- uint4 defaults2 = read_node(kg, offset);
+ uint4 defaults1 = read_node(kg, &offset);
+ uint4 defaults2 = read_node(kg, &offset);
float3 co = stack_load_float3(stack, co_stack_offset);
float w = stack_load_float_default(stack, w_stack_offset, defaults1.x);
@@ -844,6 +844,7 @@ ccl_device void svm_node_tex_musgrave(KernelGlobals *kg,
}
stack_store_float(stack, fac_stack_offset, fac);
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_noise.h b/intern/cycles/kernel/svm/svm_noise.h
index 94d8bfde555..ecb4df6afdf 100644
--- a/intern/cycles/kernel/svm/svm_noise.h
+++ b/intern/cycles/kernel/svm/svm_noise.h
@@ -330,7 +330,7 @@ ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y)
* |__________________________|
*
*/
-ccl_device_noinline float perlin_2d(float x, float y)
+ccl_device_noinline_cpu float perlin_2d(float x, float y)
{
ssei XY;
ssef fxy = floorfrac(ssef(x, y, 0.0f, 0.0f), &XY);
@@ -447,7 +447,7 @@ ccl_device_inline ssef quad_mix(ssef p, ssef q, ssef r, ssef s, ssef f)
* v7 (1, 1, 1)
*
*/
-ccl_device_noinline float perlin_3d(float x, float y, float z)
+ccl_device_noinline_cpu float perlin_3d(float x, float y, float z)
{
ssei XYZ;
ssef fxyz = floorfrac(ssef(x, y, z, 0.0f), &XYZ);
@@ -501,7 +501,7 @@ ccl_device_noinline float perlin_3d(float x, float y, float z)
* v15 (1, 1, 1, 1)
*
*/
-ccl_device_noinline float perlin_4d(float x, float y, float z, float w)
+ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w)
{
ssei XYZW;
ssef fxyzw = floorfrac(ssef(x, y, z, w), &XYZW);
@@ -585,7 +585,7 @@ ccl_device_inline ssef quad_mix(avxf p, avxf q, ssef f)
* |__________________________|
*
*/
-ccl_device_noinline float perlin_3d(float x, float y, float z)
+ccl_device_noinline_cpu float perlin_3d(float x, float y, float z)
{
ssei XYZ;
ssef fxyz = floorfrac(ssef(x, y, z, 0.0f), &XYZ);
@@ -637,7 +637,7 @@ ccl_device_noinline float perlin_3d(float x, float y, float z)
* v15 (1, 1, 1, 1)
*
*/
-ccl_device_noinline float perlin_4d(float x, float y, float z, float w)
+ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w)
{
ssei XYZW;
ssef fxyzw = floorfrac(ssef(x, y, z, w), &XYZW);
diff --git a/intern/cycles/kernel/svm/svm_noisetex.h b/intern/cycles/kernel/svm/svm_noisetex.h
index 61fd9553802..29b262ac06e 100644
--- a/intern/cycles/kernel/svm/svm_noisetex.h
+++ b/intern/cycles/kernel/svm/svm_noisetex.h
@@ -140,13 +140,13 @@ ccl_device void noise_texture_4d(float4 co,
}
}
-ccl_device void svm_node_tex_noise(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint dimensions,
- uint offsets1,
- uint offsets2,
- int *offset)
+ccl_device_noinline int svm_node_tex_noise(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint dimensions,
+ uint offsets1,
+ uint offsets2,
+ int offset)
{
uint vector_stack_offset, w_stack_offset, scale_stack_offset;
uint detail_stack_offset, roughness_stack_offset, distortion_stack_offset;
@@ -160,8 +160,8 @@ ccl_device void svm_node_tex_noise(KernelGlobals *kg,
&value_stack_offset,
&color_stack_offset);
- uint4 defaults1 = read_node(kg, offset);
- uint4 defaults2 = read_node(kg, offset);
+ uint4 defaults1 = read_node(kg, &offset);
+ uint4 defaults2 = read_node(kg, &offset);
float3 vector = stack_load_float3(stack, vector_stack_offset);
float w = stack_load_float_default(stack, w_stack_offset, defaults1.x);
@@ -212,6 +212,7 @@ ccl_device void svm_node_tex_noise(KernelGlobals *kg,
if (stack_valid(color_stack_offset)) {
stack_store_float3(stack, color_stack_offset, color);
}
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_normal.h b/intern/cycles/kernel/svm/svm_normal.h
index 4cd3eab0ed2..724b5f281f9 100644
--- a/intern/cycles/kernel/svm/svm_normal.h
+++ b/intern/cycles/kernel/svm/svm_normal.h
@@ -16,16 +16,16 @@
CCL_NAMESPACE_BEGIN
-ccl_device void svm_node_normal(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint in_normal_offset,
- uint out_normal_offset,
- uint out_dot_offset,
- int *offset)
+ccl_device_noinline int svm_node_normal(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint in_normal_offset,
+ uint out_normal_offset,
+ uint out_dot_offset,
+ int offset)
{
/* read extra data */
- uint4 node1 = read_node(kg, offset);
+ uint4 node1 = read_node(kg, &offset);
float3 normal = stack_load_float3(stack, in_normal_offset);
float3 direction;
@@ -39,6 +39,7 @@ ccl_device void svm_node_normal(KernelGlobals *kg,
if (stack_valid(out_dot_offset))
stack_store_float(stack, out_dot_offset, dot(direction, normalize(normal)));
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_ramp.h b/intern/cycles/kernel/svm/svm_ramp.h
index 85ccf39144b..e92df3c093c 100644
--- a/intern/cycles/kernel/svm/svm_ramp.h
+++ b/intern/cycles/kernel/svm/svm_ramp.h
@@ -21,8 +21,12 @@ CCL_NAMESPACE_BEGIN
/* NOTE: svm_ramp.h, svm_ramp_util.h and node_ramp_util.h must stay consistent */
-ccl_device_inline float4 rgb_ramp_lookup(
- KernelGlobals *kg, int offset, float f, bool interpolate, bool extrapolate, int table_size)
+ccl_device_inline float4 rgb_ramp_lookup(const KernelGlobals *kg,
+ int offset,
+ float f,
+ bool interpolate,
+ bool extrapolate,
+ int table_size)
{
if ((f < 0.0f || f > 1.0f) && extrapolate) {
float4 t0, dy;
@@ -53,34 +57,35 @@ ccl_device_inline float4 rgb_ramp_lookup(
return a;
}
-ccl_device void svm_node_rgb_ramp(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_rgb_ramp(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset)
{
uint fac_offset, color_offset, alpha_offset;
uint interpolate = node.z;
svm_unpack_node_uchar3(node.y, &fac_offset, &color_offset, &alpha_offset);
- uint table_size = read_node(kg, offset).x;
+ uint table_size = read_node(kg, &offset).x;
float fac = stack_load_float(stack, fac_offset);
- float4 color = rgb_ramp_lookup(kg, *offset, fac, interpolate, false, table_size);
+ float4 color = rgb_ramp_lookup(kg, offset, fac, interpolate, false, table_size);
if (stack_valid(color_offset))
stack_store_float3(stack, color_offset, float4_to_float3(color));
if (stack_valid(alpha_offset))
stack_store_float(stack, alpha_offset, color.w);
- *offset += table_size;
+ offset += table_size;
+ return offset;
}
-ccl_device void svm_node_curves(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_curves(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset)
{
uint fac_offset, color_offset, out_offset;
svm_unpack_node_uchar3(node.y, &fac_offset, &color_offset, &out_offset);
- uint table_size = read_node(kg, offset).x;
+ uint table_size = read_node(kg, &offset).x;
float fac = stack_load_float(stack, fac_offset);
float3 color = stack_load_float3(stack, color_offset);
@@ -89,14 +94,15 @@ ccl_device void svm_node_curves(
const float range_x = max_x - min_x;
const float3 relpos = (color - make_float3(min_x, min_x, min_x)) / range_x;
- float r = rgb_ramp_lookup(kg, *offset, relpos.x, true, true, table_size).x;
- float g = rgb_ramp_lookup(kg, *offset, relpos.y, true, true, table_size).y;
- float b = rgb_ramp_lookup(kg, *offset, relpos.z, true, true, table_size).z;
+ float r = rgb_ramp_lookup(kg, offset, relpos.x, true, true, table_size).x;
+ float g = rgb_ramp_lookup(kg, offset, relpos.y, true, true, table_size).y;
+ float b = rgb_ramp_lookup(kg, offset, relpos.z, true, true, table_size).z;
color = (1.0f - fac) * color + fac * make_float3(r, g, b);
stack_store_float3(stack, out_offset, color);
- *offset += table_size;
+ offset += table_size;
+ return offset;
}
CCL_NAMESPACE_END
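
svm_node_rgb_ramp and svm_node_curves above store their lookup tables inline in the instruction stream: one word holds the table size, the lookup reads from the current offset, and the offset is then advanced past the table. A standalone sketch (plain C++, reduced to a flat float array and a nearest-entry lookup; the kernel stores float4 entries read through read_node and interpolates):

#include <cstdint>
#include <vector>

static float table_lookup(const std::vector<float> &program, int offset,
                          float f, int table_size)
{
  int i = (int)(f * (table_size - 1) + 0.5f);
  i = i < 0 ? 0 : (i >= table_size ? table_size - 1 : i);
  return program[offset + i];
}

static int ramp_node(const std::vector<float> &program, int offset, float fac,
                     float *out)
{
  const int table_size = (int)program[offset++]; /* first word: table size */
  *out = table_lookup(program, offset, fac, table_size);
  return offset + table_size; /* skip the table so decoding can continue */
}
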
diff --git a/intern/cycles/kernel/svm/svm_sepcomb_hsv.h b/intern/cycles/kernel/svm/svm_sepcomb_hsv.h
index f501252062e..8d52845ea3d 100644
--- a/intern/cycles/kernel/svm/svm_sepcomb_hsv.h
+++ b/intern/cycles/kernel/svm/svm_sepcomb_hsv.h
@@ -16,15 +16,15 @@
CCL_NAMESPACE_BEGIN
-ccl_device void svm_node_combine_hsv(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint hue_in,
- uint saturation_in,
- uint value_in,
- int *offset)
+ccl_device_noinline int svm_node_combine_hsv(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint hue_in,
+ uint saturation_in,
+ uint value_in,
+ int offset)
{
- uint4 node1 = read_node(kg, offset);
+ uint4 node1 = read_node(kg, &offset);
uint color_out = node1.y;
float hue = stack_load_float(stack, hue_in);
@@ -36,17 +36,18 @@ ccl_device void svm_node_combine_hsv(KernelGlobals *kg,
if (stack_valid(color_out))
stack_store_float3(stack, color_out, color);
+ return offset;
}
-ccl_device void svm_node_separate_hsv(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint color_in,
- uint hue_out,
- uint saturation_out,
- int *offset)
+ccl_device_noinline int svm_node_separate_hsv(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint color_in,
+ uint hue_out,
+ uint saturation_out,
+ int offset)
{
- uint4 node1 = read_node(kg, offset);
+ uint4 node1 = read_node(kg, &offset);
uint value_out = node1.y;
float3 color = stack_load_float3(stack, color_in);
@@ -60,6 +61,7 @@ ccl_device void svm_node_separate_hsv(KernelGlobals *kg,
stack_store_float(stack, saturation_out, color.y);
if (stack_valid(value_out))
stack_store_float(stack, value_out, color.z);
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_sky.h b/intern/cycles/kernel/svm/svm_sky.h
index b908732f026..b77c4311e72 100644
--- a/intern/cycles/kernel/svm/svm_sky.h
+++ b/intern/cycles/kernel/svm/svm_sky.h
@@ -37,7 +37,7 @@ ccl_device float sky_perez_function(float *lam, float theta, float gamma)
(1.0f + lam[2] * expf(lam[3] * gamma) + lam[4] * cgamma * cgamma);
}
-ccl_device float3 sky_radiance_preetham(KernelGlobals *kg,
+ccl_device float3 sky_radiance_preetham(const KernelGlobals *kg,
float3 dir,
float sunphi,
float suntheta,
@@ -90,7 +90,7 @@ ccl_device float sky_radiance_internal(float *configuration, float theta, float
configuration[6] * mieM + configuration[7] * zenith);
}
-ccl_device float3 sky_radiance_hosek(KernelGlobals *kg,
+ccl_device float3 sky_radiance_hosek(const KernelGlobals *kg,
float3 dir,
float sunphi,
float suntheta,
@@ -127,7 +127,7 @@ ccl_device float3 geographical_to_direction(float lat, float lon)
return make_float3(cos(lat) * cos(lon), cos(lat) * sin(lon), sin(lat));
}
-ccl_device float3 sky_radiance_nishita(KernelGlobals *kg,
+ccl_device float3 sky_radiance_nishita(const KernelGlobals *kg,
float3 dir,
float *nishita_data,
uint texture_id)
@@ -209,8 +209,8 @@ ccl_device float3 sky_radiance_nishita(KernelGlobals *kg,
return xyz_to_rgb(kg, xyz);
}
-ccl_device void svm_node_tex_sky(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_tex_sky(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset)
{
/* Load data */
uint dir_offset = node.y;
@@ -226,49 +226,49 @@ ccl_device void svm_node_tex_sky(
float sunphi, suntheta, radiance_x, radiance_y, radiance_z;
float config_x[9], config_y[9], config_z[9];
- float4 data = read_node_float(kg, offset);
+ float4 data = read_node_float(kg, &offset);
sunphi = data.x;
suntheta = data.y;
radiance_x = data.z;
radiance_y = data.w;
- data = read_node_float(kg, offset);
+ data = read_node_float(kg, &offset);
radiance_z = data.x;
config_x[0] = data.y;
config_x[1] = data.z;
config_x[2] = data.w;
- data = read_node_float(kg, offset);
+ data = read_node_float(kg, &offset);
config_x[3] = data.x;
config_x[4] = data.y;
config_x[5] = data.z;
config_x[6] = data.w;
- data = read_node_float(kg, offset);
+ data = read_node_float(kg, &offset);
config_x[7] = data.x;
config_x[8] = data.y;
config_y[0] = data.z;
config_y[1] = data.w;
- data = read_node_float(kg, offset);
+ data = read_node_float(kg, &offset);
config_y[2] = data.x;
config_y[3] = data.y;
config_y[4] = data.z;
config_y[5] = data.w;
- data = read_node_float(kg, offset);
+ data = read_node_float(kg, &offset);
config_y[6] = data.x;
config_y[7] = data.y;
config_y[8] = data.z;
config_z[0] = data.w;
- data = read_node_float(kg, offset);
+ data = read_node_float(kg, &offset);
config_z[1] = data.x;
config_z[2] = data.y;
config_z[3] = data.z;
config_z[4] = data.w;
- data = read_node_float(kg, offset);
+ data = read_node_float(kg, &offset);
config_z[5] = data.x;
config_z[6] = data.y;
config_z[7] = data.z;
@@ -305,19 +305,19 @@ ccl_device void svm_node_tex_sky(
/* Define variables */
float nishita_data[10];
- float4 data = read_node_float(kg, offset);
+ float4 data = read_node_float(kg, &offset);
nishita_data[0] = data.x;
nishita_data[1] = data.y;
nishita_data[2] = data.z;
nishita_data[3] = data.w;
- data = read_node_float(kg, offset);
+ data = read_node_float(kg, &offset);
nishita_data[4] = data.x;
nishita_data[5] = data.y;
nishita_data[6] = data.z;
nishita_data[7] = data.w;
- data = read_node_float(kg, offset);
+ data = read_node_float(kg, &offset);
nishita_data[8] = data.x;
nishita_data[9] = data.y;
uint texture_id = __float_as_uint(data.z);
@@ -327,6 +327,7 @@ ccl_device void svm_node_tex_sky(
}
stack_store_float3(stack, out_offset, f);
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_tex_coord.h b/intern/cycles/kernel/svm/svm_tex_coord.h
index 46600551cc4..a35253080da 100644
--- a/intern/cycles/kernel/svm/svm_tex_coord.h
+++ b/intern/cycles/kernel/svm/svm_tex_coord.h
@@ -14,12 +14,16 @@
* limitations under the License.
*/
+#include "kernel/geom/geom.h"
+#include "kernel/kernel_camera.h"
+#include "kernel/kernel_montecarlo.h"
+
CCL_NAMESPACE_BEGIN
/* Texture Coordinate Node */
-ccl_device void svm_node_tex_coord(
- KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_tex_coord(
+ const KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint4 node, int offset)
{
float3 data;
uint type = node.y;
@@ -35,9 +39,9 @@ ccl_device void svm_node_tex_coord(
}
else {
Transform tfm;
- tfm.x = read_node_float(kg, offset);
- tfm.y = read_node_float(kg, offset);
- tfm.z = read_node_float(kg, offset);
+ tfm.x = read_node_float(kg, &offset);
+ tfm.y = read_node_float(kg, &offset);
+ tfm.z = read_node_float(kg, &offset);
data = transform_point(&tfm, data);
}
break;
@@ -92,10 +96,11 @@ ccl_device void svm_node_tex_coord(
}
stack_store_float3(stack, out_offset, data);
+ return offset;
}
-ccl_device void svm_node_tex_coord_bump_dx(
- KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_tex_coord_bump_dx(
+ const KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint4 node, int offset)
{
#ifdef __RAY_DIFFERENTIALS__
float3 data;
@@ -112,9 +117,9 @@ ccl_device void svm_node_tex_coord_bump_dx(
}
else {
Transform tfm;
- tfm.x = read_node_float(kg, offset);
- tfm.y = read_node_float(kg, offset);
- tfm.z = read_node_float(kg, offset);
+ tfm.x = read_node_float(kg, &offset);
+ tfm.y = read_node_float(kg, &offset);
+ tfm.z = read_node_float(kg, &offset);
data = transform_point(&tfm, data);
}
break;
@@ -136,7 +141,7 @@ ccl_device void svm_node_tex_coord_bump_dx(
case NODE_TEXCO_WINDOW: {
if ((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE &&
kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
- data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dx);
+ data = camera_world_to_ndc(kg, sd, sd->ray_P + make_float3(sd->ray_dP, 0.0f, 0.0f));
else
data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dx);
data.z = 0.0f;
@@ -169,13 +174,14 @@ ccl_device void svm_node_tex_coord_bump_dx(
}
stack_store_float3(stack, out_offset, data);
+ return offset;
#else
- svm_node_tex_coord(kg, sd, path_flag, stack, node, offset);
+ return svm_node_tex_coord(kg, sd, path_flag, stack, node, offset);
#endif
}
-ccl_device void svm_node_tex_coord_bump_dy(
- KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_tex_coord_bump_dy(
+ const KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint4 node, int offset)
{
#ifdef __RAY_DIFFERENTIALS__
float3 data;
@@ -192,9 +198,9 @@ ccl_device void svm_node_tex_coord_bump_dy(
}
else {
Transform tfm;
- tfm.x = read_node_float(kg, offset);
- tfm.y = read_node_float(kg, offset);
- tfm.z = read_node_float(kg, offset);
+ tfm.x = read_node_float(kg, &offset);
+ tfm.y = read_node_float(kg, &offset);
+ tfm.z = read_node_float(kg, &offset);
data = transform_point(&tfm, data);
}
break;
@@ -216,7 +222,7 @@ ccl_device void svm_node_tex_coord_bump_dy(
case NODE_TEXCO_WINDOW: {
if ((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE &&
kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
- data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dy);
+ data = camera_world_to_ndc(kg, sd, sd->ray_P + make_float3(0.0f, sd->ray_dP, 0.0f));
else
data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dy);
data.z = 0.0f;
@@ -249,12 +255,16 @@ ccl_device void svm_node_tex_coord_bump_dy(
}
stack_store_float3(stack, out_offset, data);
+ return offset;
#else
- svm_node_tex_coord(kg, sd, path_flag, stack, node, offset);
+ return svm_node_tex_coord(kg, sd, path_flag, stack, node, offset);
#endif
}
-ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_normal_map(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint color_offset, strength_offset, normal_offset, space;
svm_unpack_node_uchar4(node.y, &color_offset, &strength_offset, &normal_offset, &space);
@@ -346,7 +356,10 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st
stack_store_float3(stack, normal_offset, N);
}
-ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_tangent(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint tangent_offset, direction_type, axis;
svm_unpack_node_uchar3(node.y, &tangent_offset, &direction_type, &axis);
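
A minimal sketch (not part of the patch) of what the calling side looks like under the new convention: the node functions above now take the offset by value, consume their extra uint4 entries themselves, and hand the advanced offset back to the interpreter. The loop below, including the name `svm_eval_nodes_sketch` and the entry-point computation, is an assumption for illustration only.

    ccl_device void svm_eval_nodes_sketch(const KernelGlobals *kg, ShaderData *sd, int path_flag)
    {
      float stack[SVM_STACK_SIZE];
      int offset = sd->shader & SHADER_MASK; /* assumed entry point into the node array */

      while (true) {
        uint4 node = read_node(kg, &offset);

        switch (node.x) {
          case NODE_TEX_COORD:
            /* Previously: svm_node_tex_coord(kg, sd, path_flag, stack, node, &offset);
             * now the node returns the offset it advanced to. */
            offset = svm_node_tex_coord(kg, sd, path_flag, stack, node, offset);
            break;
          case NODE_END:
            return;
          default:
            break;
        }
      }
    }
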
diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h
index 062afcfa5ac..c053be96c51 100644
--- a/intern/cycles/kernel/svm/svm_types.h
+++ b/intern/cycles/kernel/svm/svm_types.h
@@ -30,37 +30,6 @@ CCL_NAMESPACE_BEGIN
/* Nodes */
-/* Known frequencies of used nodes, used for selective nodes compilation
- * in the kernel. Currently only affects split OpenCL kernel.
- *
- * Keep as defines so it's easy to check which nodes are to be compiled
- * from preprocessor.
- *
- * Lower the number of group more often the node is used.
- */
-#define NODE_GROUP_LEVEL_0 0
-#define NODE_GROUP_LEVEL_1 1
-#define NODE_GROUP_LEVEL_2 2
-#define NODE_GROUP_LEVEL_3 3
-#define NODE_GROUP_LEVEL_4 4
-#define NODE_GROUP_LEVEL_MAX NODE_GROUP_LEVEL_4
-
-#define NODE_FEATURE_VOLUME (1 << 0)
-#define NODE_FEATURE_HAIR (1 << 1)
-#define NODE_FEATURE_BUMP (1 << 2)
-#define NODE_FEATURE_BUMP_STATE (1 << 3)
-#define NODE_FEATURE_VORONOI_EXTRA (1 << 4)
-/* TODO(sergey): Consider using something like ((uint)(-1)).
- * Need to check carefully operand types around usage of this
- * define first.
- */
-#define NODE_FEATURE_ALL \
- (NODE_FEATURE_VOLUME | NODE_FEATURE_HAIR | NODE_FEATURE_BUMP | NODE_FEATURE_BUMP_STATE | \
- NODE_FEATURE_VORONOI_EXTRA)
-
-#define NODES_GROUP(group) ((group) <= __NODES_MAX_GROUP__)
-#define NODES_FEATURE(feature) ((__NODES_FEATURES__ & (feature)) != 0)
-
typedef enum ShaderNodeType {
NODE_END = 0,
NODE_SHADER_JUMP,
@@ -572,12 +541,8 @@ typedef enum ClosureType {
CLOSURE_BSDF_TRANSPARENT_ID,
/* BSSRDF */
- CLOSURE_BSSRDF_CUBIC_ID,
- CLOSURE_BSSRDF_GAUSSIAN_ID,
- CLOSURE_BSSRDF_PRINCIPLED_ID,
- CLOSURE_BSSRDF_BURLEY_ID,
CLOSURE_BSSRDF_RANDOM_WALK_ID,
- CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID,
+ CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID,
/* Other */
CLOSURE_HOLDOUT_ID,
@@ -620,11 +585,9 @@ typedef enum ClosureType {
type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID || \
type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID || \
type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID)
-#define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID)
+#define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID)
#define CLOSURE_IS_BSSRDF(type) \
- (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID)
-#define CLOSURE_IS_DISK_BSSRDF(type) \
- (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_BURLEY_ID)
+ (type >= CLOSURE_BSSRDF_RANDOM_WALK_ID && type <= CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID)
#define CLOSURE_IS_VOLUME(type) \
(type >= CLOSURE_VOLUME_ID && type <= CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
#define CLOSURE_IS_VOLUME_SCATTER(type) (type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
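
With the cubic, Gaussian, Burley and disk-based principled IDs removed above, only the two random-walk BSSRDF entries remain and the range macros shrink accordingly. A small illustrative helper, equivalent to the updated `CLOSURE_IS_BSSRDF()` (not part of the patch):

    ccl_device_inline bool closure_is_subsurface(ClosureType type)
    {
      /* The disk-based CLOSURE_IS_DISK_BSSRDF() check is gone; only the
       * random-walk variants are valid subsurface closures now. */
      return type == CLOSURE_BSSRDF_RANDOM_WALK_ID ||
             type == CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID;
    }
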
diff --git a/intern/cycles/kernel/svm/svm_value.h b/intern/cycles/kernel/svm/svm_value.h
index 5b76f2c8832..d0478660094 100644
--- a/intern/cycles/kernel/svm/svm_value.h
+++ b/intern/cycles/kernel/svm/svm_value.h
@@ -19,20 +19,21 @@ CCL_NAMESPACE_BEGIN
/* Value Nodes */
ccl_device void svm_node_value_f(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint ivalue, uint out_offset)
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint ivalue, uint out_offset)
{
stack_store_float(stack, out_offset, __uint_as_float(ivalue));
}
-ccl_device void svm_node_value_v(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint out_offset, int *offset)
+ccl_device int svm_node_value_v(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint out_offset, int offset)
{
/* read extra data */
- uint4 node1 = read_node(kg, offset);
+ uint4 node1 = read_node(kg, &offset);
float3 p = make_float3(
__uint_as_float(node1.y), __uint_as_float(node1.z), __uint_as_float(node1.w));
stack_store_float3(stack, out_offset, p);
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_vector_rotate.h b/intern/cycles/kernel/svm/svm_vector_rotate.h
index 50045752484..55e1bce0158 100644
--- a/intern/cycles/kernel/svm/svm_vector_rotate.h
+++ b/intern/cycles/kernel/svm/svm_vector_rotate.h
@@ -18,11 +18,11 @@ CCL_NAMESPACE_BEGIN
/* Vector Rotate */
-ccl_device void svm_node_vector_rotate(ShaderData *sd,
- float *stack,
- uint input_stack_offsets,
- uint axis_stack_offsets,
- uint result_stack_offset)
+ccl_device_noinline void svm_node_vector_rotate(ShaderData *sd,
+ float *stack,
+ uint input_stack_offsets,
+ uint axis_stack_offsets,
+ uint result_stack_offset)
{
uint type, vector_stack_offset, rotation_stack_offset, center_stack_offset, axis_stack_offset,
angle_stack_offset, invert;
diff --git a/intern/cycles/kernel/svm/svm_vector_transform.h b/intern/cycles/kernel/svm/svm_vector_transform.h
index 1e95492cf1b..8aedb7e0f54 100644
--- a/intern/cycles/kernel/svm/svm_vector_transform.h
+++ b/intern/cycles/kernel/svm/svm_vector_transform.h
@@ -18,10 +18,10 @@ CCL_NAMESPACE_BEGIN
/* Vector Transform */
-ccl_device void svm_node_vector_transform(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint4 node)
+ccl_device_noinline void svm_node_vector_transform(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint itype, ifrom, ito;
uint vector_in, vector_out;
diff --git a/intern/cycles/kernel/svm/svm_vertex_color.h b/intern/cycles/kernel/svm/svm_vertex_color.h
index 0aa45835522..986ea244f3a 100644
--- a/intern/cycles/kernel/svm/svm_vertex_color.h
+++ b/intern/cycles/kernel/svm/svm_vertex_color.h
@@ -16,12 +16,12 @@
CCL_NAMESPACE_BEGIN
-ccl_device void svm_node_vertex_color(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint layer_id,
- uint color_offset,
- uint alpha_offset)
+ccl_device_noinline void svm_node_vertex_color(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint layer_id,
+ uint color_offset,
+ uint alpha_offset)
{
AttributeDescriptor descriptor = find_attribute(kg, sd, layer_id);
if (descriptor.offset != ATTR_STD_NOT_FOUND) {
@@ -35,18 +35,12 @@ ccl_device void svm_node_vertex_color(KernelGlobals *kg,
}
}
-#ifndef __KERNEL_CUDA__
-ccl_device
-#else
-ccl_device_noinline
-#endif
- void
- svm_node_vertex_color_bump_dx(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint layer_id,
- uint color_offset,
- uint alpha_offset)
+ccl_device_noinline void svm_node_vertex_color_bump_dx(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint layer_id,
+ uint color_offset,
+ uint alpha_offset)
{
AttributeDescriptor descriptor = find_attribute(kg, sd, layer_id);
if (descriptor.offset != ATTR_STD_NOT_FOUND) {
@@ -62,18 +56,12 @@ ccl_device_noinline
}
}
-#ifndef __KERNEL_CUDA__
-ccl_device
-#else
-ccl_device_noinline
-#endif
- void
- svm_node_vertex_color_bump_dy(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint layer_id,
- uint color_offset,
- uint alpha_offset)
+ccl_device_noinline void svm_node_vertex_color_bump_dy(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint layer_id,
+ uint color_offset,
+ uint alpha_offset)
{
AttributeDescriptor descriptor = find_attribute(kg, sd, layer_id);
if (descriptor.offset != ATTR_STD_NOT_FOUND) {
diff --git a/intern/cycles/kernel/svm/svm_voronoi.h b/intern/cycles/kernel/svm/svm_voronoi.h
index d0e7db35fab..b1d2eff7f37 100644
--- a/intern/cycles/kernel/svm/svm_voronoi.h
+++ b/intern/cycles/kernel/svm/svm_voronoi.h
@@ -902,16 +902,17 @@ ccl_device void voronoi_n_sphere_radius_4d(float4 coord, float randomness, float
*outRadius = distance(closestPointToClosestPoint, closestPoint) / 2.0f;
}
-ccl_device void svm_node_tex_voronoi(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint dimensions,
- uint feature,
- uint metric,
- int *offset)
+template<uint node_feature_mask>
+ccl_device_noinline int svm_node_tex_voronoi(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint dimensions,
+ uint feature,
+ uint metric,
+ int offset)
{
- uint4 stack_offsets = read_node(kg, offset);
- uint4 defaults = read_node(kg, offset);
+ uint4 stack_offsets = read_node(kg, &offset);
+ uint4 defaults = read_node(kg, &offset);
uint coord_stack_offset, w_stack_offset, scale_stack_offset, smoothness_stack_offset;
uint exponent_stack_offset, randomness_stack_offset, distance_out_stack_offset,
@@ -997,18 +998,18 @@ ccl_device void svm_node_tex_voronoi(KernelGlobals *kg,
&color_out,
&position_out_2d);
break;
-#if NODES_FEATURE(NODE_FEATURE_VORONOI_EXTRA)
case NODE_VORONOI_SMOOTH_F1:
- voronoi_smooth_f1_2d(coord_2d,
- smoothness,
- exponent,
- randomness,
- voronoi_metric,
- &distance_out,
- &color_out,
- &position_out_2d);
+ if (KERNEL_NODES_FEATURE(VORONOI_EXTRA)) {
+ voronoi_smooth_f1_2d(coord_2d,
+ smoothness,
+ exponent,
+ randomness,
+ voronoi_metric,
+ &distance_out,
+ &color_out,
+ &position_out_2d);
+ }
break;
-#endif
case NODE_VORONOI_F2:
voronoi_f2_2d(coord_2d,
exponent,
@@ -1042,18 +1043,18 @@ ccl_device void svm_node_tex_voronoi(KernelGlobals *kg,
&color_out,
&position_out);
break;
-#if NODES_FEATURE(NODE_FEATURE_VORONOI_EXTRA)
case NODE_VORONOI_SMOOTH_F1:
- voronoi_smooth_f1_3d(coord,
- smoothness,
- exponent,
- randomness,
- voronoi_metric,
- &distance_out,
- &color_out,
- &position_out);
+ if (KERNEL_NODES_FEATURE(VORONOI_EXTRA)) {
+ voronoi_smooth_f1_3d(coord,
+ smoothness,
+ exponent,
+ randomness,
+ voronoi_metric,
+ &distance_out,
+ &color_out,
+ &position_out);
+ }
break;
-#endif
case NODE_VORONOI_F2:
voronoi_f2_3d(coord,
exponent,
@@ -1076,54 +1077,54 @@ ccl_device void svm_node_tex_voronoi(KernelGlobals *kg,
break;
}
-#if NODES_FEATURE(NODE_FEATURE_VORONOI_EXTRA)
case 4: {
- float4 coord_4d = make_float4(coord.x, coord.y, coord.z, w);
- float4 position_out_4d;
- switch (voronoi_feature) {
- case NODE_VORONOI_F1:
- voronoi_f1_4d(coord_4d,
- exponent,
- randomness,
- voronoi_metric,
- &distance_out,
- &color_out,
- &position_out_4d);
- break;
- case NODE_VORONOI_SMOOTH_F1:
- voronoi_smooth_f1_4d(coord_4d,
- smoothness,
- exponent,
- randomness,
- voronoi_metric,
- &distance_out,
- &color_out,
- &position_out_4d);
- break;
- case NODE_VORONOI_F2:
- voronoi_f2_4d(coord_4d,
- exponent,
- randomness,
- voronoi_metric,
- &distance_out,
- &color_out,
- &position_out_4d);
- break;
- case NODE_VORONOI_DISTANCE_TO_EDGE:
- voronoi_distance_to_edge_4d(coord_4d, randomness, &distance_out);
- break;
- case NODE_VORONOI_N_SPHERE_RADIUS:
- voronoi_n_sphere_radius_4d(coord_4d, randomness, &radius_out);
- break;
- default:
- kernel_assert(0);
+ if (KERNEL_NODES_FEATURE(VORONOI_EXTRA)) {
+ float4 coord_4d = make_float4(coord.x, coord.y, coord.z, w);
+ float4 position_out_4d;
+ switch (voronoi_feature) {
+ case NODE_VORONOI_F1:
+ voronoi_f1_4d(coord_4d,
+ exponent,
+ randomness,
+ voronoi_metric,
+ &distance_out,
+ &color_out,
+ &position_out_4d);
+ break;
+ case NODE_VORONOI_SMOOTH_F1:
+ voronoi_smooth_f1_4d(coord_4d,
+ smoothness,
+ exponent,
+ randomness,
+ voronoi_metric,
+ &distance_out,
+ &color_out,
+ &position_out_4d);
+ break;
+ case NODE_VORONOI_F2:
+ voronoi_f2_4d(coord_4d,
+ exponent,
+ randomness,
+ voronoi_metric,
+ &distance_out,
+ &color_out,
+ &position_out_4d);
+ break;
+ case NODE_VORONOI_DISTANCE_TO_EDGE:
+ voronoi_distance_to_edge_4d(coord_4d, randomness, &distance_out);
+ break;
+ case NODE_VORONOI_N_SPHERE_RADIUS:
+ voronoi_n_sphere_radius_4d(coord_4d, randomness, &radius_out);
+ break;
+ default:
+ kernel_assert(0);
+ }
+ position_out_4d = safe_divide_float4_float(position_out_4d, scale);
+ position_out = make_float3(position_out_4d.x, position_out_4d.y, position_out_4d.z);
+ w_out = position_out_4d.w;
}
- position_out_4d = safe_divide_float4_float(position_out_4d, scale);
- position_out = make_float3(position_out_4d.x, position_out_4d.y, position_out_4d.z);
- w_out = position_out_4d.w;
break;
}
-#endif
default:
kernel_assert(0);
}
@@ -1138,6 +1139,7 @@ ccl_device void svm_node_tex_voronoi(KernelGlobals *kg,
stack_store_float(stack, w_out_stack_offset, w_out);
if (stack_valid(radius_out_stack_offset))
stack_store_float(stack, radius_out_stack_offset, radius_out);
+ return offset;
}
CCL_NAMESPACE_END
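
The Voronoi hunks above replace the compile-time `#if NODES_FEATURE(NODE_FEATURE_VORONOI_EXTRA)` blocks with a `node_feature_mask` template parameter tested through `KERNEL_NODES_FEATURE()`. That macro is not shown in this patch; a plausible shape, stated here only as an assumption, is a bit test against the template argument so the branch still folds away when the feature is compiled out:

    /* Assumed definition, matching how node_feature_mask is used above. */
    #define KERNEL_NODES_FEATURE(feature) \
      ((node_feature_mask & KERNEL_FEATURE_NODE_##feature) != 0)

Because `node_feature_mask` is a compile-time template argument, kernels built without the Voronoi-extra feature bit (name assumed) still drop the smooth-F1 and 4D code paths, which is the same effect the removed preprocessor guards had.
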
diff --git a/intern/cycles/kernel/svm/svm_voxel.h b/intern/cycles/kernel/svm/svm_voxel.h
index 4bc14f82382..78b75405356 100644
--- a/intern/cycles/kernel/svm/svm_voxel.h
+++ b/intern/cycles/kernel/svm/svm_voxel.h
@@ -19,8 +19,8 @@ CCL_NAMESPACE_BEGIN
/* TODO(sergey): Think of making it more generic volume-type attribute
* sampler.
*/
-ccl_device void svm_node_tex_voxel(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_tex_voxel(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset)
{
uint co_offset, density_out_offset, color_out_offset, space;
svm_unpack_node_uchar4(node.z, &co_offset, &density_out_offset, &color_out_offset, &space);
@@ -33,9 +33,9 @@ ccl_device void svm_node_tex_voxel(
else {
kernel_assert(space == NODE_TEX_VOXEL_SPACE_WORLD);
Transform tfm;
- tfm.x = read_node_float(kg, offset);
- tfm.y = read_node_float(kg, offset);
- tfm.z = read_node_float(kg, offset);
+ tfm.x = read_node_float(kg, &offset);
+ tfm.y = read_node_float(kg, &offset);
+ tfm.z = read_node_float(kg, &offset);
co = transform_point(&tfm, co);
}
@@ -47,6 +47,7 @@ ccl_device void svm_node_tex_voxel(
stack_store_float(stack, density_out_offset, r.w);
if (stack_valid(color_out_offset))
stack_store_float3(stack, color_out_offset, make_float3(r.x, r.y, r.z));
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_wave.h b/intern/cycles/kernel/svm/svm_wave.h
index c4763475b47..00f980c16df 100644
--- a/intern/cycles/kernel/svm/svm_wave.h
+++ b/intern/cycles/kernel/svm/svm_wave.h
@@ -82,11 +82,11 @@ ccl_device_noinline_cpu float svm_wave(NodeWaveType type,
}
}
-ccl_device void svm_node_tex_wave(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_tex_wave(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset)
{
- uint4 node2 = read_node(kg, offset);
- uint4 node3 = read_node(kg, offset);
+ uint4 node2 = read_node(kg, &offset);
+ uint4 node3 = read_node(kg, &offset);
/* RNA properties */
uint type_offset, bands_dir_offset, rings_dir_offset, profile_offset;
@@ -125,6 +125,7 @@ ccl_device void svm_node_tex_wave(
stack_store_float(stack, fac_offset, f);
if (stack_valid(color_offset))
stack_store_float3(stack, color_offset, make_float3(f, f, f));
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_wavelength.h b/intern/cycles/kernel/svm/svm_wavelength.h
index d6144802559..fba8aa63d31 100644
--- a/intern/cycles/kernel/svm/svm_wavelength.h
+++ b/intern/cycles/kernel/svm/svm_wavelength.h
@@ -69,8 +69,8 @@ ccl_static_constant float cie_colour_match[81][3] = {
{0.0002f, 0.0001f, 0.0000f}, {0.0002f, 0.0001f, 0.0000f}, {0.0001f, 0.0000f, 0.0000f},
{0.0001f, 0.0000f, 0.0000f}, {0.0001f, 0.0000f, 0.0000f}, {0.0000f, 0.0000f, 0.0000f}};
-ccl_device void svm_node_wavelength(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint wavelength, uint color_out)
+ccl_device_noinline void svm_node_wavelength(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint wavelength, uint color_out)
{
float lambda_nm = stack_load_float(stack, wavelength);
float ii = (lambda_nm - 380.0f) * (1.0f / 5.0f); // scaled 0..80
diff --git a/intern/cycles/kernel/svm/svm_white_noise.h b/intern/cycles/kernel/svm/svm_white_noise.h
index b30d85acaec..0306d2e7b9c 100644
--- a/intern/cycles/kernel/svm/svm_white_noise.h
+++ b/intern/cycles/kernel/svm/svm_white_noise.h
@@ -16,13 +16,12 @@
CCL_NAMESPACE_BEGIN
-ccl_device void svm_node_tex_white_noise(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint dimensions,
- uint inputs_stack_offsets,
- uint ouptuts_stack_offsets,
- int *offset)
+ccl_device_noinline void svm_node_tex_white_noise(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint dimensions,
+ uint inputs_stack_offsets,
+ uint ouptuts_stack_offsets)
{
uint vector_stack_offset, w_stack_offset, value_stack_offset, color_stack_offset;
svm_unpack_node_uchar2(inputs_stack_offsets, &vector_stack_offset, &w_stack_offset);
diff --git a/intern/cycles/kernel/svm/svm_wireframe.h b/intern/cycles/kernel/svm/svm_wireframe.h
index 49158bd86d5..7ec913789d2 100644
--- a/intern/cycles/kernel/svm/svm_wireframe.h
+++ b/intern/cycles/kernel/svm/svm_wireframe.h
@@ -35,7 +35,7 @@ CCL_NAMESPACE_BEGIN
/* Wireframe Node */
ccl_device_inline float wireframe(
- KernelGlobals *kg, ShaderData *sd, float size, int pixel_size, float3 *P)
+ const KernelGlobals *kg, ShaderData *sd, float size, int pixel_size, float3 *P)
{
#ifdef __HAIR__
if (sd->prim != PRIM_NONE && sd->type & PRIMITIVE_ALL_TRIANGLE)
@@ -88,7 +88,10 @@ ccl_device_inline float wireframe(
return 0.0f;
}
-ccl_device void svm_node_wireframe(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_wireframe(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint in_size = node.y;
uint out_fac = node.z;
@@ -100,18 +103,7 @@ ccl_device void svm_node_wireframe(KernelGlobals *kg, ShaderData *sd, float *sta
int pixel_size = (int)use_pixel_size;
/* Calculate wireframe */
-#ifdef __SPLIT_KERNEL__
- /* TODO(sergey): This is because sd is actually a global space,
- * which makes it difficult to re-use same wireframe() function.
- *
- * With OpenCL 2.0 it's possible to avoid this change, but for until
- * then we'll be living with such an exception.
- */
- float3 P = sd->P;
- float f = wireframe(kg, sd, size, pixel_size, &P);
-#else
float f = wireframe(kg, sd, size, pixel_size, &sd->P);
-#endif
/* TODO(sergey): Think of faster way to calculate derivatives. */
if (bump_offset == NODE_BUMP_OFFSET_DX) {
diff --git a/intern/cycles/render/CMakeLists.txt b/intern/cycles/render/CMakeLists.txt
index feead27c5ca..6edb5261b32 100644
--- a/intern/cycles/render/CMakeLists.txt
+++ b/intern/cycles/render/CMakeLists.txt
@@ -32,10 +32,10 @@ set(SRC
camera.cpp
colorspace.cpp
constant_fold.cpp
- coverage.cpp
denoising.cpp
film.cpp
geometry.cpp
+ gpu_display.cpp
graph.cpp
hair.cpp
image.cpp
@@ -54,6 +54,7 @@ set(SRC
object.cpp
osl.cpp
particles.cpp
+ pass.cpp
curves.cpp
scene.cpp
session.cpp
@@ -76,10 +77,10 @@ set(SRC_HEADERS
camera.h
colorspace.h
constant_fold.h
- coverage.h
denoising.h
film.h
geometry.h
+ gpu_display.h
graph.h
hair.h
image.h
@@ -95,6 +96,7 @@ set(SRC_HEADERS
object.h
osl.h
particles.h
+ pass.h
procedural.h
curves.h
scene.h
@@ -111,6 +113,7 @@ set(SRC_HEADERS
set(LIB
cycles_bvh
cycles_device
+ cycles_integrator
cycles_subd
cycles_util
)
diff --git a/intern/cycles/render/background.cpp b/intern/cycles/render/background.cpp
index b925e755434..ae6290ac27b 100644
--- a/intern/cycles/render/background.cpp
+++ b/intern/cycles/render/background.cpp
@@ -34,11 +34,7 @@ NODE_DEFINE(Background)
{
NodeType *type = NodeType::add("background", create);
- SOCKET_FLOAT(ao_factor, "AO Factor", 0.0f);
- SOCKET_FLOAT(ao_distance, "AO Distance", FLT_MAX);
-
SOCKET_BOOLEAN(use_shader, "Use Shader", true);
- SOCKET_BOOLEAN(use_ao, "Use AO", false);
SOCKET_UINT(visibility, "Visibility", PATH_RAY_ALL_VISIBILITY);
SOCKET_BOOLEAN(transparent, "Transparent", false);
@@ -80,10 +76,6 @@ void Background::device_update(Device *device, DeviceScene *dscene, Scene *scene
/* set shader index and transparent option */
KernelBackground *kbackground = &dscene->data.background;
- kbackground->ao_factor = (use_ao) ? ao_factor : 0.0f;
- kbackground->ao_bounces_factor = ao_factor;
- kbackground->ao_distance = ao_distance;
-
kbackground->transparent = transparent;
kbackground->surface_shader = scene->shader_manager->get_shader_id(bg_shader);
@@ -138,10 +130,6 @@ void Background::tag_update(Scene *scene)
* and to avoid doing unnecessary updates anywhere else. */
tag_use_shader_modified();
}
-
- if (ao_factor_is_modified() || use_ao_is_modified()) {
- scene->integrator->tag_update(scene, Integrator::BACKGROUND_AO_MODIFIED);
- }
}
Shader *Background::get_shader(const Scene *scene)
diff --git a/intern/cycles/render/background.h b/intern/cycles/render/background.h
index e89ffbc2445..2f7ef0f7737 100644
--- a/intern/cycles/render/background.h
+++ b/intern/cycles/render/background.h
@@ -32,11 +32,7 @@ class Background : public Node {
public:
NODE_DECLARE
- NODE_SOCKET_API(float, ao_factor)
- NODE_SOCKET_API(float, ao_distance)
-
NODE_SOCKET_API(bool, use_shader)
- NODE_SOCKET_API(bool, use_ao)
NODE_SOCKET_API(uint, visibility)
NODE_SOCKET_API(Shader *, shader)
diff --git a/intern/cycles/render/bake.cpp b/intern/cycles/render/bake.cpp
index 317a3937cab..54e496caed6 100644
--- a/intern/cycles/render/bake.cpp
+++ b/intern/cycles/render/bake.cpp
@@ -26,58 +26,8 @@
CCL_NAMESPACE_BEGIN
-static int aa_samples(Scene *scene, Object *object, ShaderEvalType type)
-{
- if (type == SHADER_EVAL_UV || type == SHADER_EVAL_ROUGHNESS) {
- return 1;
- }
- else if (type == SHADER_EVAL_NORMAL) {
- /* Only antialias normal if mesh has bump mapping. */
- if (object->get_geometry()) {
- foreach (Node *node, object->get_geometry()->get_used_shaders()) {
- Shader *shader = static_cast<Shader *>(node);
- if (shader->has_bump) {
- return scene->integrator->get_aa_samples();
- }
- }
- }
-
- return 1;
- }
- else {
- return scene->integrator->get_aa_samples();
- }
-}
-
-/* Keep it synced with kernel_bake.h logic */
-static int shader_type_to_pass_filter(ShaderEvalType type, int pass_filter)
-{
- const int component_flags = pass_filter &
- (BAKE_FILTER_DIRECT | BAKE_FILTER_INDIRECT | BAKE_FILTER_COLOR);
-
- switch (type) {
- case SHADER_EVAL_AO:
- return BAKE_FILTER_AO;
- case SHADER_EVAL_SHADOW:
- return BAKE_FILTER_DIRECT;
- case SHADER_EVAL_DIFFUSE:
- return BAKE_FILTER_DIFFUSE | component_flags;
- case SHADER_EVAL_GLOSSY:
- return BAKE_FILTER_GLOSSY | component_flags;
- case SHADER_EVAL_TRANSMISSION:
- return BAKE_FILTER_TRANSMISSION | component_flags;
- case SHADER_EVAL_COMBINED:
- return pass_filter;
- default:
- return 0;
- }
-}
-
BakeManager::BakeManager()
{
- type = SHADER_EVAL_BAKE;
- pass_filter = 0;
-
need_update_ = true;
}
@@ -85,32 +35,14 @@ BakeManager::~BakeManager()
{
}
-bool BakeManager::get_baking()
+bool BakeManager::get_baking() const
{
return !object_name.empty();
}
-void BakeManager::set(Scene *scene,
- const std::string &object_name_,
- ShaderEvalType type_,
- int pass_filter_)
+void BakeManager::set(Scene *scene, const std::string &object_name_)
{
object_name = object_name_;
- type = type_;
- pass_filter = shader_type_to_pass_filter(type_, pass_filter_);
-
- Pass::add(PASS_BAKE_PRIMITIVE, scene->passes);
- Pass::add(PASS_BAKE_DIFFERENTIAL, scene->passes);
-
- if (type == SHADER_EVAL_UV) {
- /* force UV to be available */
- Pass::add(PASS_UV, scene->passes);
- }
-
- /* force use_light_pass to be true if we bake more than just colors */
- if (pass_filter & ~BAKE_FILTER_COLOR) {
- Pass::add(PASS_LIGHT, scene->passes);
- }
/* create device and update scene */
scene->film->tag_modified();
@@ -127,29 +59,29 @@ void BakeManager::device_update(Device * /*device*/,
if (!need_update())
return;
- scoped_callback_timer timer([scene](double time) {
- if (scene->update_stats) {
- scene->update_stats->bake.times.add_entry({"device_update", time});
- }
- });
-
- KernelIntegrator *kintegrator = &dscene->data.integrator;
KernelBake *kbake = &dscene->data.bake;
+ memset(kbake, 0, sizeof(*kbake));
- kbake->type = type;
- kbake->pass_filter = pass_filter;
-
- int object_index = 0;
- foreach (Object *object, scene->objects) {
- const Geometry *geom = object->get_geometry();
- if (object->name == object_name && geom->geometry_type == Geometry::MESH) {
- kbake->object_index = object_index;
- kbake->tri_offset = geom->prim_offset;
- kintegrator->aa_samples = aa_samples(scene, object, type);
- break;
- }
+ if (!object_name.empty()) {
+ scoped_callback_timer timer([scene](double time) {
+ if (scene->update_stats) {
+ scene->update_stats->bake.times.add_entry({"device_update", time});
+ }
+ });
+
+ kbake->use = true;
- object_index++;
+ int object_index = 0;
+ foreach (Object *object, scene->objects) {
+ const Geometry *geom = object->get_geometry();
+ if (object->name == object_name && geom->geometry_type == Geometry::MESH) {
+ kbake->object_index = object_index;
+ kbake->tri_offset = geom->prim_offset;
+ break;
+ }
+
+ object_index++;
+ }
}
need_update_ = false;
diff --git a/intern/cycles/render/bake.h b/intern/cycles/render/bake.h
index 655b9b1cf7e..39e504490c2 100644
--- a/intern/cycles/render/bake.h
+++ b/intern/cycles/render/bake.h
@@ -30,8 +30,8 @@ class BakeManager {
BakeManager();
~BakeManager();
- void set(Scene *scene, const std::string &object_name, ShaderEvalType type, int pass_filter);
- bool get_baking();
+ void set(Scene *scene, const std::string &object_name);
+ bool get_baking() const;
void device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress &progress);
void device_free(Device *device, DeviceScene *dscene);
@@ -42,8 +42,6 @@ class BakeManager {
private:
bool need_update_;
- ShaderEvalType type;
- int pass_filter;
std::string object_name;
};
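
The simplified interface above drops the shader evaluation type and pass filter entirely; baking is now identified only by the object name, and the per-type pass setup moved out of `BakeManager`. A hedged usage sketch (the call site and the `scene->bake_manager` access pattern are assumptions):

    /* Old call site:
     *   scene->bake_manager->set(scene, object_name, SHADER_EVAL_COMBINED, pass_filter);
     * New call site: */
    scene->bake_manager->set(scene, object_name);

    if (scene->bake_manager->get_baking()) {
      /* kernel_data.bake.use and the object/tri offsets are filled in later,
       * inside BakeManager::device_update() shown in the bake.cpp hunks above. */
    }
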
diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp
index fcfad58995e..1882510cd70 100644
--- a/intern/cycles/render/buffers.cpp
+++ b/intern/cycles/render/buffers.cpp
@@ -28,537 +28,335 @@
CCL_NAMESPACE_BEGIN
-/* Buffer Params */
+/* --------------------------------------------------------------------
+ * Convert part information to an index of `BufferParams::pass_offset_`.
+ */
-BufferParams::BufferParams()
+static int pass_type_mode_to_index(PassType pass_type, PassMode mode)
{
- width = 0;
- height = 0;
-
- full_x = 0;
- full_y = 0;
- full_width = 0;
- full_height = 0;
+ int index = static_cast<int>(pass_type) * 2;
- denoising_data_pass = false;
- denoising_clean_pass = false;
- denoising_prefiltered_pass = false;
+ if (mode == PassMode::DENOISED) {
+ ++index;
+ }
- Pass::add(PASS_COMBINED, passes);
+ return index;
}
-void BufferParams::get_offset_stride(int &offset, int &stride)
+static int pass_to_index(const BufferPass &pass)
{
- offset = -(full_x + full_y * width);
- stride = width;
+ return pass_type_mode_to_index(pass.type, pass.mode);
}
-bool BufferParams::modified(const BufferParams &params)
-{
- return !(full_x == params.full_x && full_y == params.full_y && width == params.width &&
- height == params.height && full_width == params.full_width &&
- full_height == params.full_height && Pass::equals(passes, params.passes) &&
- denoising_data_pass == params.denoising_data_pass &&
- denoising_clean_pass == params.denoising_clean_pass &&
- denoising_prefiltered_pass == params.denoising_prefiltered_pass);
-}
+/* --------------------------------------------------------------------
+ * Buffer pass.
+ */
-int BufferParams::get_passes_size()
+NODE_DEFINE(BufferPass)
{
- int size = 0;
+ NodeType *type = NodeType::add("buffer_pass", create);
- for (size_t i = 0; i < passes.size(); i++)
- size += passes[i].components;
+ const NodeEnum *pass_type_enum = Pass::get_type_enum();
+ const NodeEnum *pass_mode_enum = Pass::get_mode_enum();
- if (denoising_data_pass) {
- size += DENOISING_PASS_SIZE_BASE;
- if (denoising_clean_pass)
- size += DENOISING_PASS_SIZE_CLEAN;
- if (denoising_prefiltered_pass)
- size += DENOISING_PASS_SIZE_PREFILTERED;
- }
+ SOCKET_ENUM(type, "Type", *pass_type_enum, PASS_COMBINED);
+ SOCKET_ENUM(mode, "Mode", *pass_mode_enum, static_cast<int>(PassMode::DENOISED));
+ SOCKET_STRING(name, "Name", ustring());
+ SOCKET_BOOLEAN(include_albedo, "Include Albedo", false);
- return align_up(size, 4);
-}
+ SOCKET_INT(offset, "Offset", -1);
-int BufferParams::get_denoising_offset()
-{
- int offset = 0;
-
- for (size_t i = 0; i < passes.size(); i++)
- offset += passes[i].components;
-
- return offset;
+ return type;
}
-int BufferParams::get_denoising_prefiltered_offset()
+BufferPass::BufferPass() : Node(get_node_type())
{
- assert(denoising_prefiltered_pass);
-
- int offset = get_denoising_offset();
-
- offset += DENOISING_PASS_SIZE_BASE;
- if (denoising_clean_pass) {
- offset += DENOISING_PASS_SIZE_CLEAN;
- }
-
- return offset;
}
-/* Render Buffer Task */
-
-RenderTile::RenderTile()
+BufferPass::BufferPass(const Pass *scene_pass)
+ : Node(get_node_type()),
+ type(scene_pass->get_type()),
+ mode(scene_pass->get_mode()),
+ name(scene_pass->get_name()),
+ include_albedo(scene_pass->get_include_albedo())
{
- x = 0;
- y = 0;
- w = 0;
- h = 0;
-
- sample = 0;
- start_sample = 0;
- num_samples = 0;
- resolution = 0;
-
- offset = 0;
- stride = 0;
-
- buffer = 0;
-
- buffers = NULL;
- stealing_state = NO_STEALING;
}
-/* Render Buffers */
-
-RenderBuffers::RenderBuffers(Device *device)
- : buffer(device, "RenderBuffers", MEM_READ_WRITE),
- map_neighbor_copied(false),
- render_time(0.0f)
+PassInfo BufferPass::get_info() const
{
+ return Pass::get_info(type, include_albedo);
}
-RenderBuffers::~RenderBuffers()
-{
- buffer.free();
-}
+/* --------------------------------------------------------------------
+ * Buffer Params.
+ */
-void RenderBuffers::reset(BufferParams &params_)
+NODE_DEFINE(BufferParams)
{
- params = params_;
-
- /* re-allocate buffer */
- buffer.alloc(params.width * params.get_passes_size(), params.height);
- buffer.zero_to_device();
+ NodeType *type = NodeType::add("buffer_params", create);
+
+ SOCKET_INT(width, "Width", 0);
+ SOCKET_INT(height, "Height", 0);
+
+ SOCKET_INT(full_x, "Full X", 0);
+ SOCKET_INT(full_y, "Full Y", 0);
+ SOCKET_INT(full_width, "Full Width", 0);
+ SOCKET_INT(full_height, "Full Height", 0);
+
+ SOCKET_STRING(layer, "Layer", ustring());
+ SOCKET_STRING(view, "View", ustring());
+ SOCKET_INT(samples, "Samples", 0);
+ SOCKET_FLOAT(exposure, "Exposure", 1.0f);
+ SOCKET_BOOLEAN(use_approximate_shadow_catcher, "Use Approximate Shadow Catcher", false);
+ SOCKET_BOOLEAN(use_transparent_background, "Transparent Background", false);
+
+ /* Notes:
+ * - Skip passes since they do not follow typical container socket definition.
+ * Might look into covering those as a socket in the future.
+ *
+   * - Skip offset, stride, and pass stride since those can be derived from the passes and the
+   *   rest of the sockets. */
+
+ return type;
}
-void RenderBuffers::zero()
+BufferParams::BufferParams() : Node(get_node_type())
{
- buffer.zero_to_device();
+ reset_pass_offset();
}
-bool RenderBuffers::copy_from_device()
+void BufferParams::update_passes()
{
- if (!buffer.device_pointer)
- return false;
-
- buffer.copy_from_device(0, params.width * params.get_passes_size(), params.height);
-
- return true;
-}
-
-bool RenderBuffers::get_denoising_pass_rect(
- int type, float exposure, int sample, int components, float *pixels)
-{
- if (buffer.data() == NULL) {
- return false;
- }
-
- float scale = 1.0f;
- float alpha_scale = 1.0f / sample;
- if (type == DENOISING_PASS_PREFILTERED_COLOR || type == DENOISING_PASS_CLEAN ||
- type == DENOISING_PASS_PREFILTERED_INTENSITY) {
- scale *= exposure;
- }
- else if (type == DENOISING_PASS_PREFILTERED_VARIANCE) {
- scale *= exposure * exposure * (sample - 1);
- }
+ update_offset_stride();
+ reset_pass_offset();
+
+ pass_stride = 0;
+ for (const BufferPass &pass : passes) {
+ if (pass.offset != PASS_UNUSED) {
+ const int index = pass_to_index(pass);
+ if (pass_offset_[index] == PASS_UNUSED) {
+ pass_offset_[index] = pass_stride;
+ }
- int offset;
- if (type == DENOISING_PASS_CLEAN) {
- /* The clean pass isn't changed by prefiltering, so we use the original one there. */
- offset = type + params.get_denoising_offset();
- scale /= sample;
- }
- else if (params.denoising_prefiltered_pass) {
- offset = type + params.get_denoising_prefiltered_offset();
- }
- else {
- switch (type) {
- case DENOISING_PASS_PREFILTERED_DEPTH:
- offset = params.get_denoising_offset() + DENOISING_PASS_DEPTH;
- break;
- case DENOISING_PASS_PREFILTERED_NORMAL:
- offset = params.get_denoising_offset() + DENOISING_PASS_NORMAL;
- break;
- case DENOISING_PASS_PREFILTERED_ALBEDO:
- offset = params.get_denoising_offset() + DENOISING_PASS_ALBEDO;
- break;
- case DENOISING_PASS_PREFILTERED_COLOR:
- /* If we're not saving the prefiltering result, return the original noisy pass. */
- offset = params.get_denoising_offset() + DENOISING_PASS_COLOR;
- break;
- default:
- return false;
+ pass_stride += pass.get_info().num_components;
}
- scale /= sample;
}
+}
- int pass_stride = params.get_passes_size();
- int size = params.width * params.height;
+void BufferParams::update_passes(const vector<Pass *> &scene_passes)
+{
+ passes.clear();
- float *in = buffer.data() + offset;
+ pass_stride = 0;
+ for (const Pass *scene_pass : scene_passes) {
+ BufferPass buffer_pass(scene_pass);
- if (components == 1) {
- for (int i = 0; i < size; i++, in += pass_stride, pixels++) {
- pixels[0] = in[0] * scale;
+ if (scene_pass->is_written()) {
+ buffer_pass.offset = pass_stride;
+ pass_stride += scene_pass->get_info().num_components;
}
- }
- else if (components == 3) {
- for (int i = 0; i < size; i++, in += pass_stride, pixels += 3) {
- pixels[0] = in[0] * scale;
- pixels[1] = in[1] * scale;
- pixels[2] = in[2] * scale;
- }
- }
- else if (components == 4) {
- /* Since the alpha channel is not involved in denoising, output the Combined alpha channel. */
- assert(params.passes[0].type == PASS_COMBINED);
- float *in_combined = buffer.data();
-
- for (int i = 0; i < size; i++, in += pass_stride, in_combined += pass_stride, pixels += 4) {
- float3 val = make_float3(in[0], in[1], in[2]);
- if (type == DENOISING_PASS_PREFILTERED_COLOR && params.denoising_prefiltered_pass) {
- /* Remove highlight compression from the image. */
- val = color_highlight_uncompress(val);
- }
- pixels[0] = val.x * scale;
- pixels[1] = val.y * scale;
- pixels[2] = val.z * scale;
- pixels[3] = saturate(in_combined[3] * alpha_scale);
+ else {
+ buffer_pass.offset = PASS_UNUSED;
}
- }
- else {
- return false;
+
+ passes.emplace_back(std::move(buffer_pass));
}
- return true;
+ update_passes();
}
-bool RenderBuffers::get_pass_rect(
- const string &name, float exposure, int sample, int components, float *pixels)
+void BufferParams::reset_pass_offset()
{
- if (buffer.data() == NULL) {
- return false;
+ for (int i = 0; i < kNumPassOffsets; ++i) {
+ pass_offset_[i] = PASS_UNUSED;
}
+}
- float *sample_count = NULL;
- if (name == "Combined") {
- int sample_offset = 0;
- for (size_t j = 0; j < params.passes.size(); j++) {
- Pass &pass = params.passes[j];
- if (pass.type != PASS_SAMPLE_COUNT) {
- sample_offset += pass.components;
- continue;
- }
- else {
- sample_count = buffer.data() + sample_offset;
- break;
- }
- }
+int BufferParams::get_pass_offset(PassType pass_type, PassMode mode) const
+{
+ if (pass_type == PASS_NONE || pass_type == PASS_UNUSED) {
+ return PASS_UNUSED;
}
- int pass_offset = 0;
-
- for (size_t j = 0; j < params.passes.size(); j++) {
- Pass &pass = params.passes[j];
+ const int index = pass_type_mode_to_index(pass_type, mode);
+ return pass_offset_[index];
+}
- /* Pass is identified by both type and name, multiple of the same type
- * may exist with a different name. */
- if (pass.name != name) {
- pass_offset += pass.components;
- continue;
+const BufferPass *BufferParams::find_pass(string_view name) const
+{
+ for (const BufferPass &pass : passes) {
+ if (pass.name == name) {
+ return &pass;
}
+ }
- PassType type = pass.type;
-
- float *in = buffer.data() + pass_offset;
- int pass_stride = params.get_passes_size();
-
- float scale = (pass.filter) ? 1.0f / (float)sample : 1.0f;
- float scale_exposure = (pass.exposure) ? scale * exposure : scale;
-
- int size = params.width * params.height;
+ return nullptr;
+}
- if (components == 1 && type == PASS_RENDER_TIME) {
- /* Render time is not stored by kernel, but measured per tile. */
- float val = (float)(1000.0 * render_time / (params.width * params.height * sample));
- for (int i = 0; i < size; i++, pixels++) {
- pixels[0] = val;
- }
- }
- else if (components == 1) {
- assert(pass.components == components);
-
- /* Scalar */
- if (type == PASS_DEPTH) {
- for (int i = 0; i < size; i++, in += pass_stride, pixels++) {
- float f = *in;
- pixels[0] = (f == 0.0f) ? 1e10f : f * scale_exposure;
- }
- }
- else if (type == PASS_MIST) {
- for (int i = 0; i < size; i++, in += pass_stride, pixels++) {
- float f = *in;
- pixels[0] = saturate(f * scale_exposure);
- }
- }
- else {
- for (int i = 0; i < size; i++, in += pass_stride, pixels++) {
- float f = *in;
- pixels[0] = f * scale_exposure;
- }
- }
- }
- else if (components == 3) {
- assert(pass.components == 4);
-
- /* RGBA */
- if (type == PASS_SHADOW) {
- for (int i = 0; i < size; i++, in += pass_stride, pixels += 3) {
- float4 f = make_float4(in[0], in[1], in[2], in[3]);
- float invw = (f.w > 0.0f) ? 1.0f / f.w : 1.0f;
-
- pixels[0] = f.x * invw;
- pixels[1] = f.y * invw;
- pixels[2] = f.z * invw;
- }
- }
- else if (pass.divide_type != PASS_NONE) {
- /* RGB lighting passes that need to divide out color */
- pass_offset = 0;
- for (size_t k = 0; k < params.passes.size(); k++) {
- Pass &color_pass = params.passes[k];
- if (color_pass.type == pass.divide_type)
- break;
- pass_offset += color_pass.components;
- }
-
- float *in_divide = buffer.data() + pass_offset;
-
- for (int i = 0; i < size; i++, in += pass_stride, in_divide += pass_stride, pixels += 3) {
- float3 f = make_float3(in[0], in[1], in[2]);
- float3 f_divide = make_float3(in_divide[0], in_divide[1], in_divide[2]);
-
- f = safe_divide_even_color(f * exposure, f_divide);
-
- pixels[0] = f.x;
- pixels[1] = f.y;
- pixels[2] = f.z;
- }
- }
- else {
- /* RGB/vector */
- for (int i = 0; i < size; i++, in += pass_stride, pixels += 3) {
- float3 f = make_float3(in[0], in[1], in[2]);
-
- pixels[0] = f.x * scale_exposure;
- pixels[1] = f.y * scale_exposure;
- pixels[2] = f.z * scale_exposure;
- }
- }
- }
- else if (components == 4) {
- assert(pass.components == components);
-
- /* RGBA */
- if (type == PASS_SHADOW) {
- for (int i = 0; i < size; i++, in += pass_stride, pixels += 4) {
- float4 f = make_float4(in[0], in[1], in[2], in[3]);
- float invw = (f.w > 0.0f) ? 1.0f / f.w : 1.0f;
-
- pixels[0] = f.x * invw;
- pixels[1] = f.y * invw;
- pixels[2] = f.z * invw;
- pixels[3] = 1.0f;
- }
- }
- else if (type == PASS_MOTION) {
- /* need to normalize by number of samples accumulated for motion */
- pass_offset = 0;
- for (size_t k = 0; k < params.passes.size(); k++) {
- Pass &color_pass = params.passes[k];
- if (color_pass.type == PASS_MOTION_WEIGHT)
- break;
- pass_offset += color_pass.components;
- }
-
- float *in_weight = buffer.data() + pass_offset;
-
- for (int i = 0; i < size; i++, in += pass_stride, in_weight += pass_stride, pixels += 4) {
- float4 f = make_float4(in[0], in[1], in[2], in[3]);
- float w = in_weight[0];
- float invw = (w > 0.0f) ? 1.0f / w : 0.0f;
-
- pixels[0] = f.x * invw;
- pixels[1] = f.y * invw;
- pixels[2] = f.z * invw;
- pixels[3] = f.w * invw;
- }
- }
- else if (type == PASS_CRYPTOMATTE) {
- for (int i = 0; i < size; i++, in += pass_stride, pixels += 4) {
- float4 f = make_float4(in[0], in[1], in[2], in[3]);
- /* x and z contain integer IDs, don't rescale them.
- y and w contain matte weights, they get scaled. */
- pixels[0] = f.x;
- pixels[1] = f.y * scale;
- pixels[2] = f.z;
- pixels[3] = f.w * scale;
- }
- }
- else {
- for (int i = 0; i < size; i++, in += pass_stride, pixels += 4) {
- if (sample_count && sample_count[i * pass_stride] < 0.0f) {
- scale = (pass.filter) ? -1.0f / (sample_count[i * pass_stride]) : 1.0f;
- scale_exposure = (pass.exposure) ? scale * exposure : scale;
- }
-
- float4 f = make_float4(in[0], in[1], in[2], in[3]);
-
- pixels[0] = f.x * scale_exposure;
- pixels[1] = f.y * scale_exposure;
- pixels[2] = f.z * scale_exposure;
-
- /* Clamp since alpha might be > 1.0 due to Russian roulette. */
- pixels[3] = saturate(f.w * scale);
- }
- }
+const BufferPass *BufferParams::find_pass(PassType type, PassMode mode) const
+{
+ for (const BufferPass &pass : passes) {
+ if (pass.type == type && pass.mode == mode) {
+ return &pass;
}
-
- return true;
}
- return false;
+ return nullptr;
}
-bool RenderBuffers::set_pass_rect(PassType type, int components, float *pixels, int samples)
+const BufferPass *BufferParams::get_actual_display_pass(PassType type, PassMode mode) const
{
- if (buffer.data() == NULL) {
- return false;
- }
-
- int pass_offset = 0;
+ const BufferPass *pass = find_pass(type, mode);
+ return get_actual_display_pass(pass);
+}
- for (size_t j = 0; j < params.passes.size(); j++) {
- Pass &pass = params.passes[j];
+const BufferPass *BufferParams::get_actual_display_pass(const BufferPass *pass) const
+{
+ if (!pass) {
+ return nullptr;
+ }
- if (pass.type != type) {
- pass_offset += pass.components;
- continue;
+ if (pass->type == PASS_COMBINED) {
+ const BufferPass *shadow_catcher_matte_pass = find_pass(PASS_SHADOW_CATCHER_MATTE, pass->mode);
+ if (shadow_catcher_matte_pass) {
+ pass = shadow_catcher_matte_pass;
}
+ }
- float *out = buffer.data() + pass_offset;
- int pass_stride = params.get_passes_size();
- int size = params.width * params.height;
-
- assert(pass.components == components);
+ return pass;
+}
- for (int i = 0; i < size; i++, out += pass_stride, pixels += components) {
- if (pass.filter) {
- /* Scale by the number of samples, inverse of what we do in get_pass_rect.
- * A better solution would be to remove the need for set_pass_rect entirely,
- * and change baking to bake multiple objects in a tile at once. */
- for (int j = 0; j < components; j++) {
- out[j] = pixels[j] * samples;
- }
- }
- else {
- /* For non-filtered passes just straight copy, these may contain non-float data. */
- memcpy(out, pixels, sizeof(float) * components);
- }
- }
+void BufferParams::update_offset_stride()
+{
+ offset = -(full_x + full_y * width);
+ stride = width;
+}
+bool BufferParams::modified(const BufferParams &other) const
+{
+ if (!(width == other.width && height == other.height && full_x == other.full_x &&
+ full_y == other.full_y && full_width == other.full_width &&
+ full_height == other.full_height && offset == other.offset && stride == other.stride &&
+ pass_stride == other.pass_stride && layer == other.layer && view == other.view &&
+ exposure == other.exposure &&
+ use_approximate_shadow_catcher == other.use_approximate_shadow_catcher &&
+ use_transparent_background == other.use_transparent_background)) {
return true;
}
- return false;
+ return !(passes == other.passes);
}
-/* Display Buffer */
+/* --------------------------------------------------------------------
+ * Render Buffers.
+ */
-DisplayBuffer::DisplayBuffer(Device *device, bool linear)
- : draw_width(0),
- draw_height(0),
- transparent(true), /* todo: determine from background */
- half_float(linear),
- rgba_byte(device, "display buffer byte"),
- rgba_half(device, "display buffer half")
+RenderBuffers::RenderBuffers(Device *device) : buffer(device, "RenderBuffers", MEM_READ_WRITE)
{
}
-DisplayBuffer::~DisplayBuffer()
+RenderBuffers::~RenderBuffers()
{
- rgba_byte.free();
- rgba_half.free();
+ buffer.free();
}
-void DisplayBuffer::reset(BufferParams &params_)
+void RenderBuffers::reset(const BufferParams &params_)
{
- draw_width = 0;
- draw_height = 0;
+ DCHECK(params_.pass_stride != -1);
params = params_;
- /* allocate display pixels */
- if (half_float) {
- rgba_half.alloc_to_device(params.width, params.height);
- }
- else {
- rgba_byte.alloc_to_device(params.width, params.height);
- }
+ /* re-allocate buffer */
+ buffer.alloc(params.width * params.pass_stride, params.height);
}
-void DisplayBuffer::draw_set(int width, int height)
+void RenderBuffers::zero()
{
- assert(width <= params.width && height <= params.height);
+ buffer.zero_to_device();
+}
- draw_width = width;
- draw_height = height;
+bool RenderBuffers::copy_from_device()
+{
+ DCHECK(params.pass_stride != -1);
+
+ if (!buffer.device_pointer)
+ return false;
+
+ buffer.copy_from_device(0, params.width * params.pass_stride, params.height);
+
+ return true;
}
-void DisplayBuffer::draw(Device *device, const DeviceDrawParams &draw_params)
+void RenderBuffers::copy_to_device()
{
- if (draw_width != 0 && draw_height != 0) {
- device_memory &rgba = (half_float) ? (device_memory &)rgba_half : (device_memory &)rgba_byte;
-
- device->draw_pixels(rgba,
- 0,
- draw_width,
- draw_height,
- params.width,
- params.height,
- params.full_x,
- params.full_y,
- params.full_width,
- params.full_height,
- transparent,
- draw_params);
- }
+ buffer.copy_to_device();
}
-bool DisplayBuffer::draw_ready()
+void render_buffers_host_copy_denoised(RenderBuffers *dst,
+ const BufferParams &dst_params,
+ const RenderBuffers *src,
+ const BufferParams &src_params,
+ const size_t src_offset)
{
- return (draw_width != 0 && draw_height != 0);
+ DCHECK_EQ(dst_params.width, src_params.width);
+ /* TODO(sergey): More sanity checks to avoid buffer overrun. */
+
+ /* Create a map of pass offsets to be copied.
+ * Assume offsets are different to allow copying passes between buffers with different set of
+ * passes. */
+
+ struct {
+ int dst_offset;
+ int src_offset;
+ } pass_offsets[PASS_NUM];
+
+ int num_passes = 0;
+
+ for (int i = 0; i < PASS_NUM; ++i) {
+ const PassType pass_type = static_cast<PassType>(i);
+
+ const int dst_pass_offset = dst_params.get_pass_offset(pass_type, PassMode::DENOISED);
+ if (dst_pass_offset == PASS_UNUSED) {
+ continue;
+ }
+
+ const int src_pass_offset = src_params.get_pass_offset(pass_type, PassMode::DENOISED);
+ if (src_pass_offset == PASS_UNUSED) {
+ continue;
+ }
+
+ pass_offsets[num_passes].dst_offset = dst_pass_offset;
+ pass_offsets[num_passes].src_offset = src_pass_offset;
+ ++num_passes;
+ }
+
+ /* Copy passes. */
+ /* TODO(sergey): Make it more reusable, allowing implement copy of noisy passes. */
+
+ const int64_t dst_width = dst_params.width;
+ const int64_t dst_height = dst_params.height;
+ const int64_t dst_pass_stride = dst_params.pass_stride;
+ const int64_t dst_num_pixels = dst_width * dst_height;
+
+ const int64_t src_pass_stride = src_params.pass_stride;
+ const int64_t src_offset_in_floats = src_offset * src_pass_stride;
+
+ const float *src_pixel = src->buffer.data() + src_offset_in_floats;
+ float *dst_pixel = dst->buffer.data();
+
+ for (int i = 0; i < dst_num_pixels;
+ ++i, src_pixel += src_pass_stride, dst_pixel += dst_pass_stride) {
+ for (int pass_offset_idx = 0; pass_offset_idx < num_passes; ++pass_offset_idx) {
+ const int dst_pass_offset = pass_offsets[pass_offset_idx].dst_offset;
+ const int src_pass_offset = pass_offsets[pass_offset_idx].src_offset;
+
+ /* TODO(sergey): Support non-RGBA passes. */
+ dst_pixel[dst_pass_offset + 0] = src_pixel[src_pass_offset + 0];
+ dst_pixel[dst_pass_offset + 1] = src_pixel[src_pass_offset + 1];
+ dst_pixel[dst_pass_offset + 2] = src_pixel[src_pass_offset + 2];
+ dst_pixel[dst_pass_offset + 3] = src_pixel[src_pass_offset + 3];
+ }
+ }
}
CCL_NAMESPACE_END
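
The new `pass_offset_` bookkeeping above is keyed by `pass_type_mode_to_index()`, i.e. `index = (int)type * 2`, plus one for the denoised variant. A short worked example, derived directly from that function (illustrative only):

    const int noisy    = pass_type_mode_to_index(PASS_COMBINED, PassMode::NOISY);
    const int denoised = pass_type_mode_to_index(PASS_COMBINED, PassMode::DENOISED);
    /* denoised == noisy + 1: the noisy and denoised variants of a pass type
     * always occupy adjacent slots of pass_offset_[], and update_passes()
     * records the lowest buffer offset seen for each slot. */
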
diff --git a/intern/cycles/render/buffers.h b/intern/cycles/render/buffers.h
index 4ffc628bb52..184ac7197af 100644
--- a/intern/cycles/render/buffers.h
+++ b/intern/cycles/render/buffers.h
@@ -18,8 +18,8 @@
#define __BUFFERS_H__
#include "device/device_memory.h"
-
-#include "render/film.h"
+#include "graph/node.h"
+#include "render/pass.h"
#include "kernel/kernel_types.h"
@@ -34,170 +34,157 @@ class Device;
struct DeviceDrawParams;
struct float4;
+/* NOTE: Is not a real scene node. Using Node API for ease of (de)serialization. */
+class BufferPass : public Node {
+ public:
+ NODE_DECLARE
+
+ PassType type = PASS_NONE;
+ PassMode mode = PassMode::NOISY;
+ ustring name;
+ bool include_albedo = false;
+
+ int offset = -1;
+
+ BufferPass();
+ explicit BufferPass(const Pass *scene_pass);
+
+ BufferPass(BufferPass &&other) noexcept = default;
+ BufferPass(const BufferPass &other) = default;
+
+ BufferPass &operator=(BufferPass &&other) = default;
+ BufferPass &operator=(const BufferPass &other) = default;
+
+ ~BufferPass() = default;
+
+ PassInfo get_info() const;
+
+ inline bool operator==(const BufferPass &other) const
+ {
+ return type == other.type && mode == other.mode && name == other.name &&
+ include_albedo == other.include_albedo && offset == other.offset;
+ }
+ inline bool operator!=(const BufferPass &other) const
+ {
+ return !(*this == other);
+ }
+};
+
/* Buffer Parameters
* Size of render buffer and how it fits in the full image (border render). */
-class BufferParams {
+/* NOTE: Is not a real scene node. Using Node API for ease of (de)serialization. */
+class BufferParams : public Node {
public:
- /* width/height of the physical buffer */
- int width;
- int height;
-
- /* offset into and width/height of the full buffer */
- int full_x;
- int full_y;
- int full_width;
- int full_height;
-
- /* passes */
- vector<Pass> passes;
- bool denoising_data_pass;
- /* If only some light path types should be target, an additional pass is needed. */
- bool denoising_clean_pass;
- /* When we're prefiltering the passes during rendering, we need to keep both the
- * original and the prefiltered data around because neighboring tiles might still
- * need the original data. */
- bool denoising_prefiltered_pass;
-
- /* functions */
- BufferParams();
+ NODE_DECLARE
- void get_offset_stride(int &offset, int &stride);
- bool modified(const BufferParams &params);
- int get_passes_size();
- int get_denoising_offset();
- int get_denoising_prefiltered_offset();
-};
+ /* Width/height of the physical buffer. */
+ int width = 0;
+ int height = 0;
-/* Render Buffers */
+ /* Offset into and width/height of the full buffer. */
+ int full_x = 0;
+ int full_y = 0;
+ int full_width = 0;
+ int full_height = 0;
-class RenderBuffers {
- public:
- /* buffer parameters */
- BufferParams params;
+ /* Runtime fields, only valid after `update_passes()` or `update_offset_stride()`. */
+ int offset = -1, stride = -1;
- /* float buffer */
- device_vector<float> buffer;
- bool map_neighbor_copied;
- double render_time;
+ /* Runtime fields, only valid after `update_passes()`. */
+ int pass_stride = -1;
- explicit RenderBuffers(Device *device);
- ~RenderBuffers();
+ /* Properties which are used for accessing buffer pixels outside of scene graph. */
+ vector<BufferPass> passes;
+ ustring layer;
+ ustring view;
+ int samples = 0;
+ float exposure = 1.0f;
+ bool use_approximate_shadow_catcher = false;
+ bool use_transparent_background = false;
- void reset(BufferParams &params);
- void zero();
+ BufferParams();
- bool copy_from_device();
- bool get_pass_rect(
- const string &name, float exposure, int sample, int components, float *pixels);
- bool get_denoising_pass_rect(
- int offset, float exposure, int sample, int components, float *pixels);
- bool set_pass_rect(PassType type, int components, float *pixels, int samples);
-};
+ BufferParams(BufferParams &&other) noexcept = default;
+ BufferParams(const BufferParams &other) = default;
-/* Display Buffer
- *
- * The buffer used for drawing during render, filled by converting the render
- * buffers to byte of half float storage */
+ BufferParams &operator=(BufferParams &&other) = default;
+ BufferParams &operator=(const BufferParams &other) = default;
-class DisplayBuffer {
- public:
- /* buffer parameters */
- BufferParams params;
- /* dimensions for how much of the buffer is actually ready for display.
- * with progressive render we can be using only a subset of the buffer.
- * if these are zero, it means nothing can be drawn yet */
- int draw_width, draw_height;
- /* draw alpha channel? */
- bool transparent;
- /* use half float? */
- bool half_float;
- /* byte buffer for converted result */
- device_pixels<uchar4> rgba_byte;
- device_pixels<half4> rgba_half;
-
- DisplayBuffer(Device *device, bool linear = false);
- ~DisplayBuffer();
-
- void reset(BufferParams &params);
-
- void draw_set(int width, int height);
- void draw(Device *device, const DeviceDrawParams &draw_params);
- bool draw_ready();
-};
+ ~BufferParams() = default;
-/* Render Tile
- * Rendering task on a buffer */
+  /* Pre-calculate all fields which depend on the passes.
+ *
+   * When the scene passes are given, the buffer passes will be created from them and stored in
+   * these params, and the params are then updated for those passes.
+ * The `update_passes()` without parameters updates offsets and strides which are stored outside
+ * of the passes. */
+ void update_passes();
+ void update_passes(const vector<Pass *> &scene_passes);
-class RenderTile {
- public:
- typedef enum { PATH_TRACE = (1 << 0), BAKE = (1 << 1), DENOISE = (1 << 2) } Task;
+ /* Returns PASS_UNUSED if there is no such pass in the buffer. */
+ int get_pass_offset(PassType type, PassMode mode = PassMode::NOISY) const;
- Task task;
- int x, y, w, h;
- int start_sample;
- int num_samples;
- int sample;
- int resolution;
- int offset;
- int stride;
- int tile_index;
+ /* Returns nullptr if pass with given name does not exist. */
+ const BufferPass *find_pass(string_view name) const;
+ const BufferPass *find_pass(PassType type, PassMode mode = PassMode::NOISY) const;
- device_ptr buffer;
- int device_size;
+ /* Get display pass from its name.
+ * Will do special logic to replace combined pass with shadow catcher matte. */
+ const BufferPass *get_actual_display_pass(PassType type, PassMode mode = PassMode::NOISY) const;
+ const BufferPass *get_actual_display_pass(const BufferPass *pass) const;
- typedef enum { NO_STEALING = 0, CAN_BE_STOLEN = 1, WAS_STOLEN = 2 } StealingState;
- StealingState stealing_state;
+ void update_offset_stride();
- RenderBuffers *buffers;
+ bool modified(const BufferParams &other) const;
- RenderTile();
+ protected:
+ void reset_pass_offset();
- int4 bounds() const
- {
- return make_int4(x, /* xmin */
- y, /* ymin */
- x + w, /* xmax */
- y + h); /* ymax */
- }
+ /* Multiplied by 2 to be able to store noisy and denoised pass types. */
+ static constexpr int kNumPassOffsets = PASS_NUM * 2;
+
+ /* Indexed by an index derived from pass type and mode, indicates offset of the corresponding
+ * pass in the buffer.
+   * If there are multiple passes with the same type and mode, this contains the lowest offset of them. */
+ int pass_offset_[kNumPassOffsets];
};
-/* Render Tile Neighbors
- * Set of neighboring tiles used for denoising. Tile order:
- * 0 1 2
- * 3 4 5
- * 6 7 8 */
+/* Render Buffers */
-class RenderTileNeighbors {
+class RenderBuffers {
public:
- static const int SIZE = 9;
- static const int CENTER = 4;
+ /* buffer parameters */
+ BufferParams params;
- RenderTile tiles[SIZE];
- RenderTile target;
+ /* float buffer */
+ device_vector<float> buffer;
- RenderTileNeighbors(const RenderTile &center)
- {
- tiles[CENTER] = center;
- }
+ explicit RenderBuffers(Device *device);
+ ~RenderBuffers();
- int4 bounds() const
- {
- return make_int4(tiles[3].x, /* xmin */
- tiles[1].y, /* ymin */
- tiles[5].x + tiles[5].w, /* xmax */
- tiles[7].y + tiles[7].h); /* ymax */
- }
+ void reset(const BufferParams &params);
+ void zero();
- void set_bounds_from_center()
- {
- tiles[3].x = tiles[CENTER].x;
- tiles[1].y = tiles[CENTER].y;
- tiles[5].x = tiles[CENTER].x + tiles[CENTER].w;
- tiles[7].y = tiles[CENTER].y + tiles[CENTER].h;
- }
+ bool copy_from_device();
+ void copy_to_device();
};
+/* Copy denoised passes from source to destination.
+ *
+ * Buffer parameters are provided explicitly, allowing pixels to be copied between render buffers
+ * whose content corresponds to a render result at a non-unit resolution divider.
+ *
+ * `src_offset` allows offsetting the source pixel index, which is used when only a fraction of
+ * the source buffer is to be copied.
+ *
+ * The copy covers the number of pixels in the destination. */
+void render_buffers_host_copy_denoised(RenderBuffers *dst,
+ const BufferParams &dst_params,
+ const RenderBuffers *src,
+ const BufferParams &src_params,
+ const size_t src_offset = 0);
+
CCL_NAMESPACE_END
#endif /* __BUFFERS_H__ */
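
Taken together, the reworked header replaces the fixed denoising-pass flags with a pass list plus per-type offsets. A minimal sketch of how a consumer is expected to drive it (the scene, device, and pass list contents are assumptions here):

    BufferParams params;
    params.width = params.full_width = 1920;
    params.height = params.full_height = 1080;

    /* Builds BufferPass entries from the scene passes and computes
     * offset/stride/pass_stride and the per-type pass_offset_ table. */
    params.update_passes(scene->passes);

    const int combined_offset = params.get_pass_offset(PASS_COMBINED);
    const int denoised_offset = params.get_pass_offset(PASS_COMBINED, PassMode::DENOISED);
    /* Either may be PASS_UNUSED if the pass is not part of the buffer. */

    RenderBuffers buffers(device);
    buffers.reset(params); /* allocates width * pass_stride floats per row, height rows */
    buffers.zero();
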
diff --git a/intern/cycles/render/camera.cpp b/intern/cycles/render/camera.cpp
index 327f166f9d8..8b69c971991 100644
--- a/intern/cycles/render/camera.cpp
+++ b/intern/cycles/render/camera.cpp
@@ -33,9 +33,9 @@
/* needed for calculating differentials */
// clang-format off
-#include "kernel/kernel_compat_cpu.h"
-#include "kernel/split/kernel_split_data.h"
-#include "kernel/kernel_globals.h"
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/globals.h"
+
#include "kernel/kernel_projection.h"
#include "kernel/kernel_differential.h"
#include "kernel/kernel_montecarlo.h"
@@ -169,7 +169,6 @@ Camera::Camera() : Node(get_node_type())
width = 1024;
height = 512;
- resolution = 1;
use_perspective_motion = false;
@@ -455,7 +454,6 @@ void Camera::update(Scene *scene)
/* render size */
kcam->width = width;
kcam->height = height;
- kcam->resolution = resolution;
/* store differentials */
kcam->dx = float3_to_float4(dx);
@@ -776,9 +774,11 @@ float Camera::world_to_raster_size(float3 P)
&ray);
#endif
- differential_transfer(&ray.dP, ray.dP, ray.D, ray.dD, ray.D, dist);
+ /* TODO: would it help to use more accurate differentials here? */
+ differential3 dP;
+ differential_transfer_compact(&dP, ray.dP, ray.D, ray.dD, ray.D, dist);
- return max(len(ray.dP.dx), len(ray.dP.dy));
+ return max(len(dP.dx), len(dP.dy));
}
return res;
@@ -789,12 +789,11 @@ bool Camera::use_motion() const
return motion.size() > 1;
}
-void Camera::set_screen_size_and_resolution(int width_, int height_, int resolution_)
+void Camera::set_screen_size(int width_, int height_)
{
- if (width_ != width || height_ != height || resolution_ != resolution) {
+ if (width_ != width || height_ != height) {
width = width_;
height = height_;
- resolution = resolution_;
tag_modified();
}
}
diff --git a/intern/cycles/render/camera.h b/intern/cycles/render/camera.h
index 5abb4750764..cb8ecac1a7e 100644
--- a/intern/cycles/render/camera.h
+++ b/intern/cycles/render/camera.h
@@ -199,7 +199,6 @@ class Camera : public Node {
private:
int width;
int height;
- int resolution;
public:
/* functions */
@@ -225,7 +224,7 @@ class Camera : public Node {
int motion_step(float time) const;
bool use_motion() const;
- void set_screen_size_and_resolution(int width_, int height_, int resolution_);
+ void set_screen_size(int width_, int height_);
private:
/* Private utility functions. */
diff --git a/intern/cycles/render/coverage.cpp b/intern/cycles/render/coverage.cpp
deleted file mode 100644
index 99d4daa6961..00000000000
--- a/intern/cycles/render/coverage.cpp
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Copyright 2018 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "render/coverage.h"
-#include "render/buffers.h"
-
-#include "kernel/kernel_compat_cpu.h"
-#include "kernel/kernel_types.h"
-#include "kernel/split/kernel_split_data.h"
-
-#include "kernel/kernel_globals.h"
-#include "kernel/kernel_id_passes.h"
-
-#include "util/util_map.h"
-
-CCL_NAMESPACE_BEGIN
-
-static bool crypomatte_comp(const pair<float, float> &i, const pair<float, float> j)
-{
- return i.first > j.first;
-}
-
-void Coverage::finalize()
-{
- int pass_offset = 0;
- if (kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) {
- finalize_buffer(coverage_object, pass_offset);
- pass_offset += kernel_data.film.cryptomatte_depth * 4;
- }
- if (kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) {
- finalize_buffer(coverage_material, pass_offset);
- pass_offset += kernel_data.film.cryptomatte_depth * 4;
- }
- if (kernel_data.film.cryptomatte_passes & CRYPT_ASSET) {
- finalize_buffer(coverage_asset, pass_offset);
- }
-}
-
-void Coverage::init_path_trace()
-{
- kg->coverage_object = kg->coverage_material = kg->coverage_asset = NULL;
-
- if (kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE) {
- if (kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) {
- coverage_object.clear();
- coverage_object.resize(tile.w * tile.h);
- }
- if (kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) {
- coverage_material.clear();
- coverage_material.resize(tile.w * tile.h);
- }
- if (kernel_data.film.cryptomatte_passes & CRYPT_ASSET) {
- coverage_asset.clear();
- coverage_asset.resize(tile.w * tile.h);
- }
- }
-}
-
-void Coverage::init_pixel(int x, int y)
-{
- if (kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE) {
- const int pixel_index = tile.w * (y - tile.y) + x - tile.x;
- if (kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) {
- kg->coverage_object = &coverage_object[pixel_index];
- }
- if (kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) {
- kg->coverage_material = &coverage_material[pixel_index];
- }
- if (kernel_data.film.cryptomatte_passes & CRYPT_ASSET) {
- kg->coverage_asset = &coverage_asset[pixel_index];
- }
- }
-}
-
-void Coverage::finalize_buffer(vector<CoverageMap> &coverage, const int pass_offset)
-{
- if (kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE) {
- flatten_buffer(coverage, pass_offset);
- }
- else {
- sort_buffer(pass_offset);
- }
-}
-
-void Coverage::flatten_buffer(vector<CoverageMap> &coverage, const int pass_offset)
-{
- /* Sort the coverage map and write it to the output */
- int pixel_index = 0;
- int pass_stride = tile.buffers->params.get_passes_size();
- for (int y = 0; y < tile.h; ++y) {
- for (int x = 0; x < tile.w; ++x) {
- const CoverageMap &pixel = coverage[pixel_index];
- if (!pixel.empty()) {
- /* buffer offset */
- int index = x + y * tile.stride;
- float *buffer = (float *)tile.buffer + index * pass_stride;
-
- /* sort the cryptomatte pixel */
- vector<pair<float, float>> sorted_pixel;
- for (CoverageMap::const_iterator it = pixel.begin(); it != pixel.end(); ++it) {
- sorted_pixel.push_back(std::make_pair(it->second, it->first));
- }
- sort(sorted_pixel.begin(), sorted_pixel.end(), crypomatte_comp);
- int num_slots = 2 * (kernel_data.film.cryptomatte_depth);
- if (sorted_pixel.size() > num_slots) {
- float leftover = 0.0f;
- for (vector<pair<float, float>>::iterator it = sorted_pixel.begin() + num_slots;
- it != sorted_pixel.end();
- ++it) {
- leftover += it->first;
- }
- sorted_pixel[num_slots - 1].first += leftover;
- }
- int limit = min(num_slots, sorted_pixel.size());
- for (int i = 0; i < limit; ++i) {
- kernel_write_id_slots(buffer + kernel_data.film.pass_cryptomatte + pass_offset,
- 2 * (kernel_data.film.cryptomatte_depth),
- sorted_pixel[i].second,
- sorted_pixel[i].first);
- }
- }
- ++pixel_index;
- }
- }
-}
-
-void Coverage::sort_buffer(const int pass_offset)
-{
- /* Sort the coverage map and write it to the output */
- int pass_stride = tile.buffers->params.get_passes_size();
- for (int y = 0; y < tile.h; ++y) {
- for (int x = 0; x < tile.w; ++x) {
- /* buffer offset */
- int index = x + y * tile.stride;
- float *buffer = (float *)tile.buffer + index * pass_stride;
- kernel_sort_id_slots(buffer + kernel_data.film.pass_cryptomatte + pass_offset,
- 2 * (kernel_data.film.cryptomatte_depth));
- }
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/render/coverage.h b/intern/cycles/render/coverage.h
deleted file mode 100644
index 12182c614da..00000000000
--- a/intern/cycles/render/coverage.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright 2018 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __COVERAGE_H__
-#define __COVERAGE_H__
-
-#include "util/util_map.h"
-#include "util/util_vector.h"
-
-CCL_NAMESPACE_BEGIN
-
-struct KernelGlobals;
-class RenderTile;
-
-typedef unordered_map<float, float> CoverageMap;
-
-class Coverage {
- public:
- Coverage(KernelGlobals *kg_, RenderTile &tile_) : kg(kg_), tile(tile_)
- {
- }
- void init_path_trace();
- void init_pixel(int x, int y);
- void finalize();
-
- private:
- vector<CoverageMap> coverage_object;
- vector<CoverageMap> coverage_material;
- vector<CoverageMap> coverage_asset;
- KernelGlobals *kg;
- RenderTile &tile;
- void finalize_buffer(vector<CoverageMap> &coverage, const int pass_offset);
- void flatten_buffer(vector<CoverageMap> &coverage, const int pass_offset);
- void sort_buffer(const int pass_offset);
-};
-
-CCL_NAMESPACE_END
-
-#endif /* __COVERAGE_H__ */
diff --git a/intern/cycles/render/denoising.cpp b/intern/cycles/render/denoising.cpp
index ddbe7484800..bcf8d3fa204 100644
--- a/intern/cycles/render/denoising.cpp
+++ b/intern/cycles/render/denoising.cpp
@@ -16,15 +16,17 @@
#include "render/denoising.h"
-#include "kernel/filter/filter_defines.h"
+#if 0
-#include "util/util_foreach.h"
-#include "util/util_map.h"
-#include "util/util_system.h"
-#include "util/util_task.h"
-#include "util/util_time.h"
+# include "kernel/filter/filter_defines.h"
-#include <OpenImageIO/filesystem.h>
+# include "util/util_foreach.h"
+# include "util/util_map.h"
+# include "util/util_system.h"
+# include "util/util_task.h"
+# include "util/util_time.h"
+
+# include <OpenImageIO/filesystem.h>
CCL_NAMESPACE_BEGIN
@@ -225,7 +227,7 @@ bool DenoiseImageLayer::match_channels(int neighbor,
/* Denoise Task */
DenoiseTask::DenoiseTask(Device *device,
- Denoiser *denoiser,
+ DenoiserPipeline *denoiser,
int frame,
const vector<int> &neighbor_frames)
: denoiser(denoiser),
@@ -386,7 +388,6 @@ void DenoiseTask::create_task(DeviceTask &task)
task.denoising = denoiser->params;
task.denoising.type = DENOISER_NLM;
task.denoising.use = true;
- task.denoising.store_passes = false;
task.denoising_from_render = false;
task.denoising_frames.resize(neighbor_frames.size());
@@ -863,7 +864,7 @@ bool DenoiseImage::save_output(const string &out_filepath, string &error)
/* File pattern handling and outer loop over frames */
-Denoiser::Denoiser(DeviceInfo &device_info)
+DenoiserPipeline::DenoiserPipeline(DeviceInfo &device_info)
{
samples_override = 0;
tile_size = make_int2(64, 64);
@@ -876,18 +877,16 @@ Denoiser::Denoiser(DeviceInfo &device_info)
/* Initialize device. */
device = Device::create(device_info, stats, profiler, true);
- DeviceRequestedFeatures req;
- req.use_denoising = true;
- device->load_kernels(req);
+ device->load_kernels(KERNEL_FEATURE_DENOISING);
}
-Denoiser::~Denoiser()
+DenoiserPipeline::~DenoiserPipeline()
{
delete device;
TaskScheduler::exit();
}
-bool Denoiser::run()
+bool DenoiserPipeline::run()
{
assert(input.size() == output.size());
@@ -931,3 +930,5 @@ bool Denoiser::run()
}
CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/render/denoising.h b/intern/cycles/render/denoising.h
index c1b4d0a5596..097cc570d06 100644
--- a/intern/cycles/render/denoising.h
+++ b/intern/cycles/render/denoising.h
@@ -17,27 +17,31 @@
#ifndef __DENOISING_H__
#define __DENOISING_H__
-#include "device/device.h"
-#include "device/device_denoising.h"
+#if 0
-#include "render/buffers.h"
+/* TODO(sergey): Make it explicit and clear when something is a denoiser, its pipeline or
+ * parameters. Currently it is an annoying mixture of terms used interchangeably. */
-#include "util/util_string.h"
-#include "util/util_unique_ptr.h"
-#include "util/util_vector.h"
+# include "device/device.h"
-#include <OpenImageIO/imageio.h>
+# include "render/buffers.h"
+
+# include "util/util_string.h"
+# include "util/util_unique_ptr.h"
+# include "util/util_vector.h"
+
+# include <OpenImageIO/imageio.h>
OIIO_NAMESPACE_USING
CCL_NAMESPACE_BEGIN
-/* Denoiser */
+/* Denoiser pipeline */
-class Denoiser {
+class DenoiserPipeline {
public:
- Denoiser(DeviceInfo &device_info);
- ~Denoiser();
+ DenoiserPipeline(DeviceInfo &device_info);
+ ~DenoiserPipeline();
bool run();
@@ -155,7 +159,10 @@ class DenoiseImage {
class DenoiseTask {
public:
- DenoiseTask(Device *device, Denoiser *denoiser, int frame, const vector<int> &neighbor_frames);
+ DenoiseTask(Device *device,
+ DenoiserPipeline *denoiser,
+ int frame,
+ const vector<int> &neighbor_frames);
~DenoiseTask();
/* Task stages */
@@ -168,7 +175,7 @@ class DenoiseTask {
protected:
/* Denoiser parameters and device */
- Denoiser *denoiser;
+ DenoiserPipeline *denoiser;
Device *device;
/* Frame number to be denoised */
@@ -204,4 +211,6 @@ class DenoiseTask {
CCL_NAMESPACE_END
+#endif
+
#endif /* __DENOISING_H__ */
diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp
index 5df396394c4..8e14b338bd3 100644
--- a/intern/cycles/render/film.cpp
+++ b/intern/cycles/render/film.cpp
@@ -16,9 +16,12 @@
#include "render/film.h"
#include "device/device.h"
+#include "render/background.h"
+#include "render/bake.h"
#include "render/camera.h"
#include "render/integrator.h"
#include "render/mesh.h"
+#include "render/object.h"
#include "render/scene.h"
#include "render/stats.h"
#include "render/tables.h"
@@ -31,261 +34,6 @@
CCL_NAMESPACE_BEGIN
-/* Pass */
-
-static bool compare_pass_order(const Pass &a, const Pass &b)
-{
- if (a.components == b.components)
- return (a.type < b.type);
- return (a.components > b.components);
-}
-
-static NodeEnum *get_pass_type_enum()
-{
- static NodeEnum pass_type_enum;
- pass_type_enum.insert("combined", PASS_COMBINED);
- pass_type_enum.insert("depth", PASS_DEPTH);
- pass_type_enum.insert("normal", PASS_NORMAL);
- pass_type_enum.insert("uv", PASS_UV);
- pass_type_enum.insert("object_id", PASS_OBJECT_ID);
- pass_type_enum.insert("material_id", PASS_MATERIAL_ID);
- pass_type_enum.insert("motion", PASS_MOTION);
- pass_type_enum.insert("motion_weight", PASS_MOTION_WEIGHT);
- pass_type_enum.insert("render_time", PASS_RENDER_TIME);
- pass_type_enum.insert("cryptomatte", PASS_CRYPTOMATTE);
- pass_type_enum.insert("aov_color", PASS_AOV_COLOR);
- pass_type_enum.insert("aov_value", PASS_AOV_VALUE);
- pass_type_enum.insert("adaptive_aux_buffer", PASS_ADAPTIVE_AUX_BUFFER);
- pass_type_enum.insert("sample_count", PASS_SAMPLE_COUNT);
- pass_type_enum.insert("mist", PASS_MIST);
- pass_type_enum.insert("emission", PASS_EMISSION);
- pass_type_enum.insert("background", PASS_BACKGROUND);
- pass_type_enum.insert("ambient_occlusion", PASS_AO);
- pass_type_enum.insert("shadow", PASS_SHADOW);
- pass_type_enum.insert("diffuse_direct", PASS_DIFFUSE_DIRECT);
- pass_type_enum.insert("diffuse_indirect", PASS_DIFFUSE_INDIRECT);
- pass_type_enum.insert("diffuse_color", PASS_DIFFUSE_COLOR);
- pass_type_enum.insert("glossy_direct", PASS_GLOSSY_DIRECT);
- pass_type_enum.insert("glossy_indirect", PASS_GLOSSY_INDIRECT);
- pass_type_enum.insert("glossy_color", PASS_GLOSSY_COLOR);
- pass_type_enum.insert("transmission_direct", PASS_TRANSMISSION_DIRECT);
- pass_type_enum.insert("transmission_indirect", PASS_TRANSMISSION_INDIRECT);
- pass_type_enum.insert("transmission_color", PASS_TRANSMISSION_COLOR);
- pass_type_enum.insert("volume_direct", PASS_VOLUME_DIRECT);
- pass_type_enum.insert("volume_indirect", PASS_VOLUME_INDIRECT);
- pass_type_enum.insert("bake_primitive", PASS_BAKE_PRIMITIVE);
- pass_type_enum.insert("bake_differential", PASS_BAKE_DIFFERENTIAL);
-
- return &pass_type_enum;
-}
-
-NODE_DEFINE(Pass)
-{
- NodeType *type = NodeType::add("pass", create);
-
- NodeEnum *pass_type_enum = get_pass_type_enum();
- SOCKET_ENUM(type, "Type", *pass_type_enum, PASS_COMBINED);
- SOCKET_STRING(name, "Name", ustring());
-
- return type;
-}
-
-Pass::Pass() : Node(get_node_type())
-{
-}
-
-void Pass::add(PassType type, vector<Pass> &passes, const char *name)
-{
- for (size_t i = 0; i < passes.size(); i++) {
- if (passes[i].type != type) {
- continue;
- }
-
- /* An empty name is used as a placeholder to signal that any pass of
- * that type is fine (because the content always is the same).
- * This is important to support divide_type: If the pass that has a
- * divide_type is added first, a pass for divide_type with an empty
- * name will be added. Then, if a matching pass with a name is later
- * requested, the existing placeholder will be renamed to that.
- * If the divide_type is explicitly allocated with a name first and
- * then again as part of another pass, the second one will just be
- * skipped because that type already exists. */
-
- /* If no name is specified, any pass of the correct type will match. */
- if (name == NULL) {
- return;
- }
-
- /* If we already have a placeholder pass, rename that one. */
- if (passes[i].name.empty()) {
- passes[i].name = name;
- return;
- }
-
- /* If neither existing nor requested pass have placeholder name, they
- * must match. */
- if (name == passes[i].name) {
- return;
- }
- }
-
- Pass pass;
-
- pass.type = type;
- pass.filter = true;
- pass.exposure = false;
- pass.divide_type = PASS_NONE;
- if (name) {
- pass.name = name;
- }
-
- switch (type) {
- case PASS_NONE:
- pass.components = 0;
- break;
- case PASS_COMBINED:
- pass.components = 4;
- pass.exposure = true;
- break;
- case PASS_DEPTH:
- pass.components = 1;
- pass.filter = false;
- break;
- case PASS_MIST:
- pass.components = 1;
- break;
- case PASS_NORMAL:
- pass.components = 4;
- break;
- case PASS_UV:
- pass.components = 4;
- break;
- case PASS_MOTION:
- pass.components = 4;
- pass.divide_type = PASS_MOTION_WEIGHT;
- break;
- case PASS_MOTION_WEIGHT:
- pass.components = 1;
- break;
- case PASS_OBJECT_ID:
- case PASS_MATERIAL_ID:
- pass.components = 1;
- pass.filter = false;
- break;
-
- case PASS_EMISSION:
- case PASS_BACKGROUND:
- pass.components = 4;
- pass.exposure = true;
- break;
- case PASS_AO:
- pass.components = 4;
- break;
- case PASS_SHADOW:
- pass.components = 4;
- pass.exposure = false;
- break;
- case PASS_LIGHT:
- /* This isn't a real pass, used by baking to see whether
- * light data is needed or not.
- *
- * Set components to 0 so pass sort below happens in a
- * determined way.
- */
- pass.components = 0;
- break;
- case PASS_RENDER_TIME:
- /* This pass is handled entirely on the host side. */
- pass.components = 0;
- break;
-
- case PASS_DIFFUSE_COLOR:
- case PASS_GLOSSY_COLOR:
- case PASS_TRANSMISSION_COLOR:
- pass.components = 4;
- break;
- case PASS_DIFFUSE_DIRECT:
- case PASS_DIFFUSE_INDIRECT:
- pass.components = 4;
- pass.exposure = true;
- pass.divide_type = PASS_DIFFUSE_COLOR;
- break;
- case PASS_GLOSSY_DIRECT:
- case PASS_GLOSSY_INDIRECT:
- pass.components = 4;
- pass.exposure = true;
- pass.divide_type = PASS_GLOSSY_COLOR;
- break;
- case PASS_TRANSMISSION_DIRECT:
- case PASS_TRANSMISSION_INDIRECT:
- pass.components = 4;
- pass.exposure = true;
- pass.divide_type = PASS_TRANSMISSION_COLOR;
- break;
- case PASS_VOLUME_DIRECT:
- case PASS_VOLUME_INDIRECT:
- pass.components = 4;
- pass.exposure = true;
- break;
- case PASS_CRYPTOMATTE:
- pass.components = 4;
- break;
- case PASS_ADAPTIVE_AUX_BUFFER:
- pass.components = 4;
- break;
- case PASS_SAMPLE_COUNT:
- pass.components = 1;
- pass.exposure = false;
- break;
- case PASS_AOV_COLOR:
- pass.components = 4;
- break;
- case PASS_AOV_VALUE:
- pass.components = 1;
- break;
- case PASS_BAKE_PRIMITIVE:
- case PASS_BAKE_DIFFERENTIAL:
- pass.components = 4;
- pass.exposure = false;
- pass.filter = false;
- break;
- default:
- assert(false);
- break;
- }
-
- passes.push_back(pass);
-
- /* Order from by components, to ensure alignment so passes with size 4
- * come first and then passes with size 1. Note this must use stable sort
- * so cryptomatte passes remain in the right order. */
- stable_sort(&passes[0], &passes[0] + passes.size(), compare_pass_order);
-
- if (pass.divide_type != PASS_NONE)
- Pass::add(pass.divide_type, passes);
-}
-
-bool Pass::equals(const vector<Pass> &A, const vector<Pass> &B)
-{
- if (A.size() != B.size())
- return false;
-
- for (int i = 0; i < A.size(); i++)
- if (A[i].type != B[i].type || A[i].name != B[i].name)
- return false;
-
- return true;
-}
-
-bool Pass::contains(const vector<Pass> &passes, PassType type)
-{
- for (size_t i = 0; i < passes.size(); i++)
- if (passes[i].type == type)
- return true;
-
- return false;
-}
-
/* Pixel Filter */
static float filter_func_box(float /*v*/, float /*width*/)
@@ -368,17 +116,11 @@ NODE_DEFINE(Film)
SOCKET_FLOAT(mist_depth, "Mist Depth", 100.0f);
SOCKET_FLOAT(mist_falloff, "Mist Falloff", 1.0f);
- SOCKET_BOOLEAN(denoising_data_pass, "Generate Denoising Data Pass", false);
- SOCKET_BOOLEAN(denoising_clean_pass, "Generate Denoising Clean Pass", false);
- SOCKET_BOOLEAN(denoising_prefiltered_pass, "Generate Denoising Prefiltered Pass", false);
- SOCKET_INT(denoising_flags, "Denoising Flags", 0);
- SOCKET_BOOLEAN(use_adaptive_sampling, "Use Adaptive Sampling", false);
-
- SOCKET_BOOLEAN(use_light_visibility, "Use Light Visibility", false);
-
- NodeEnum *pass_type_enum = get_pass_type_enum();
+ const NodeEnum *pass_type_enum = Pass::get_type_enum();
SOCKET_ENUM(display_pass, "Display Pass", *pass_type_enum, PASS_COMBINED);
+ SOCKET_BOOLEAN(show_active_pixels, "Show Active Pixels", false);
+
static NodeEnum cryptomatte_passes_enum;
cryptomatte_passes_enum.insert("none", CRYPT_NONE);
cryptomatte_passes_enum.insert("object", CRYPT_OBJECT);
@@ -389,15 +131,13 @@ NODE_DEFINE(Film)
SOCKET_INT(cryptomatte_depth, "Cryptomatte Depth", 0);
+ SOCKET_BOOLEAN(use_approximate_shadow_catcher, "Use Approximate Shadow Catcher", false);
+
return type;
}
-Film::Film() : Node(get_node_type())
+Film::Film() : Node(get_node_type()), filter_table_offset_(TABLE_OFFSET_INVALID)
{
- use_light_visibility = false;
- filter_table_offset = TABLE_OFFSET_INVALID;
- cryptomatte_passes = CRYPT_NONE;
- display_pass = PASS_COMBINED;
}
Film::~Film()
@@ -406,7 +146,8 @@ Film::~Film()
void Film::add_default(Scene *scene)
{
- Pass::add(PASS_COMBINED, scene->passes);
+ Pass *pass = scene->create_node<Pass>();
+ pass->set_type(PASS_COMBINED);
}
void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
@@ -426,50 +167,77 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
/* update __data */
kfilm->exposure = exposure;
+ kfilm->pass_alpha_threshold = pass_alpha_threshold;
kfilm->pass_flag = 0;
- kfilm->display_pass_stride = -1;
- kfilm->display_pass_components = 0;
- kfilm->display_divide_pass_stride = -1;
- kfilm->use_display_exposure = false;
- kfilm->use_display_pass_alpha = (display_pass == PASS_COMBINED);
+ kfilm->use_approximate_shadow_catcher = get_use_approximate_shadow_catcher();
kfilm->light_pass_flag = 0;
kfilm->pass_stride = 0;
- kfilm->use_light_pass = use_light_visibility;
- kfilm->pass_aov_value_num = 0;
- kfilm->pass_aov_color_num = 0;
+
+ /* Mark with PASS_UNUSED to avoid mask test in the kernel. */
+ kfilm->pass_background = PASS_UNUSED;
+ kfilm->pass_emission = PASS_UNUSED;
+ kfilm->pass_ao = PASS_UNUSED;
+ kfilm->pass_diffuse_direct = PASS_UNUSED;
+ kfilm->pass_diffuse_indirect = PASS_UNUSED;
+ kfilm->pass_glossy_direct = PASS_UNUSED;
+ kfilm->pass_glossy_indirect = PASS_UNUSED;
+ kfilm->pass_transmission_direct = PASS_UNUSED;
+ kfilm->pass_transmission_indirect = PASS_UNUSED;
+ kfilm->pass_volume_direct = PASS_UNUSED;
+ kfilm->pass_volume_indirect = PASS_UNUSED;
+ kfilm->pass_shadow = PASS_UNUSED;
+
+ /* Mark passes as unused so that the kernel knows the pass is inaccessible. */
+ kfilm->pass_denoising_normal = PASS_UNUSED;
+ kfilm->pass_denoising_albedo = PASS_UNUSED;
+ kfilm->pass_sample_count = PASS_UNUSED;
+ kfilm->pass_adaptive_aux_buffer = PASS_UNUSED;
+ kfilm->pass_shadow_catcher = PASS_UNUSED;
+ kfilm->pass_shadow_catcher_sample_count = PASS_UNUSED;
+ kfilm->pass_shadow_catcher_matte = PASS_UNUSED;
bool have_cryptomatte = false;
+ bool have_aov_color = false;
+ bool have_aov_value = false;
for (size_t i = 0; i < scene->passes.size(); i++) {
- Pass &pass = scene->passes[i];
+ const Pass *pass = scene->passes[i];
- if (pass.type == PASS_NONE) {
+ if (pass->get_type() == PASS_NONE || !pass->is_written()) {
+ continue;
+ }
+
+ if (pass->get_mode() == PassMode::DENOISED) {
+ /* Generally we only store offsets of the noisy passes. The display pass is an exception
+ * since it is a read operation and not a write. */
+ kfilm->pass_stride += pass->get_info().num_components;
continue;
}
/* Can't do motion pass if no motion vectors are available. */
- if (pass.type == PASS_MOTION || pass.type == PASS_MOTION_WEIGHT) {
+ if (pass->get_type() == PASS_MOTION || pass->get_type() == PASS_MOTION_WEIGHT) {
if (scene->need_motion() != Scene::MOTION_PASS) {
- kfilm->pass_stride += pass.components;
+ kfilm->pass_stride += pass->get_info().num_components;
continue;
}
}
- int pass_flag = (1 << (pass.type % 32));
- if (pass.type <= PASS_CATEGORY_MAIN_END) {
- kfilm->pass_flag |= pass_flag;
- }
- else if (pass.type <= PASS_CATEGORY_LIGHT_END) {
- kfilm->use_light_pass = 1;
+ const int pass_flag = (1 << (pass->get_type() % 32));
+ if (pass->get_type() <= PASS_CATEGORY_LIGHT_END) {
kfilm->light_pass_flag |= pass_flag;
}
+ else if (pass->get_type() <= PASS_CATEGORY_DATA_END) {
+ kfilm->pass_flag |= pass_flag;
+ }
else {
- assert(pass.type <= PASS_CATEGORY_BAKE_END);
+ assert(pass->get_type() <= PASS_CATEGORY_BAKE_END);
}
- switch (pass.type) {
+ switch (pass->get_type()) {
case PASS_COMBINED:
kfilm->pass_combined = kfilm->pass_stride;
break;
@@ -479,6 +247,12 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
case PASS_NORMAL:
kfilm->pass_normal = kfilm->pass_stride;
break;
+ case PASS_POSITION:
+ kfilm->pass_position = kfilm->pass_stride;
+ break;
+ case PASS_ROUGHNESS:
+ kfilm->pass_roughness = kfilm->pass_stride;
+ break;
case PASS_UV:
kfilm->pass_uv = kfilm->pass_stride;
break;
@@ -511,9 +285,6 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
kfilm->pass_shadow = kfilm->pass_stride;
break;
- case PASS_LIGHT:
- break;
-
case PASS_DIFFUSE_COLOR:
kfilm->pass_diffuse_color = kfilm->pass_stride;
break;
@@ -563,78 +334,56 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
kfilm->pass_stride;
have_cryptomatte = true;
break;
+
+ case PASS_DENOISING_NORMAL:
+ kfilm->pass_denoising_normal = kfilm->pass_stride;
+ break;
+ case PASS_DENOISING_ALBEDO:
+ kfilm->pass_denoising_albedo = kfilm->pass_stride;
+ break;
+
+ case PASS_SHADOW_CATCHER:
+ kfilm->pass_shadow_catcher = kfilm->pass_stride;
+ break;
+ case PASS_SHADOW_CATCHER_SAMPLE_COUNT:
+ kfilm->pass_shadow_catcher_sample_count = kfilm->pass_stride;
+ break;
+ case PASS_SHADOW_CATCHER_MATTE:
+ kfilm->pass_shadow_catcher_matte = kfilm->pass_stride;
+ break;
+
case PASS_ADAPTIVE_AUX_BUFFER:
kfilm->pass_adaptive_aux_buffer = kfilm->pass_stride;
break;
case PASS_SAMPLE_COUNT:
kfilm->pass_sample_count = kfilm->pass_stride;
break;
+
case PASS_AOV_COLOR:
- if (kfilm->pass_aov_color_num == 0) {
+ if (!have_aov_color) {
kfilm->pass_aov_color = kfilm->pass_stride;
+ have_aov_color = true;
}
- kfilm->pass_aov_color_num++;
break;
case PASS_AOV_VALUE:
- if (kfilm->pass_aov_value_num == 0) {
+ if (!have_aov_value) {
kfilm->pass_aov_value = kfilm->pass_stride;
+ have_aov_value = true;
}
- kfilm->pass_aov_value_num++;
break;
default:
assert(false);
break;
}
- if (pass.type == display_pass) {
- kfilm->display_pass_stride = kfilm->pass_stride;
- kfilm->display_pass_components = pass.components;
- kfilm->use_display_exposure = pass.exposure && (kfilm->exposure != 1.0f);
- }
- else if (pass.type == PASS_DIFFUSE_COLOR || pass.type == PASS_TRANSMISSION_COLOR ||
- pass.type == PASS_GLOSSY_COLOR) {
- kfilm->display_divide_pass_stride = kfilm->pass_stride;
- }
-
- kfilm->pass_stride += pass.components;
- }
-
- kfilm->pass_denoising_data = 0;
- kfilm->pass_denoising_clean = 0;
- kfilm->denoising_flags = 0;
- if (denoising_data_pass) {
- kfilm->pass_denoising_data = kfilm->pass_stride;
- kfilm->pass_stride += DENOISING_PASS_SIZE_BASE;
- kfilm->denoising_flags = denoising_flags;
- if (denoising_clean_pass) {
- kfilm->pass_denoising_clean = kfilm->pass_stride;
- kfilm->pass_stride += DENOISING_PASS_SIZE_CLEAN;
- kfilm->use_light_pass = 1;
- }
- if (denoising_prefiltered_pass) {
- kfilm->pass_stride += DENOISING_PASS_SIZE_PREFILTERED;
- }
- }
-
- kfilm->pass_stride = align_up(kfilm->pass_stride, 4);
-
- /* When displaying the normal/uv pass in the viewport we need to disable
- * transparency.
- *
- * We also don't need to perform light accumulations. Later we want to optimize this to suppress
- * light calculations. */
- if (display_pass == PASS_NORMAL || display_pass == PASS_UV) {
- kfilm->use_light_pass = 0;
- }
- else {
- kfilm->pass_alpha_threshold = pass_alpha_threshold;
+ kfilm->pass_stride += pass->get_info().num_components;
}
/* update filter table */
vector<float> table = filter_table(filter_type, filter_width);
- scene->lookup_tables->remove_table(&filter_table_offset);
- filter_table_offset = scene->lookup_tables->add_table(dscene, table);
- kfilm->filter_table_offset = (int)filter_table_offset;
+ scene->lookup_tables->remove_table(&filter_table_offset_);
+ filter_table_offset_ = scene->lookup_tables->add_table(dscene, table);
+ kfilm->filter_table_offset = (int)filter_table_offset_;
/* mist pass parameters */
kfilm->mist_start = mist_start;
@@ -644,79 +393,298 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
kfilm->cryptomatte_passes = cryptomatte_passes;
kfilm->cryptomatte_depth = cryptomatte_depth;
- pass_stride = kfilm->pass_stride;
- denoising_data_offset = kfilm->pass_denoising_data;
- denoising_clean_offset = kfilm->pass_denoising_clean;
-
clear_modified();
}
void Film::device_free(Device * /*device*/, DeviceScene * /*dscene*/, Scene *scene)
{
- scene->lookup_tables->remove_table(&filter_table_offset);
+ scene->lookup_tables->remove_table(&filter_table_offset_);
}
-void Film::tag_passes_update(Scene *scene, const vector<Pass> &passes_, bool update_passes)
+int Film::get_aov_offset(Scene *scene, string name, bool &is_color)
{
- if (Pass::contains(scene->passes, PASS_UV) != Pass::contains(passes_, PASS_UV)) {
- scene->geometry_manager->tag_update(scene, GeometryManager::UV_PASS_NEEDED);
+ int offset_color = 0, offset_value = 0;
+ foreach (const Pass *pass, scene->passes) {
+ if (pass->get_name() == name) {
+ if (pass->get_type() == PASS_AOV_VALUE) {
+ is_color = false;
+ return offset_value;
+ }
+ else if (pass->get_type() == PASS_AOV_COLOR) {
+ is_color = true;
+ return offset_color;
+ }
+ }
+
+ if (pass->get_type() == PASS_AOV_VALUE) {
+ offset_value += pass->get_info().num_components;
+ }
+ else if (pass->get_type() == PASS_AOV_COLOR) {
+ offset_color += pass->get_info().num_components;
+ }
+ }
+
+ return -1;
+}
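A short, hypothetical usage example of the offset accumulation above; the pass names and the `film`/`scene` variables are assumptions for illustration only:

/* Suppose the scene pass list contains, in order:
 *   PASS_AOV_COLOR "tint"  (4 components)
 *   PASS_AOV_VALUE "mask"  (1 component)
 *   PASS_AOV_COLOR "dirt"  (4 components)
 * Offsets are counted per AOV kind, so "dirt" starts after the 4 components
 * of "tint", while "mask" starts at 0 within the value AOVs. */
bool is_color = false;
const int offset = film->get_aov_offset(scene, "dirt", is_color);
/* Expected: offset == 4, is_color == true; -1 would mean no such AOV exists. */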
+
+void Film::update_passes(Scene *scene, bool add_sample_count_pass)
+{
+ const Background *background = scene->background;
+ const BakeManager *bake_manager = scene->bake_manager;
+ const ObjectManager *object_manager = scene->object_manager;
+ Integrator *integrator = scene->integrator;
+
+ if (!is_modified() && !object_manager->need_update() && !integrator->is_modified()) {
+ return;
+ }
+
+ /* Remove auto generated passes and recreate them. */
+ remove_auto_passes(scene);
+
+ /* Display pass for viewport. */
+ const PassType display_pass = get_display_pass();
+ add_auto_pass(scene, display_pass);
+
+ /* The assumption is that a combined pass always exists for now; for example,
+ * adaptive sampling is always based on the combined pass. We should try to
+ * lift this limitation in the future for faster rendering of individual
+ * passes. */
+ if (display_pass != PASS_COMBINED) {
+ add_auto_pass(scene, PASS_COMBINED);
+ }
+
+ /* Create passes needed for adaptive sampling. */
+ const AdaptiveSampling adaptive_sampling = integrator->get_adaptive_sampling();
+ if (adaptive_sampling.use) {
+ add_auto_pass(scene, PASS_SAMPLE_COUNT);
+ add_auto_pass(scene, PASS_ADAPTIVE_AUX_BUFFER);
+ }
+
+ /* Create passes needed for denoising. */
+ const bool use_denoise = integrator->get_use_denoise();
+ if (use_denoise) {
+ if (integrator->get_use_denoise_pass_normal()) {
+ add_auto_pass(scene, PASS_DENOISING_NORMAL);
+ }
+ if (integrator->get_use_denoise_pass_albedo()) {
+ add_auto_pass(scene, PASS_DENOISING_ALBEDO);
+ }
+ }
+
+ /* Create passes for shadow catcher. */
+ if (scene->has_shadow_catcher()) {
+ const bool need_background = get_use_approximate_shadow_catcher() &&
+ !background->get_transparent();
+
+ add_auto_pass(scene, PASS_SHADOW_CATCHER);
+ add_auto_pass(scene, PASS_SHADOW_CATCHER_SAMPLE_COUNT);
+ add_auto_pass(scene, PASS_SHADOW_CATCHER_MATTE);
+
+ if (need_background) {
+ add_auto_pass(scene, PASS_BACKGROUND);
+ }
+ }
+ else if (Pass::contains(scene->passes, PASS_SHADOW_CATCHER)) {
+ add_auto_pass(scene, PASS_SHADOW_CATCHER);
+ add_auto_pass(scene, PASS_SHADOW_CATCHER_SAMPLE_COUNT);
+ }
+
+ const vector<Pass *> passes_immutable = scene->passes;
+ for (const Pass *pass : passes_immutable) {
+ const PassInfo info = pass->get_info();
+ /* Add utility passes needed to generate some light passes. */
+ if (info.divide_type != PASS_NONE) {
+ add_auto_pass(scene, info.divide_type);
+ }
+ if (info.direct_type != PASS_NONE) {
+ add_auto_pass(scene, info.direct_type);
+ }
+ if (info.indirect_type != PASS_NONE) {
+ add_auto_pass(scene, info.indirect_type);
+ }
+
+ /* NOTE: Enable all denoised passes when storage is requested.
+ * This way it is possible to tweak denoiser parameters later on. */
+ if (info.support_denoise && use_denoise) {
+ add_auto_pass(scene, pass->get_type(), PassMode::DENOISED);
+ }
+ }
+
+ if (bake_manager->get_baking()) {
+ add_auto_pass(scene, PASS_BAKE_PRIMITIVE, "BakePrimitive");
+ add_auto_pass(scene, PASS_BAKE_DIFFERENTIAL, "BakeDifferential");
+ }
+
+ if (add_sample_count_pass) {
+ if (!Pass::contains(scene->passes, PASS_SAMPLE_COUNT)) {
+ add_auto_pass(scene, PASS_SAMPLE_COUNT);
+ }
+ }
+
+ /* Remove duplicates and initialize internal pass info. */
+ finalize_passes(scene, use_denoise);
+ /* Flush scene updates. */
+ const bool have_uv_pass = Pass::contains(scene->passes, PASS_UV);
+ const bool have_motion_pass = Pass::contains(scene->passes, PASS_MOTION);
+ const bool have_ao_pass = Pass::contains(scene->passes, PASS_AO);
+
+ if (have_uv_pass != prev_have_uv_pass) {
+ scene->geometry_manager->tag_update(scene, GeometryManager::UV_PASS_NEEDED);
foreach (Shader *shader, scene->shaders)
shader->need_update_uvs = true;
}
- else if (Pass::contains(scene->passes, PASS_MOTION) != Pass::contains(passes_, PASS_MOTION)) {
+ if (have_motion_pass != prev_have_motion_pass) {
scene->geometry_manager->tag_update(scene, GeometryManager::MOTION_PASS_NEEDED);
}
- else if (Pass::contains(scene->passes, PASS_AO) != Pass::contains(passes_, PASS_AO)) {
+ if (have_ao_pass != prev_have_ao_pass) {
scene->integrator->tag_update(scene, Integrator::AO_PASS_MODIFIED);
}
- if (update_passes) {
- scene->passes = passes_;
+ prev_have_uv_pass = have_uv_pass;
+ prev_have_motion_pass = have_motion_pass;
+ prev_have_ao_pass = have_ao_pass;
+
+ tag_modified();
+
+ /* Debug logging. */
+ if (VLOG_IS_ON(2)) {
+ VLOG(2) << "Effective scene passes:";
+ for (const Pass *pass : scene->passes) {
+ VLOG(2) << "- " << *pass;
+ }
}
}
-int Film::get_aov_offset(Scene *scene, string name, bool &is_color)
+void Film::add_auto_pass(Scene *scene, PassType type, const char *name)
{
- int num_color = 0, num_value = 0;
- foreach (const Pass &pass, scene->passes) {
- if (pass.type == PASS_AOV_COLOR) {
- num_color++;
- }
- else if (pass.type == PASS_AOV_VALUE) {
- num_value++;
+ add_auto_pass(scene, type, PassMode::NOISY, name);
+}
+
+void Film::add_auto_pass(Scene *scene, PassType type, PassMode mode, const char *name)
+{
+ Pass *pass = new Pass();
+ pass->set_type(type);
+ pass->set_mode(mode);
+ pass->set_name(ustring((name) ? name : ""));
+ pass->is_auto_ = true;
+
+ pass->set_owner(scene);
+ scene->passes.push_back(pass);
+}
+
+void Film::remove_auto_passes(Scene *scene)
+{
+ /* Remove all passes which were automatically created. */
+ vector<Pass *> new_passes;
+
+ for (Pass *pass : scene->passes) {
+ if (!pass->is_auto_) {
+ new_passes.push_back(pass);
}
else {
- continue;
- }
-
- if (pass.name == name) {
- is_color = (pass.type == PASS_AOV_COLOR);
- return (is_color ? num_color : num_value) - 1;
+ delete pass;
}
}
- return -1;
+ scene->passes = new_passes;
}
-int Film::get_pass_stride() const
+static bool compare_pass_order(const Pass *a, const Pass *b)
{
- return pass_stride;
-}
+ const int num_components_a = a->get_info().num_components;
+ const int num_components_b = b->get_info().num_components;
-int Film::get_denoising_data_offset() const
-{
- return denoising_data_offset;
+ if (num_components_a == num_components_b) {
+ return (a->get_type() < b->get_type());
+ }
+
+ return num_components_a > num_components_b;
}
-int Film::get_denoising_clean_offset() const
+void Film::finalize_passes(Scene *scene, const bool use_denoise)
{
- return denoising_clean_offset;
+ /* Remove duplicate passes. */
+ vector<Pass *> new_passes;
+
+ for (Pass *pass : scene->passes) {
+ /* Disable denoising on passes if denoising is disabled, or if the
+ * pass does not support it. */
+ pass->set_mode((use_denoise && pass->get_info().support_denoise) ? pass->get_mode() :
+ PassMode::NOISY);
+
+ /* Merge duplicate passes. */
+ bool duplicate_found = false;
+ for (Pass *new_pass : new_passes) {
+ /* If different type or denoising, don't merge. */
+ if (new_pass->get_type() != pass->get_type() || new_pass->get_mode() != pass->get_mode()) {
+ continue;
+ }
+
+ /* If both passes have a name and the names are different, don't merge.
+ * If either pass has a name, we'll use that name. */
+ if (!pass->get_name().empty() && !new_pass->get_name().empty() &&
+ pass->get_name() != new_pass->get_name()) {
+ continue;
+ }
+
+ if (!pass->get_name().empty() && new_pass->get_name().empty()) {
+ new_pass->set_name(pass->get_name());
+ }
+
+ new_pass->is_auto_ &= pass->is_auto_;
+ duplicate_found = true;
+ break;
+ }
+
+ if (!duplicate_found) {
+ new_passes.push_back(pass);
+ }
+ else {
+ delete pass;
+ }
+ }
+
+ /* Order by components and type. This is required for AOV and cryptomatte passes,
+ * which the kernel assumes to be in order. Note this must use a stable sort so cryptomatte
+ * passes remain in the right order. */
+ stable_sort(new_passes.begin(), new_passes.end(), compare_pass_order);
+
+ scene->passes = new_passes;
}
-size_t Film::get_filter_table_offset() const
+uint Film::get_kernel_features(const Scene *scene) const
{
- return filter_table_offset;
+ uint kernel_features = 0;
+
+ for (const Pass *pass : scene->passes) {
+ if (!pass->is_written()) {
+ continue;
+ }
+
+ const PassType pass_type = pass->get_type();
+ const PassMode pass_mode = pass->get_mode();
+
+ if (pass_mode == PassMode::DENOISED || pass_type == PASS_DENOISING_NORMAL ||
+ pass_type == PASS_DENOISING_ALBEDO) {
+ kernel_features |= KERNEL_FEATURE_DENOISING;
+ }
+
+ if (pass_type != PASS_NONE && pass_type != PASS_COMBINED &&
+ pass_type <= PASS_CATEGORY_LIGHT_END) {
+ kernel_features |= KERNEL_FEATURE_LIGHT_PASSES;
+
+ if (pass_type == PASS_SHADOW) {
+ kernel_features |= KERNEL_FEATURE_SHADOW_PASS;
+ }
+ }
+
+ if (pass_type == PASS_AO) {
+ kernel_features |= KERNEL_FEATURE_NODE_RAYTRACE;
+ }
+ }
+
+ return kernel_features;
}
CCL_NAMESPACE_END
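A hedged sketch of how a host-side sync step might drive the pass machinery above; the function name and the exact call ordering are assumptions, not something this patch establishes:

/* Illustrative only: regenerate auto passes, then derive kernel features from
 * the resulting pass list. */
static void sync_render_passes_sketch(Scene *scene, const bool need_sample_count_pass)
{
  Film *film = scene->film;

  /* Recreates automatically managed passes for the current denoiser,
   * adaptive-sampling and shadow-catcher configuration, merging duplicates
   * and sorting passes by component count. */
  film->update_passes(scene, need_sample_count_pass);

  /* The effective pass list decides which kernel features are required,
   * e.g. KERNEL_FEATURE_DENOISING or KERNEL_FEATURE_LIGHT_PASSES. */
  const uint kernel_features = film->get_kernel_features(scene);
  (void)kernel_features; /* Would typically feed into kernel loading. */
}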
diff --git a/intern/cycles/render/film.h b/intern/cycles/render/film.h
index 462a7275491..5d327353361 100644
--- a/intern/cycles/render/film.h
+++ b/intern/cycles/render/film.h
@@ -17,6 +17,7 @@
#ifndef __FILM_H__
#define __FILM_H__
+#include "render/pass.h"
#include "util/util_string.h"
#include "util/util_vector.h"
@@ -38,36 +39,15 @@ typedef enum FilterType {
FILTER_NUM_TYPES,
} FilterType;
-class Pass : public Node {
- public:
- NODE_DECLARE
-
- Pass();
-
- PassType type;
- int components;
- bool filter;
- bool exposure;
- PassType divide_type;
- ustring name;
-
- static void add(PassType type, vector<Pass> &passes, const char *name = NULL);
- static bool equals(const vector<Pass> &A, const vector<Pass> &B);
- static bool contains(const vector<Pass> &passes, PassType);
-};
-
class Film : public Node {
public:
NODE_DECLARE
NODE_SOCKET_API(float, exposure)
- NODE_SOCKET_API(bool, denoising_data_pass)
- NODE_SOCKET_API(bool, denoising_clean_pass)
- NODE_SOCKET_API(bool, denoising_prefiltered_pass)
- NODE_SOCKET_API(int, denoising_flags)
NODE_SOCKET_API(float, pass_alpha_threshold)
NODE_SOCKET_API(PassType, display_pass)
+ NODE_SOCKET_API(bool, show_active_pixels)
NODE_SOCKET_API(FilterType, filter_type)
NODE_SOCKET_API(float, filter_width)
@@ -76,17 +56,18 @@ class Film : public Node {
NODE_SOCKET_API(float, mist_depth)
NODE_SOCKET_API(float, mist_falloff)
- NODE_SOCKET_API(bool, use_light_visibility)
NODE_SOCKET_API(CryptomatteType, cryptomatte_passes)
NODE_SOCKET_API(int, cryptomatte_depth)
- NODE_SOCKET_API(bool, use_adaptive_sampling)
+ /* Approximate shadow catcher pass into its matte pass, so that both artificial objects and
+ * shadows can be alpha-overed onto a backdrop. */
+ NODE_SOCKET_API(bool, use_approximate_shadow_catcher)
private:
- int pass_stride;
- int denoising_data_offset;
- int denoising_clean_offset;
- size_t filter_table_offset;
+ size_t filter_table_offset_;
+ bool prev_have_uv_pass = false;
+ bool prev_have_motion_pass = false;
+ bool prev_have_ao_pass = false;
public:
Film();
@@ -98,14 +79,20 @@ class Film : public Node {
void device_update(Device *device, DeviceScene *dscene, Scene *scene);
void device_free(Device *device, DeviceScene *dscene, Scene *scene);
- void tag_passes_update(Scene *scene, const vector<Pass> &passes_, bool update_passes = true);
-
int get_aov_offset(Scene *scene, string name, bool &is_color);
- int get_pass_stride() const;
- int get_denoising_data_offset() const;
- int get_denoising_clean_offset() const;
- size_t get_filter_table_offset() const;
+ /* Update passes so that they contain all passes required for the configured functionality.
+ *
+ * If `add_sample_count_pass` is true then the SAMPLE_COUNT pass is guaranteed to be added. */
+ void update_passes(Scene *scene, bool add_sample_count_pass);
+
+ uint get_kernel_features(const Scene *scene) const;
+
+ private:
+ void add_auto_pass(Scene *scene, PassType type, const char *name = nullptr);
+ void add_auto_pass(Scene *scene, PassType type, PassMode mode, const char *name = nullptr);
+ void remove_auto_passes(Scene *scene);
+ void finalize_passes(Scene *scene, const bool use_denoise);
};
CCL_NAMESPACE_END
diff --git a/intern/cycles/render/geometry.cpp b/intern/cycles/render/geometry.cpp
index 7ec1d2d9abb..6804a006fe6 100644
--- a/intern/cycles/render/geometry.cpp
+++ b/intern/cycles/render/geometry.cpp
@@ -215,6 +215,12 @@ void Geometry::compute_bvh(
msg += string_printf("%s %u/%u", name.c_str(), (uint)(n + 1), (uint)total);
Object object;
+
+ /* Ensure all visibility bits are set at the geometry-level BVH. It is
+ * the object-level BVH where actual visibility is tested. */
+ object.set_is_shadow_catcher(true);
+ object.set_visibility(~0);
+
object.set_geometry(this);
vector<Geometry *> geometry;
@@ -315,7 +321,7 @@ void GeometryManager::update_osl_attributes(Device *device,
{
#ifdef WITH_OSL
/* for OSL, a hash map is used to lookup the attribute by name. */
- OSLGlobals *og = (OSLGlobals *)device->osl_memory();
+ OSLGlobals *og = (OSLGlobals *)device->get_cpu_osl_memory();
og->object_name_map.clear();
og->attribute_map.clear();
@@ -1855,8 +1861,8 @@ void GeometryManager::device_update(Device *device,
});
Camera *dicing_camera = scene->dicing_camera;
- dicing_camera->set_screen_size_and_resolution(
- dicing_camera->get_full_width(), dicing_camera->get_full_height(), 1);
+ dicing_camera->set_screen_size(dicing_camera->get_full_width(),
+ dicing_camera->get_full_height());
dicing_camera->update(scene);
size_t i = 0;
@@ -2157,7 +2163,7 @@ void GeometryManager::device_free(Device *device, DeviceScene *dscene, bool forc
dscene->data.bvh.bvh_layout = BVH_LAYOUT_NONE;
#ifdef WITH_OSL
- OSLGlobals *og = (OSLGlobals *)device->osl_memory();
+ OSLGlobals *og = (OSLGlobals *)device->get_cpu_osl_memory();
if (og) {
og->object_name_map.clear();
diff --git a/intern/cycles/render/gpu_display.cpp b/intern/cycles/render/gpu_display.cpp
new file mode 100644
index 00000000000..a8f0cc50583
--- /dev/null
+++ b/intern/cycles/render/gpu_display.cpp
@@ -0,0 +1,227 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "render/gpu_display.h"
+
+#include "render/buffers.h"
+#include "util/util_logging.h"
+
+CCL_NAMESPACE_BEGIN
+
+void GPUDisplay::reset(const BufferParams &buffer_params)
+{
+ thread_scoped_lock lock(mutex_);
+
+ const GPUDisplayParams old_params = params_;
+
+ params_.offset = make_int2(buffer_params.full_x, buffer_params.full_y);
+ params_.full_size = make_int2(buffer_params.full_width, buffer_params.full_height);
+ params_.size = make_int2(buffer_params.width, buffer_params.height);
+
+ /* If the parameters did change, tag the texture as unusable. This avoids drawing old texture
+ * content in an updated configuration of the viewport, for example an old frame after the render
+ * border changed.
+ * If the parameters did not change, allow drawing the current state of the texture, which will
+ * not count as an up-to-date redraw. This avoids flickering during camera navigation by showing
+ * a previously rendered frame until the new one is ready. */
+ if (old_params.modified(params_)) {
+ texture_state_.is_usable = false;
+ }
+
+ texture_state_.is_outdated = true;
+}
+
+void GPUDisplay::mark_texture_updated()
+{
+ texture_state_.is_outdated = false;
+ texture_state_.is_usable = true;
+}
+
+/* --------------------------------------------------------------------
+ * Update procedure.
+ */
+
+bool GPUDisplay::update_begin(int texture_width, int texture_height)
+{
+ DCHECK(!update_state_.is_active);
+
+ if (update_state_.is_active) {
+ LOG(ERROR) << "Attempt to re-activate update process.";
+ return false;
+ }
+
+ /* Get parameters within a mutex lock, to avoid reset() modifying them at the same time.
+ * The update itself is non-blocking however, for better performance and to avoid
+ * potential deadlocks due to locks held by the subclass. */
+ GPUDisplayParams params;
+ {
+ thread_scoped_lock lock(mutex_);
+ params = params_;
+ texture_state_.size = make_int2(texture_width, texture_height);
+ }
+
+ if (!do_update_begin(params, texture_width, texture_height)) {
+ LOG(ERROR) << "GPUDisplay implementation could not begin update.";
+ return false;
+ }
+
+ update_state_.is_active = true;
+
+ return true;
+}
+
+void GPUDisplay::update_end()
+{
+ DCHECK(update_state_.is_active);
+
+ if (!update_state_.is_active) {
+ LOG(ERROR) << "Attempt to deactivate inactive update process.";
+ return;
+ }
+
+ do_update_end();
+
+ update_state_.is_active = false;
+}
+
+int2 GPUDisplay::get_texture_size() const
+{
+ return texture_state_.size;
+}
+
+/* --------------------------------------------------------------------
+ * Texture update from CPU buffer.
+ */
+
+void GPUDisplay::copy_pixels_to_texture(
+ const half4 *rgba_pixels, int texture_x, int texture_y, int pixels_width, int pixels_height)
+{
+ DCHECK(update_state_.is_active);
+
+ if (!update_state_.is_active) {
+ LOG(ERROR) << "Attempt to copy pixels data outside of GPUDisplay update.";
+ return;
+ }
+
+ mark_texture_updated();
+ do_copy_pixels_to_texture(rgba_pixels, texture_x, texture_y, pixels_width, pixels_height);
+}
+
+/* --------------------------------------------------------------------
+ * Texture buffer mapping.
+ */
+
+half4 *GPUDisplay::map_texture_buffer()
+{
+ DCHECK(!texture_buffer_state_.is_mapped);
+ DCHECK(update_state_.is_active);
+
+ if (texture_buffer_state_.is_mapped) {
+ LOG(ERROR) << "Attempt to re-map an already mapped texture buffer.";
+ return nullptr;
+ }
+
+ if (!update_state_.is_active) {
+ LOG(ERROR) << "Attempt to copy pixels data outside of GPUDisplay update.";
+ return nullptr;
+ }
+
+ half4 *mapped_rgba_pixels = do_map_texture_buffer();
+
+ if (mapped_rgba_pixels) {
+ texture_buffer_state_.is_mapped = true;
+ }
+
+ return mapped_rgba_pixels;
+}
+
+void GPUDisplay::unmap_texture_buffer()
+{
+ DCHECK(texture_buffer_state_.is_mapped);
+
+ if (!texture_buffer_state_.is_mapped) {
+ LOG(ERROR) << "Attempt to unmap non-mapped texture buffer.";
+ return;
+ }
+
+ texture_buffer_state_.is_mapped = false;
+
+ mark_texture_updated();
+ do_unmap_texture_buffer();
+}
+
+/* --------------------------------------------------------------------
+ * Graphics interoperability.
+ */
+
+DeviceGraphicsInteropDestination GPUDisplay::graphics_interop_get()
+{
+ DCHECK(!texture_buffer_state_.is_mapped);
+ DCHECK(update_state_.is_active);
+
+ if (texture_buffer_state_.is_mapped) {
+ LOG(ERROR)
+ << "Attempt to use graphics interoperability mode while the texture buffer is mapped.";
+ return DeviceGraphicsInteropDestination();
+ }
+
+ if (!update_state_.is_active) {
+ LOG(ERROR) << "Attempt to use graphics interoperability outside of GPUDisplay update.";
+ return DeviceGraphicsInteropDestination();
+ }
+
+ /* Assume that interop will write new values to the texture. */
+ mark_texture_updated();
+
+ return do_graphics_interop_get();
+}
+
+void GPUDisplay::graphics_interop_activate()
+{
+}
+
+void GPUDisplay::graphics_interop_deactivate()
+{
+}
+
+/* --------------------------------------------------------------------
+ * Drawing.
+ */
+
+bool GPUDisplay::draw()
+{
+ /* Get parameters within a mutex lock, to avoid reset() modifying them at the same time.
+ * The drawing itself is non-blocking however, for better performance and to avoid
+ * potential deadlocks due to locks held by the subclass. */
+ GPUDisplayParams params;
+ bool is_usable;
+ bool is_outdated;
+
+ {
+ thread_scoped_lock lock(mutex_);
+ params = params_;
+ is_usable = texture_state_.is_usable;
+ is_outdated = texture_state_.is_outdated;
+ }
+
+ if (is_usable) {
+ do_draw(params);
+ }
+
+ return !is_outdated;
+}
+
+CCL_NAMESPACE_END
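A minimal usage sketch of the update procedure implemented above, assuming a concrete GPUDisplay subclass and a CPU-side buffer of half4 pixels; both are assumptions for illustration:

/* Illustrative only: push a block of rendered pixels into the display. */
static void update_display_sketch(GPUDisplay *display,
                                  const half4 *pixels,
                                  const int width,
                                  const int height)
{
  /* If the update can not begin (for example, no GPU resources are available
   * yet), there is nothing to do and update_end() must not be called. */
  if (!display->update_begin(width, height)) {
    return;
  }

  /* Copy from a CPU-side buffer; mapping the texture buffer or graphics
   * interoperability are the alternative update paths. */
  display->copy_pixels_to_texture(pixels, 0, 0, width, height);

  display->update_end();
}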
diff --git a/intern/cycles/render/gpu_display.h b/intern/cycles/render/gpu_display.h
new file mode 100644
index 00000000000..a01348d28d5
--- /dev/null
+++ b/intern/cycles/render/gpu_display.h
@@ -0,0 +1,247 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "device/device_graphics_interop.h"
+#include "util/util_half.h"
+#include "util/util_thread.h"
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BufferParams;
+
+/* The GPUDisplay class takes care of drawing the render result in a viewport. The render result
+ * is stored in a GPU-side texture, which is updated from a path tracer and drawn by an
+ * application.
+ *
+ * The base GPUDisplay does some special texture state tracking, which allows the render Session
+ * to decide whether a reset for an updated state is possible. This state should only be tracked
+ * in the base class, and a particular implementation should not worry about it.
+ *
+ * Subclasses should only implement the pure virtual methods; this frees them from parent method
+ * calls and helps keep them as small and reliable as possible. */
+
+class GPUDisplayParams {
+ public:
+ /* Offset of the display within a viewport.
+ * For example, set to a lower-bottom corner of border render in Blender's viewport. */
+ int2 offset = make_int2(0, 0);
+
+ /* Full viewport size.
+ *
+ * NOTE: Is not affected by the resolution divider. */
+ int2 full_size = make_int2(0, 0);
+
+ /* Effective viewport size.
+ * In the case of border render, size of the border rectangle.
+ *
+ * NOTE: Is not affected by the resolution divider. */
+ int2 size = make_int2(0, 0);
+
+ bool modified(const GPUDisplayParams &other) const
+ {
+ return !(offset == other.offset && full_size == other.full_size && size == other.size);
+ }
+};
+
+class GPUDisplay {
+ public:
+ GPUDisplay() = default;
+ virtual ~GPUDisplay() = default;
+
+ /* Reset the display for the new state of the render session. Called whenever the session is
+ * reset, which happens on changes like viewport navigation or viewport dimension change.
+ *
+ * This call will configure parameters for a changed buffer and reset the texture state. */
+ void reset(const BufferParams &buffer_params);
+
+ const GPUDisplayParams &get_params() const
+ {
+ return params_;
+ }
+
+ /* --------------------------------------------------------------------
+ * Update procedure.
+ *
+ * These calls indicate the caller's desire to update the content of the displayed texture. */
+
+ /* Returns true when update is ready. Update should be finished with update_end().
+ *
+ * If false is returned then no update is possible, and no update_end() call is needed.
+ *
+ * The texture width and height denote the actual resolution of the underlying render result. */
+ bool update_begin(int texture_width, int texture_height);
+
+ void update_end();
+
+ /* Get the currently configured texture size of the display (as configured by `update_begin()`). */
+ int2 get_texture_size() const;
+
+ /* --------------------------------------------------------------------
+ * Texture update from CPU buffer.
+ *
+ * NOTE: The GPUDisplay should be marked as having an update in progress with `update_begin()`.
+ *
+ * This is the most portable implementation and must be supported by all platforms, though it
+ * might not be the most efficient one.
+ */
+
+ /* Copy a buffer of rendered pixels of a given size into a given position of the texture.
+ *
+ * This function does not acquire a lock. The reason for this is to allow this function to be
+ * used for partial updates from different devices. In this case the caller will acquire the
+ * lock once, update all the slices and release the lock once. This ensures that draw() will
+ * never use a partially updated texture. */
+ void copy_pixels_to_texture(
+ const half4 *rgba_pixels, int texture_x, int texture_y, int pixels_width, int pixels_height);
+
+ /* --------------------------------------------------------------------
+ * Texture buffer mapping.
+ *
+ * This functionality is used to update GPU-side texture content without the need to maintain a
+ * CPU-side buffer on the caller's side.
+ *
+ * NOTE: The GPUDisplay should be marked as having an update in progress with `update_begin()`.
+ *
+ * NOTE: Texture buffer can not be mapped while graphics interoperability is active. This means
+ * that `map_texture_buffer()` is not allowed between `graphics_interop_begin()` and
+ * `graphics_interop_end()` calls.
+ */
+
+ /* Map pixel memory from the texture to a buffer available for write from the CPU. Width and
+ * height define the requested size of the texture to write to.
+ * Upon success a non-null pointer is returned and the texture buffer is to be unmapped.
+ * If an error happens during mapping, or if mapping is not supported by this GPU display, a
+ * null pointer is returned and the buffer is NOT to be unmapped.
+ *
+ * NOTE: Usually the implementation will rely on a GPU context of some sort, and a GPU context
+ * often can not be bound to two threads simultaneously and can not be released from a
+ * different thread. This means that the mapping API should be used from a single thread only.
+ */
+ half4 *map_texture_buffer();
+ void unmap_texture_buffer();
+
+ /* --------------------------------------------------------------------
+ * Graphics interoperability.
+ *
+ * A special code path which allows updating texture content directly from the GPU compute
+ * device. Complementary part of DeviceGraphicsInterop.
+ *
+ * NOTE: Graphics interoperability can not be used while the texture buffer is mapped. This means
+ * that `graphics_interop_get()` is not allowed between `map_texture_buffer()` and
+ * `unmap_texture_buffer()` calls. */
+
+ /* Get GPUDisplay graphics interoperability information which acts as a destination for the
+ * device API. */
+ DeviceGraphicsInteropDestination graphics_interop_get();
+
+ /* (De)activate GPU display for graphics interoperability outside of regular display update
+ * routines. */
+ virtual void graphics_interop_activate();
+ virtual void graphics_interop_deactivate();
+
+ /* --------------------------------------------------------------------
+ * Drawing.
+ */
+
+ /* Clear the texture by filling it with all zeroes.
+ *
+ * This call might happen in parallel with draw, but can never happen in parallel with the
+ * update.
+ *
+ * The actual zeroing can be deferred to a later moment. What is important is that after clear
+ * and before the pixels update the drawing texture will be fully empty, and that a partial
+ * update after clear will write new pixel values only for the updated area, leaving everything
+ * else zeroed.
+ *
+ * If the GPU display supports graphics interoperability then zeroing the display is to be
+ * delegated to the device via the `DeviceGraphicsInteropDestination`. */
+ virtual void clear() = 0;
+
+ /* Draw the current state of the texture.
+ *
+ * Returns true if this call did draw an updated state of the texture. */
+ bool draw();
+
+ protected:
+ /* Implementation-specific calls which subclasses are to implement.
+ * Each `do_foo()` method corresponds to its public `foo()` counterpart, but is pure virtual to
+ * simplify the particular implementation. */
+ virtual bool do_update_begin(const GPUDisplayParams &params,
+ int texture_width,
+ int texture_height) = 0;
+ virtual void do_update_end() = 0;
+
+ virtual void do_copy_pixels_to_texture(const half4 *rgba_pixels,
+ int texture_x,
+ int texture_y,
+ int pixels_width,
+ int pixels_height) = 0;
+
+ virtual half4 *do_map_texture_buffer() = 0;
+ virtual void do_unmap_texture_buffer() = 0;
+
+ /* Note that this might be called in parallel to do_update_begin() and do_update_end();
+ * the subclass is responsible for appropriate mutex locks to avoid multiple threads
+ * editing and drawing the texture at the same time. */
+ virtual void do_draw(const GPUDisplayParams &params) = 0;
+
+ virtual DeviceGraphicsInteropDestination do_graphics_interop_get() = 0;
+
+ private:
+ thread_mutex mutex_;
+ GPUDisplayParams params_;
+
+ /* Mark the texture content as updated.
+ * Used from places which know that the texture content has been brought up-to-date, so that
+ * drawing knows whether it can be performed, and whether it happened with an up-to-date
+ * texture state. */
+ void mark_texture_updated();
+
+ /* State of the update process. */
+ struct {
+ /* True when update is in process, indicated by `update_begin()` / `update_end()`. */
+ bool is_active = false;
+ } update_state_;
+
+ /* State of the texture, which is needed for an integration with render session and interactive
+ * updates and navigation. */
+ struct {
+ /* Denotes whether a possibly existing state of the GPU-side texture is still usable.
+ * It will not be usable when, for example, the render border has changed (in this case we don't
+ * want the previous texture to be drawn at all).
+ *
+ * However, if only navigation or an object in the scene has changed, then the outdated state of
+ * the texture is still usable for drawing, preventing viewport flickering on navigation and
+ * object modifications. */
+ bool is_usable = false;
+
+ /* Texture is considered outdated after `reset()` until the next call of
+ * `copy_pixels_to_texture()`. */
+ bool is_outdated = true;
+
+ /* Texture size in pixels. */
+ int2 size = make_int2(0, 0);
+ } texture_state_;
+
+ /* State of the texture buffer. Is tracked to perform sanity checks. */
+ struct {
+ /* True when the texture buffer is mapped with `map_texture_buffer()`. */
+ bool is_mapped = false;
+ } texture_buffer_state_;
+};
+
+CCL_NAMESPACE_END
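
The comments above describe a fairly strict calling protocol. As a rough caller-side sketch (not
part of this patch; `display` stands for any concrete GPUDisplay subclass, and `rgba_pixels`,
`width` and `height` for an already rendered half4 buffer and its size), the intended flow is:

  display.reset(buffer_params);      /* Reconfigure after e.g. navigation or a resize. */

  if (display.update_begin(width, height)) {
    /* Partial updates are allowed; here the whole texture is written in one call. */
    display.copy_pixels_to_texture(rgba_pixels, 0, 0, width, height);
    display.update_end();
  }

  /* Drawing may run concurrently with updates; returns true if an up-to-date texture was drawn. */
  display.draw();
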
diff --git a/intern/cycles/render/graph.h b/intern/cycles/render/graph.h
index 5102b182593..3584754fad1 100644
--- a/intern/cycles/render/graph.h
+++ b/intern/cycles/render/graph.h
@@ -224,10 +224,6 @@ class ShaderNode : public Node {
{
return false;
}
- virtual bool has_raytrace()
- {
- return false;
- }
vector<ShaderInput *> inputs;
vector<ShaderOutput *> outputs;
@@ -242,22 +238,13 @@ class ShaderNode : public Node {
* that those functions are for selective compilation only?
*/
- /* Nodes are split into several groups, group of level 0 contains
- * nodes which are most commonly used, further levels are extension
- * of previous one and includes less commonly used nodes.
- */
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_0;
- }
-
/* Node feature are used to disable huge nodes inside the group,
* so it's possible to disable huge nodes inside of the required
* nodes group.
*/
virtual int get_feature()
{
- return bump == SHADER_BUMP_NONE ? 0 : NODE_FEATURE_BUMP;
+ return bump == SHADER_BUMP_NONE ? 0 : KERNEL_FEATURE_NODE_BUMP;
}
/* Get closure ID to which the node compiles into. */
diff --git a/intern/cycles/render/integrator.cpp b/intern/cycles/render/integrator.cpp
index d8749cec9fa..d74d14242bb 100644
--- a/intern/cycles/render/integrator.cpp
+++ b/intern/cycles/render/integrator.cpp
@@ -53,6 +53,8 @@ NODE_DEFINE(Integrator)
SOCKET_INT(transparent_max_bounce, "Transparent Max Bounce", 7);
SOCKET_INT(ao_bounces, "AO Bounces", 0);
+ SOCKET_FLOAT(ao_factor, "AO Factor", 0.0f);
+ SOCKET_FLOAT(ao_distance, "AO Distance", FLT_MAX);
SOCKET_INT(volume_max_steps, "Volume Max Steps", 1024);
SOCKET_FLOAT(volume_step_rate, "Volume Step Rate", 1.0f);
@@ -66,33 +68,39 @@ NODE_DEFINE(Integrator)
SOCKET_BOOLEAN(motion_blur, "Motion Blur", false);
SOCKET_INT(aa_samples, "AA Samples", 0);
- SOCKET_INT(diffuse_samples, "Diffuse Samples", 1);
- SOCKET_INT(glossy_samples, "Glossy Samples", 1);
- SOCKET_INT(transmission_samples, "Transmission Samples", 1);
- SOCKET_INT(ao_samples, "AO Samples", 1);
- SOCKET_INT(mesh_light_samples, "Mesh Light Samples", 1);
- SOCKET_INT(subsurface_samples, "Subsurface Samples", 1);
- SOCKET_INT(volume_samples, "Volume Samples", 1);
SOCKET_INT(start_sample, "Start Sample", 0);
+ SOCKET_BOOLEAN(use_adaptive_sampling, "Use Adaptive Sampling", false);
SOCKET_FLOAT(adaptive_threshold, "Adaptive Threshold", 0.0f);
SOCKET_INT(adaptive_min_samples, "Adaptive Min Samples", 0);
- SOCKET_BOOLEAN(sample_all_lights_direct, "Sample All Lights Direct", true);
- SOCKET_BOOLEAN(sample_all_lights_indirect, "Sample All Lights Indirect", true);
SOCKET_FLOAT(light_sampling_threshold, "Light Sampling Threshold", 0.05f);
- static NodeEnum method_enum;
- method_enum.insert("path", PATH);
- method_enum.insert("branched_path", BRANCHED_PATH);
- SOCKET_ENUM(method, "Method", method_enum, PATH);
-
static NodeEnum sampling_pattern_enum;
sampling_pattern_enum.insert("sobol", SAMPLING_PATTERN_SOBOL);
- sampling_pattern_enum.insert("cmj", SAMPLING_PATTERN_CMJ);
sampling_pattern_enum.insert("pmj", SAMPLING_PATTERN_PMJ);
SOCKET_ENUM(sampling_pattern, "Sampling Pattern", sampling_pattern_enum, SAMPLING_PATTERN_SOBOL);
+ static NodeEnum denoiser_type_enum;
+ denoiser_type_enum.insert("optix", DENOISER_OPTIX);
+ denoiser_type_enum.insert("openimagedenoise", DENOISER_OPENIMAGEDENOISE);
+
+ static NodeEnum denoiser_prefilter_enum;
+ denoiser_prefilter_enum.insert("none", DENOISER_PREFILTER_NONE);
+ denoiser_prefilter_enum.insert("fast", DENOISER_PREFILTER_FAST);
+ denoiser_prefilter_enum.insert("accurate", DENOISER_PREFILTER_ACCURATE);
+
+ /* Default to accurate denoising with OpenImageDenoise. For the interactive viewport
+ * it's best to use OptiX and disable the normal pass, since it does not always have
+ * the desired effect for that denoiser. */
+ SOCKET_BOOLEAN(use_denoise, "Use Denoiser", false);
+ SOCKET_ENUM(denoiser_type, "Denoiser Type", denoiser_type_enum, DENOISER_OPENIMAGEDENOISE);
+ SOCKET_INT(denoise_start_sample, "Start Sample to Denoise", 0);
+ SOCKET_BOOLEAN(use_denoise_pass_albedo, "Use Albedo Pass for Denoiser", true);
+ SOCKET_BOOLEAN(use_denoise_pass_normal, "Use Normal Pass for Denoiser", true);
+ SOCKET_ENUM(
+ denoiser_prefilter, "Denoiser Prefilter", denoiser_prefilter_enum, DENOISER_PREFILTER_ACCURATE);
+
return type;
}
@@ -115,13 +123,20 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
}
});
- const bool need_update_lut = ao_samples_is_modified() || diffuse_samples_is_modified() ||
- glossy_samples_is_modified() || max_bounce_is_modified() ||
- max_transmission_bounce_is_modified() ||
- mesh_light_samples_is_modified() || method_is_modified() ||
- sampling_pattern_is_modified() ||
- subsurface_samples_is_modified() ||
- transmission_samples_is_modified() || volume_samples_is_modified();
+ KernelIntegrator *kintegrator = &dscene->data.integrator;
+
+ /* Adaptive sampling requires PMJ samples.
+ *
+ * This also makes detection of the sampling pattern a bit more involved: we can not rely on the
+ * changed state of the socket, since its value might differ from the effective value used here.
+ * Instead, compare with the previous value in the KernelIntegrator. Only do so if the device was
+ * updated at least once (in which case `sample_pattern_lut` is allocated to a non-zero size). */
+ const SamplingPattern new_sampling_pattern = (use_adaptive_sampling) ? SAMPLING_PATTERN_PMJ :
+ sampling_pattern;
+
+ const bool need_update_lut = max_bounce_is_modified() || max_transmission_bounce_is_modified() ||
+ dscene->sample_pattern_lut.size() == 0 ||
+ kintegrator->sampling_pattern != new_sampling_pattern;
if (need_update_lut) {
dscene->sample_pattern_lut.tag_realloc();
@@ -129,8 +144,6 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
device_free(device, dscene);
- KernelIntegrator *kintegrator = &dscene->data.integrator;
-
/* integrator parameters */
kintegrator->min_bounce = min_bounce + 1;
kintegrator->max_bounce = max_bounce + 1;
@@ -143,12 +156,9 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
kintegrator->transparent_min_bounce = transparent_min_bounce + 1;
kintegrator->transparent_max_bounce = transparent_max_bounce + 1;
- if (ao_bounces == 0) {
- kintegrator->ao_bounces = INT_MAX;
- }
- else {
- kintegrator->ao_bounces = ao_bounces - 1;
- }
+ kintegrator->ao_bounces = ao_bounces;
+ kintegrator->ao_bounces_distance = ao_distance;
+ kintegrator->ao_bounces_factor = ao_factor;
/* Transparent Shadows
* We only need to enable transparent shadows, if we actually have
@@ -171,10 +181,7 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
kintegrator->caustics_refractive = caustics_refractive;
kintegrator->filter_glossy = (filter_glossy == 0.0f) ? FLT_MAX : 1.0f / filter_glossy;
- kintegrator->seed = hash_uint2(seed, 0);
-
- kintegrator->use_ambient_occlusion = ((Pass::contains(scene->passes, PASS_AO)) ||
- dscene->data.background.ao_factor != 0.0f);
+ kintegrator->seed = seed;
kintegrator->sample_clamp_direct = (sample_clamp_direct == 0.0f) ? FLT_MAX :
sample_clamp_direct * 3.0f;
@@ -182,51 +189,7 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
FLT_MAX :
sample_clamp_indirect * 3.0f;
- kintegrator->branched = (method == BRANCHED_PATH) && device->info.has_branched_path;
- kintegrator->volume_decoupled = device->info.has_volume_decoupled;
- kintegrator->diffuse_samples = diffuse_samples;
- kintegrator->glossy_samples = glossy_samples;
- kintegrator->transmission_samples = transmission_samples;
- kintegrator->ao_samples = ao_samples;
- kintegrator->mesh_light_samples = mesh_light_samples;
- kintegrator->subsurface_samples = subsurface_samples;
- kintegrator->volume_samples = volume_samples;
- kintegrator->start_sample = start_sample;
-
- if (kintegrator->branched) {
- kintegrator->sample_all_lights_direct = sample_all_lights_direct;
- kintegrator->sample_all_lights_indirect = sample_all_lights_indirect;
- }
- else {
- kintegrator->sample_all_lights_direct = false;
- kintegrator->sample_all_lights_indirect = false;
- }
-
- kintegrator->sampling_pattern = sampling_pattern;
- kintegrator->aa_samples = aa_samples;
- if (aa_samples > 0 && adaptive_min_samples == 0) {
- kintegrator->adaptive_min_samples = max(4, (int)sqrtf(aa_samples));
- VLOG(1) << "Cycles adaptive sampling: automatic min samples = "
- << kintegrator->adaptive_min_samples;
- }
- else {
- kintegrator->adaptive_min_samples = max(4, adaptive_min_samples);
- }
-
- kintegrator->adaptive_step = 4;
- kintegrator->adaptive_stop_per_sample = device->info.has_adaptive_stop_per_sample;
-
- /* Adaptive step must be a power of two for bitwise operations to work. */
- assert((kintegrator->adaptive_step & (kintegrator->adaptive_step - 1)) == 0);
-
- if (aa_samples > 0 && adaptive_threshold == 0.0f) {
- kintegrator->adaptive_threshold = max(0.001f, 1.0f / (float)aa_samples);
- VLOG(1) << "Cycles adaptive sampling: automatic threshold = "
- << kintegrator->adaptive_threshold;
- }
- else {
- kintegrator->adaptive_threshold = adaptive_threshold;
- }
+ kintegrator->sampling_pattern = new_sampling_pattern;
if (light_sampling_threshold > 0.0f) {
kintegrator->light_inv_rr_threshold = 1.0f / light_sampling_threshold;
@@ -236,29 +199,15 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
}
/* sobol directions table */
- int max_samples = 1;
-
- if (kintegrator->branched) {
- foreach (Light *light, scene->lights)
- max_samples = max(max_samples, light->get_samples());
-
- max_samples = max(max_samples,
- max(diffuse_samples, max(glossy_samples, transmission_samples)));
- max_samples = max(max_samples, max(ao_samples, max(mesh_light_samples, subsurface_samples)));
- max_samples = max(max_samples, volume_samples);
- }
-
- uint total_bounces = max_bounce + transparent_max_bounce + 3 + VOLUME_BOUNDS_MAX +
- max(BSSRDF_MAX_HITS, BSSRDF_MAX_BOUNCES);
-
- max_samples *= total_bounces;
+ int max_samples = max_bounce + transparent_max_bounce + 3 + VOLUME_BOUNDS_MAX +
+ max(BSSRDF_MAX_HITS, BSSRDF_MAX_BOUNCES);
int dimensions = PRNG_BASE_NUM + max_samples * PRNG_BOUNCE_NUM;
dimensions = min(dimensions, SOBOL_MAX_DIMENSIONS);
if (need_update_lut) {
- if (sampling_pattern == SAMPLING_PATTERN_SOBOL) {
- uint *directions = dscene->sample_pattern_lut.alloc(SOBOL_BITS * dimensions);
+ if (kintegrator->sampling_pattern == SAMPLING_PATTERN_SOBOL) {
+ uint *directions = (uint *)dscene->sample_pattern_lut.alloc(SOBOL_BITS * dimensions);
sobol_generate_direction_vectors((uint(*)[SOBOL_BITS])directions, dimensions);
@@ -276,10 +225,13 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
function_bind(&progressive_multi_jitter_02_generate_2D, sequence, sequence_size, j));
}
pool.wait_work();
+
dscene->sample_pattern_lut.copy_to_device();
}
}
+ kintegrator->has_shadow_catcher = scene->has_shadow_catcher();
+
dscene->sample_pattern_lut.clear_modified();
clear_modified();
}
@@ -295,17 +247,12 @@ void Integrator::tag_update(Scene *scene, uint32_t flag)
tag_modified();
}
- if (flag & (AO_PASS_MODIFIED | BACKGROUND_AO_MODIFIED)) {
+ if (flag & AO_PASS_MODIFIED) {
/* tag only the ao_bounces socket as modified so we avoid updating sample_pattern_lut
* unnecessarily */
tag_ao_bounces_modified();
}
- if ((flag & LIGHT_SAMPLES_MODIFIED) && (method == BRANCHED_PATH)) {
- /* the number of light samples may affect the size of the sample_pattern_lut */
- tag_sampling_pattern_modified();
- }
-
if (filter_glossy_is_modified()) {
foreach (Shader *shader, scene->shaders) {
if (shader->has_integrator_dependency) {
@@ -321,4 +268,65 @@ void Integrator::tag_update(Scene *scene, uint32_t flag)
}
}
+AdaptiveSampling Integrator::get_adaptive_sampling() const
+{
+ AdaptiveSampling adaptive_sampling;
+
+ adaptive_sampling.use = use_adaptive_sampling;
+
+ if (!adaptive_sampling.use) {
+ return adaptive_sampling;
+ }
+
+ if (aa_samples > 0 && adaptive_threshold == 0.0f) {
+ adaptive_sampling.threshold = max(0.001f, 1.0f / (float)aa_samples);
+ VLOG(1) << "Cycles adaptive sampling: automatic threshold = " << adaptive_sampling.threshold;
+ }
+ else {
+ adaptive_sampling.threshold = adaptive_threshold;
+ }
+
+ if (adaptive_sampling.threshold > 0 && adaptive_min_samples == 0) {
+ /* Threshold 0.1 -> 32, 0.01 -> 64, 0.001 -> 128.
+ * This is highly scene dependent; we make a guess that seemed to work well
+ * in various test scenes. */
+ const int min_samples = (int)ceilf(16.0f / powf(adaptive_sampling.threshold, 0.3f));
+ adaptive_sampling.min_samples = max(4, min_samples);
+ VLOG(1) << "Cycles adaptive sampling: automatic min samples = "
+ << adaptive_sampling.min_samples;
+ }
+ else {
+ adaptive_sampling.min_samples = max(4, adaptive_min_samples);
+ }
+
+ /* Arbitrary factor that makes the threshold more similar to what it was before,
+ * and gives arguably more intuitive values. */
+ adaptive_sampling.threshold *= 5.0f;
+
+ adaptive_sampling.adaptive_step = 16;
+
+ DCHECK(is_power_of_two(adaptive_sampling.adaptive_step))
+ << "Adaptive step must be a power of two for bitwise operations to work";
+
+ return adaptive_sampling;
+}
+
+DenoiseParams Integrator::get_denoise_params() const
+{
+ DenoiseParams denoise_params;
+
+ denoise_params.use = use_denoise;
+
+ denoise_params.type = denoiser_type;
+
+ denoise_params.start_sample = denoise_start_sample;
+
+ denoise_params.use_pass_albedo = use_denoise_pass_albedo;
+ denoise_params.use_pass_normal = use_denoise_pass_normal;
+
+ denoise_params.prefilter = denoiser_prefilter;
+
+ return denoise_params;
+}
+
CCL_NAMESPACE_END
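
The automatic minimum-sample heuristic in Integrator::get_adaptive_sampling() can be checked in
isolation; a small standalone C++ snippet (not part of this patch) reproduces the values quoted in
the comment, i.e. threshold 0.1 -> 32, 0.01 -> 64 and 0.001 -> 128:

  #include <algorithm>
  #include <cmath>
  #include <cstdio>

  int main()
  {
    for (float threshold : {0.1f, 0.01f, 0.001f}) {
      /* Same formula as used in Integrator::get_adaptive_sampling(). */
      const int min_samples = std::max(4, (int)std::ceil(16.0f / std::pow(threshold, 0.3f)));
      std::printf("threshold %.3f -> min samples %d\n", threshold, min_samples);
    }
    return 0;
  }
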
diff --git a/intern/cycles/render/integrator.h b/intern/cycles/render/integrator.h
index 4eeeda92d41..32e108d62ca 100644
--- a/intern/cycles/render/integrator.h
+++ b/intern/cycles/render/integrator.h
@@ -19,7 +19,9 @@
#include "kernel/kernel_types.h"
+#include "device/device_denoise.h" /* For the paramaters and type enum. */
#include "graph/node.h"
+#include "integrator/adaptive_sampling.h"
CCL_NAMESPACE_BEGIN
@@ -43,6 +45,8 @@ class Integrator : public Node {
NODE_SOCKET_API(int, transparent_max_bounce)
NODE_SOCKET_API(int, ao_bounces)
+ NODE_SOCKET_API(float, ao_factor)
+ NODE_SOCKET_API(float, ao_distance)
NODE_SOCKET_API(int, volume_max_steps)
NODE_SOCKET_API(float, volume_step_rate)
@@ -62,37 +66,26 @@ class Integrator : public Node {
static const int MAX_SAMPLES = (1 << 24);
NODE_SOCKET_API(int, aa_samples)
- NODE_SOCKET_API(int, diffuse_samples)
- NODE_SOCKET_API(int, glossy_samples)
- NODE_SOCKET_API(int, transmission_samples)
- NODE_SOCKET_API(int, ao_samples)
- NODE_SOCKET_API(int, mesh_light_samples)
- NODE_SOCKET_API(int, subsurface_samples)
- NODE_SOCKET_API(int, volume_samples)
NODE_SOCKET_API(int, start_sample)
- NODE_SOCKET_API(bool, sample_all_lights_direct)
- NODE_SOCKET_API(bool, sample_all_lights_indirect)
NODE_SOCKET_API(float, light_sampling_threshold)
+ NODE_SOCKET_API(bool, use_adaptive_sampling)
NODE_SOCKET_API(int, adaptive_min_samples)
NODE_SOCKET_API(float, adaptive_threshold)
- enum Method {
- BRANCHED_PATH = 0,
- PATH = 1,
-
- NUM_METHODS,
- };
-
- NODE_SOCKET_API(Method, method)
-
NODE_SOCKET_API(SamplingPattern, sampling_pattern)
+ NODE_SOCKET_API(bool, use_denoise);
+ NODE_SOCKET_API(DenoiserType, denoiser_type);
+ NODE_SOCKET_API(int, denoise_start_sample);
+ NODE_SOCKET_API(bool, use_denoise_pass_albedo);
+ NODE_SOCKET_API(bool, use_denoise_pass_normal);
+ NODE_SOCKET_API(DenoiserPrefilter, denoiser_prefilter);
+
enum : uint32_t {
AO_PASS_MODIFIED = (1 << 0),
- BACKGROUND_AO_MODIFIED = (1 << 1),
- LIGHT_SAMPLES_MODIFIED = (1 << 2),
+ OBJECT_MANAGER = (1 << 1),
/* tag everything in the manager for an update */
UPDATE_ALL = ~0u,
@@ -107,6 +100,9 @@ class Integrator : public Node {
void device_free(Device *device, DeviceScene *dscene, bool force_free = false);
void tag_update(Scene *scene, uint32_t flag);
+
+ AdaptiveSampling get_adaptive_sampling() const;
+ DenoiseParams get_denoise_params() const;
};
CCL_NAMESPACE_END
diff --git a/intern/cycles/render/jitter.cpp b/intern/cycles/render/jitter.cpp
index fc47b0e8f0a..e31f8abd446 100644
--- a/intern/cycles/render/jitter.cpp
+++ b/intern/cycles/render/jitter.cpp
@@ -242,12 +242,6 @@ class PMJ02_Generator : public PMJ_Generator {
static void shuffle(float2 points[], int size, int rng_seed)
{
- /* Offset samples by 1.0 for faster scrambling in kernel_random.h */
- for (int i = 0; i < size; ++i) {
- points[i].x += 1.0f;
- points[i].y += 1.0f;
- }
-
if (rng_seed == 0) {
return;
}
diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp
index 15aa4e047b5..ae1150fc07b 100644
--- a/intern/cycles/render/light.cpp
+++ b/intern/cycles/render/light.cpp
@@ -14,12 +14,13 @@
* limitations under the License.
*/
-#include "render/light.h"
#include "device/device.h"
+
#include "render/background.h"
#include "render/film.h"
#include "render/graph.h"
#include "render/integrator.h"
+#include "render/light.h"
#include "render/mesh.h"
#include "render/nodes.h"
#include "render/object.h"
@@ -27,6 +28,8 @@
#include "render/shader.h"
#include "render/stats.h"
+#include "integrator/shader_eval.h"
+
#include "util/util_foreach.h"
#include "util/util_hash.h"
#include "util/util_logging.h"
@@ -43,63 +46,49 @@ static void shade_background_pixels(Device *device,
vector<float3> &pixels,
Progress &progress)
{
- /* create input */
- device_vector<uint4> d_input(device, "background_input", MEM_READ_ONLY);
- device_vector<float4> d_output(device, "background_output", MEM_READ_WRITE);
-
- uint4 *d_input_data = d_input.alloc(width * height);
-
- for (int y = 0; y < height; y++) {
- for (int x = 0; x < width; x++) {
- float u = (x + 0.5f) / width;
- float v = (y + 0.5f) / height;
-
- uint4 in = make_uint4(__float_as_int(u), __float_as_int(v), 0, 0);
- d_input_data[x + y * width] = in;
- }
- }
-
- /* compute on device */
- d_output.alloc(width * height);
- d_output.zero_to_device();
- d_input.copy_to_device();
-
+ /* Needs to be up to date for attribute access. */
device->const_copy_to("__data", &dscene->data, sizeof(dscene->data));
- DeviceTask main_task(DeviceTask::SHADER);
- main_task.shader_input = d_input.device_pointer;
- main_task.shader_output = d_output.device_pointer;
- main_task.shader_eval_type = SHADER_EVAL_BACKGROUND;
- main_task.shader_x = 0;
- main_task.shader_w = width * height;
- main_task.num_samples = 1;
- main_task.get_cancel = function_bind(&Progress::get_cancel, &progress);
-
- /* disabled splitting for now, there's an issue with multi-GPU mem_copy_from */
- list<DeviceTask> split_tasks;
- main_task.split(split_tasks, 1, 128 * 128);
-
- foreach (DeviceTask &task, split_tasks) {
- device->task_add(task);
- device->task_wait();
- d_output.copy_from_device(task.shader_x, 1, task.shader_w);
- }
-
- d_input.free();
-
- float4 *d_output_data = d_output.data();
-
- pixels.resize(width * height);
-
- for (int y = 0; y < height; y++) {
- for (int x = 0; x < width; x++) {
- pixels[y * width + x].x = d_output_data[y * width + x].x;
- pixels[y * width + x].y = d_output_data[y * width + x].y;
- pixels[y * width + x].z = d_output_data[y * width + x].z;
- }
- }
+ const int size = width * height;
+ pixels.resize(size);
+
+ /* Evaluate shader on device. */
+ ShaderEval shader_eval(device, progress);
+ shader_eval.eval(
+ SHADER_EVAL_BACKGROUND,
+ size,
+ [&](device_vector<KernelShaderEvalInput> &d_input) {
+ /* Fill coordinates for shading. */
+ KernelShaderEvalInput *d_input_data = d_input.data();
+
+ for (int y = 0; y < height; y++) {
+ for (int x = 0; x < width; x++) {
+ float u = (x + 0.5f) / width;
+ float v = (y + 0.5f) / height;
+
+ KernelShaderEvalInput in;
+ in.object = OBJECT_NONE;
+ in.prim = PRIM_NONE;
+ in.u = u;
+ in.v = v;
+ d_input_data[x + y * width] = in;
+ }
+ }
- d_output.free();
+ return size;
+ },
+ [&](device_vector<float4> &d_output) {
+ /* Copy output to pixel buffer. */
+ float4 *d_output_data = d_output.data();
+
+ for (int y = 0; y < height; y++) {
+ for (int x = 0; x < width; x++) {
+ pixels[y * width + x].x = d_output_data[y * width + x].x;
+ pixels[y * width + x].y = d_output_data[y * width + x].y;
+ pixels[y * width + x].z = d_output_data[y * width + x].z;
+ }
+ }
+ });
}
/* Light */
@@ -140,15 +129,16 @@ NODE_DEFINE(Light)
SOCKET_BOOLEAN(cast_shadow, "Cast Shadow", true);
SOCKET_BOOLEAN(use_mis, "Use Mis", false);
+ SOCKET_BOOLEAN(use_camera, "Use Camera", true);
SOCKET_BOOLEAN(use_diffuse, "Use Diffuse", true);
SOCKET_BOOLEAN(use_glossy, "Use Glossy", true);
SOCKET_BOOLEAN(use_transmission, "Use Transmission", true);
SOCKET_BOOLEAN(use_scatter, "Use Scatter", true);
- SOCKET_INT(samples, "Samples", 1);
SOCKET_INT(max_bounces, "Max Bounces", 1024);
SOCKET_UINT(random_id, "Random ID", 0);
+ SOCKET_BOOLEAN(is_shadow_catcher, "Shadow Catcher", true);
SOCKET_BOOLEAN(is_portal, "Is Portal", false);
SOCKET_BOOLEAN(is_enabled, "Is Enabled", true);
@@ -166,10 +156,6 @@ void Light::tag_update(Scene *scene)
{
if (is_modified()) {
scene->light_manager->tag_update(scene, LightManager::LIGHT_MODIFIED);
-
- if (samples_is_modified()) {
- scene->integrator->tag_update(scene, Integrator::LIGHT_SAMPLES_MODIFIED);
- }
}
}
@@ -193,7 +179,6 @@ LightManager::LightManager()
{
update_flags = UPDATE_ALL;
need_update_background = true;
- use_light_visibility = false;
last_background_enabled = false;
last_background_resolution = 0;
}
@@ -357,21 +342,23 @@ void LightManager::device_update_distribution(Device *,
int object_id = j;
int shader_flag = 0;
+ if (!(object->get_visibility() & PATH_RAY_CAMERA)) {
+ shader_flag |= SHADER_EXCLUDE_CAMERA;
+ }
if (!(object->get_visibility() & PATH_RAY_DIFFUSE)) {
shader_flag |= SHADER_EXCLUDE_DIFFUSE;
- use_light_visibility = true;
}
if (!(object->get_visibility() & PATH_RAY_GLOSSY)) {
shader_flag |= SHADER_EXCLUDE_GLOSSY;
- use_light_visibility = true;
}
if (!(object->get_visibility() & PATH_RAY_TRANSMIT)) {
shader_flag |= SHADER_EXCLUDE_TRANSMIT;
- use_light_visibility = true;
}
if (!(object->get_visibility() & PATH_RAY_VOLUME_SCATTER)) {
shader_flag |= SHADER_EXCLUDE_SCATTER;
- use_light_visibility = true;
+ }
+ if (!(object->get_is_shadow_catcher())) {
+ shader_flag |= SHADER_EXCLUDE_SHADOW_CATCHER;
}
size_t mesh_num_triangles = mesh->num_triangles();
@@ -496,10 +483,10 @@ void LightManager::device_update_distribution(Device *,
kfilm->pass_shadow_scale = 1.0f;
if (kintegrator->pdf_triangles != 0.0f)
- kfilm->pass_shadow_scale *= 0.5f;
+ kfilm->pass_shadow_scale /= 0.5f;
if (num_background_lights < num_lights)
- kfilm->pass_shadow_scale *= (float)(num_lights - num_background_lights) / (float)num_lights;
+ kfilm->pass_shadow_scale /= (float)(num_lights - num_background_lights) / (float)num_lights;
/* CDF */
dscene->light_distribution.copy_to_device();
@@ -766,25 +753,26 @@ void LightManager::device_update_points(Device *, DeviceScene *dscene, Scene *sc
if (!light->cast_shadow)
shader_id &= ~SHADER_CAST_SHADOW;
+ if (!light->use_camera) {
+ shader_id |= SHADER_EXCLUDE_CAMERA;
+ }
if (!light->use_diffuse) {
shader_id |= SHADER_EXCLUDE_DIFFUSE;
- use_light_visibility = true;
}
if (!light->use_glossy) {
shader_id |= SHADER_EXCLUDE_GLOSSY;
- use_light_visibility = true;
}
if (!light->use_transmission) {
shader_id |= SHADER_EXCLUDE_TRANSMIT;
- use_light_visibility = true;
}
if (!light->use_scatter) {
shader_id |= SHADER_EXCLUDE_SCATTER;
- use_light_visibility = true;
+ }
+ if (!light->is_shadow_catcher) {
+ shader_id |= SHADER_EXCLUDE_SHADOW_CATCHER;
}
klights[light_index].type = light->light_type;
- klights[light_index].samples = light->samples;
klights[light_index].strength[0] = light->strength.x;
klights[light_index].strength[1] = light->strength.y;
klights[light_index].strength[2] = light->strength.z;
@@ -836,19 +824,15 @@ void LightManager::device_update_points(Device *, DeviceScene *dscene, Scene *sc
if (!(visibility & PATH_RAY_DIFFUSE)) {
shader_id |= SHADER_EXCLUDE_DIFFUSE;
- use_light_visibility = true;
}
if (!(visibility & PATH_RAY_GLOSSY)) {
shader_id |= SHADER_EXCLUDE_GLOSSY;
- use_light_visibility = true;
}
if (!(visibility & PATH_RAY_TRANSMIT)) {
shader_id |= SHADER_EXCLUDE_TRANSMIT;
- use_light_visibility = true;
}
if (!(visibility & PATH_RAY_VOLUME_SCATTER)) {
shader_id |= SHADER_EXCLUDE_SCATTER;
- use_light_visibility = true;
}
}
else if (light->light_type == LIGHT_AREA) {
@@ -998,8 +982,6 @@ void LightManager::device_update(Device *device,
device_free(device, dscene, need_update_background);
- use_light_visibility = false;
-
device_update_points(device, dscene, scene);
if (progress.get_cancel())
return;
@@ -1018,8 +1000,6 @@ void LightManager::device_update(Device *device,
if (progress.get_cancel())
return;
- scene->film->set_use_light_visibility(use_light_visibility);
-
update_flags = UPDATE_NONE;
need_update_background = false;
}
diff --git a/intern/cycles/render/light.h b/intern/cycles/render/light.h
index fbd709125ff..7f86237c8b3 100644
--- a/intern/cycles/render/light.h
+++ b/intern/cycles/render/light.h
@@ -69,16 +69,17 @@ class Light : public Node {
NODE_SOCKET_API(bool, cast_shadow)
NODE_SOCKET_API(bool, use_mis)
+ NODE_SOCKET_API(bool, use_camera)
NODE_SOCKET_API(bool, use_diffuse)
NODE_SOCKET_API(bool, use_glossy)
NODE_SOCKET_API(bool, use_transmission)
NODE_SOCKET_API(bool, use_scatter)
+ NODE_SOCKET_API(bool, is_shadow_catcher)
NODE_SOCKET_API(bool, is_portal)
NODE_SOCKET_API(bool, is_enabled)
NODE_SOCKET_API(Shader *, shader)
- NODE_SOCKET_API(int, samples)
NODE_SOCKET_API(int, max_bounces)
NODE_SOCKET_API(uint, random_id)
@@ -108,8 +109,6 @@ class LightManager {
UPDATE_NONE = 0u,
};
- bool use_light_visibility;
-
/* Need to update background (including multiple importance map) */
bool need_update_background;
diff --git a/intern/cycles/render/mesh_displace.cpp b/intern/cycles/render/mesh_displace.cpp
index b39d81023d9..c00c4c24211 100644
--- a/intern/cycles/render/mesh_displace.cpp
+++ b/intern/cycles/render/mesh_displace.cpp
@@ -16,6 +16,8 @@
#include "device/device.h"
+#include "integrator/shader_eval.h"
+
#include "render/mesh.h"
#include "render/object.h"
#include "render/scene.h"
@@ -43,40 +45,28 @@ static float3 compute_face_normal(const Mesh::Triangle &t, float3 *verts)
return norm / normlen;
}
-bool GeometryManager::displace(
- Device *device, DeviceScene *dscene, Scene *scene, Mesh *mesh, Progress &progress)
+/* Fill in coordinates for mesh displacement shader evaluation on device. */
+static int fill_shader_input(const Scene *scene,
+ const Mesh *mesh,
+ const int object_index,
+ device_vector<KernelShaderEvalInput> &d_input)
{
- /* verify if we have a displacement shader */
- if (!mesh->has_true_displacement()) {
- return false;
- }
-
- string msg = string_printf("Computing Displacement %s", mesh->name.c_str());
- progress.set_status("Updating Mesh", msg);
+ int d_input_size = 0;
+ KernelShaderEvalInput *d_input_data = d_input.data();
- /* find object index. todo: is arbitrary */
- size_t object_index = OBJECT_NONE;
+ const array<int> &mesh_shaders = mesh->get_shader();
+ const array<Node *> &mesh_used_shaders = mesh->get_used_shaders();
+ const array<float3> &mesh_verts = mesh->get_verts();
- for (size_t i = 0; i < scene->objects.size(); i++) {
- if (scene->objects[i]->get_geometry() == mesh) {
- object_index = i;
- break;
- }
- }
-
- /* setup input for device task */
- const size_t num_verts = mesh->verts.size();
+ const int num_verts = mesh_verts.size();
vector<bool> done(num_verts, false);
- device_vector<uint4> d_input(device, "displace_input", MEM_READ_ONLY);
- uint4 *d_input_data = d_input.alloc(num_verts);
- size_t d_input_size = 0;
- size_t num_triangles = mesh->num_triangles();
- for (size_t i = 0; i < num_triangles; i++) {
+ int num_triangles = mesh->num_triangles();
+ for (int i = 0; i < num_triangles; i++) {
Mesh::Triangle t = mesh->get_triangle(i);
- int shader_index = mesh->shader[i];
- Shader *shader = (shader_index < mesh->used_shaders.size()) ?
- static_cast<Shader *>(mesh->used_shaders[shader_index]) :
+ int shader_index = mesh_shaders[i];
+ Shader *shader = (shader_index < mesh_used_shaders.size()) ?
+ static_cast<Shader *>(mesh_used_shaders[shader_index]) :
scene->default_surface;
if (!shader->has_displacement || shader->get_displacement_method() == DISPLACE_BUMP) {
@@ -110,57 +100,41 @@ bool GeometryManager::displace(
}
/* back */
- uint4 in = make_uint4(object, prim, __float_as_int(u), __float_as_int(v));
+ KernelShaderEvalInput in;
+ in.object = object;
+ in.prim = prim;
+ in.u = u;
+ in.v = v;
d_input_data[d_input_size++] = in;
}
}
- if (d_input_size == 0)
- return false;
-
- /* run device task */
- device_vector<float4> d_output(device, "displace_output", MEM_READ_WRITE);
- d_output.alloc(d_input_size);
- d_output.zero_to_device();
- d_input.copy_to_device();
-
- /* needs to be up to data for attribute access */
- device->const_copy_to("__data", &dscene->data, sizeof(dscene->data));
-
- DeviceTask task(DeviceTask::SHADER);
- task.shader_input = d_input.device_pointer;
- task.shader_output = d_output.device_pointer;
- task.shader_eval_type = SHADER_EVAL_DISPLACE;
- task.shader_x = 0;
- task.shader_w = d_output.size();
- task.num_samples = 1;
- task.get_cancel = function_bind(&Progress::get_cancel, &progress);
-
- device->task_add(task);
- device->task_wait();
-
- if (progress.get_cancel()) {
- d_input.free();
- d_output.free();
- return false;
- }
+ return d_input_size;
+}
- d_output.copy_from_device(0, 1, d_output.size());
- d_input.free();
+/* Read back mesh displacement shader output. */
+static void read_shader_output(const Scene *scene,
+ Mesh *mesh,
+ const device_vector<float4> &d_output)
+{
+ const array<int> &mesh_shaders = mesh->get_shader();
+ const array<Node *> &mesh_used_shaders = mesh->get_used_shaders();
+ array<float3> &mesh_verts = mesh->get_verts();
- /* read result */
- done.clear();
- done.resize(num_verts, false);
- int k = 0;
+ const int num_verts = mesh_verts.size();
+ const int num_motion_steps = mesh->get_motion_steps();
+ vector<bool> done(num_verts, false);
- float4 *offset = d_output.data();
+ const float4 *d_output_data = d_output.data();
+ int d_output_index = 0;
Attribute *attr_mP = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
- for (size_t i = 0; i < num_triangles; i++) {
+ int num_triangles = mesh->num_triangles();
+ for (int i = 0; i < num_triangles; i++) {
Mesh::Triangle t = mesh->get_triangle(i);
- int shader_index = mesh->shader[i];
- Shader *shader = (shader_index < mesh->used_shaders.size()) ?
- static_cast<Shader *>(mesh->used_shaders[shader_index]) :
+ int shader_index = mesh_shaders[i];
+ Shader *shader = (shader_index < mesh_used_shaders.size()) ?
+ static_cast<Shader *>(mesh_used_shaders[shader_index]) :
scene->default_surface;
if (!shader->has_displacement || shader->get_displacement_method() == DISPLACE_BUMP) {
@@ -170,12 +144,12 @@ bool GeometryManager::displace(
for (int j = 0; j < 3; j++) {
if (!done[t.v[j]]) {
done[t.v[j]] = true;
- float3 off = float4_to_float3(offset[k++]);
+ float3 off = float4_to_float3(d_output_data[d_output_index++]);
/* Avoid illegal vertex coordinates. */
off = ensure_finite3(off);
- mesh->verts[t.v[j]] += off;
+ mesh_verts[t.v[j]] += off;
if (attr_mP != NULL) {
- for (int step = 0; step < mesh->motion_steps - 1; step++) {
+ for (int step = 0; step < num_motion_steps - 1; step++) {
float3 *mP = attr_mP->data_float3() + step * num_verts;
mP[t.v[j]] += off;
}
@@ -183,8 +157,47 @@ bool GeometryManager::displace(
}
}
}
+}
- d_output.free();
+bool GeometryManager::displace(
+ Device *device, DeviceScene *dscene, Scene *scene, Mesh *mesh, Progress &progress)
+{
+ /* verify if we have a displacement shader */
+ if (!mesh->has_true_displacement()) {
+ return false;
+ }
+
+ const size_t num_verts = mesh->verts.size();
+ const size_t num_triangles = mesh->num_triangles();
+
+ if (num_triangles == 0) {
+ return false;
+ }
+
+ string msg = string_printf("Computing Displacement %s", mesh->name.c_str());
+ progress.set_status("Updating Mesh", msg);
+
+ /* Find object index. TODO: this is arbitrary. */
+ size_t object_index = OBJECT_NONE;
+
+ for (size_t i = 0; i < scene->objects.size(); i++) {
+ if (scene->objects[i]->get_geometry() == mesh) {
+ object_index = i;
+ break;
+ }
+ }
+
+ /* Needs to be up to date for attribute access. */
+ device->const_copy_to("__data", &dscene->data, sizeof(dscene->data));
+
+ /* Evaluate shader on device. */
+ ShaderEval shader_eval(device, progress);
+ if (!shader_eval.eval(SHADER_EVAL_DISPLACE,
+ num_verts,
+ function_bind(&fill_shader_input, scene, mesh, object_index, _1),
+ function_bind(&read_shader_output, scene, mesh, _1))) {
+ return false;
+ }
/* stitch */
unordered_set<int> stitch_keys;
@@ -297,8 +310,7 @@ bool GeometryManager::displace(
}
/* normalize vertex normals */
- done.clear();
- done.resize(num_verts, false);
+ vector<bool> done(num_verts, false);
for (size_t i = 0; i < num_triangles; i++) {
if (tri_has_true_disp[i]) {
@@ -368,8 +380,7 @@ bool GeometryManager::displace(
}
/* normalize vertex normals */
- done.clear();
- done.resize(num_verts, false);
+ vector<bool> done(num_verts, false);
for (size_t i = 0; i < num_triangles; i++) {
if (tri_has_true_disp[i]) {
diff --git a/intern/cycles/render/nodes.cpp b/intern/cycles/render/nodes.cpp
index 795166bcf4c..5303d55242e 100644
--- a/intern/cycles/render/nodes.cpp
+++ b/intern/cycles/render/nodes.cpp
@@ -2736,18 +2736,21 @@ NODE_DEFINE(PrincipledBsdfNode)
distribution, "Distribution", distribution_enum, CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID);
static NodeEnum subsurface_method_enum;
- subsurface_method_enum.insert("burley", CLOSURE_BSSRDF_PRINCIPLED_ID);
- subsurface_method_enum.insert("random_walk", CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID);
+ subsurface_method_enum.insert("random_walk_fixed_radius",
+ CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID);
+ subsurface_method_enum.insert("random_walk", CLOSURE_BSSRDF_RANDOM_WALK_ID);
SOCKET_ENUM(subsurface_method,
"Subsurface Method",
subsurface_method_enum,
- CLOSURE_BSSRDF_PRINCIPLED_ID);
+ CLOSURE_BSSRDF_RANDOM_WALK_ID);
SOCKET_IN_COLOR(base_color, "Base Color", make_float3(0.8f, 0.8f, 0.8f));
SOCKET_IN_COLOR(subsurface_color, "Subsurface Color", make_float3(0.8f, 0.8f, 0.8f));
SOCKET_IN_FLOAT(metallic, "Metallic", 0.0f);
SOCKET_IN_FLOAT(subsurface, "Subsurface", 0.0f);
SOCKET_IN_VECTOR(subsurface_radius, "Subsurface Radius", make_float3(0.1f, 0.1f, 0.1f));
+ SOCKET_IN_FLOAT(subsurface_ior, "Subsurface IOR", 1.4f);
+ SOCKET_IN_FLOAT(subsurface_anisotropy, "Subsurface Anisotropy", 0.0f);
SOCKET_IN_FLOAT(specular, "Specular", 0.0f);
SOCKET_IN_FLOAT(roughness, "Roughness", 0.5f);
SOCKET_IN_FLOAT(specular_tint, "Specular Tint", 0.0f);
@@ -2857,6 +2860,8 @@ void PrincipledBsdfNode::compile(SVMCompiler &compiler,
ShaderInput *p_metallic,
ShaderInput *p_subsurface,
ShaderInput *p_subsurface_radius,
+ ShaderInput *p_subsurface_ior,
+ ShaderInput *p_subsurface_anisotropy,
ShaderInput *p_specular,
ShaderInput *p_roughness,
ShaderInput *p_specular_tint,
@@ -2896,6 +2901,8 @@ void PrincipledBsdfNode::compile(SVMCompiler &compiler,
int transmission_roughness_offset = compiler.stack_assign(p_transmission_roughness);
int anisotropic_rotation_offset = compiler.stack_assign(p_anisotropic_rotation);
int subsurface_radius_offset = compiler.stack_assign(p_subsurface_radius);
+ int subsurface_ior_offset = compiler.stack_assign(p_subsurface_ior);
+ int subsurface_anisotropy_offset = compiler.stack_assign(p_subsurface_anisotropy);
compiler.add_node(NODE_CLOSURE_BSDF,
compiler.encode_uchar4(closure,
@@ -2929,8 +2936,10 @@ void PrincipledBsdfNode::compile(SVMCompiler &compiler,
__float_as_int(bc_default.y),
__float_as_int(bc_default.z));
- compiler.add_node(
- clearcoat_normal_offset, subsurface_radius_offset, SVM_STACK_INVALID, SVM_STACK_INVALID);
+ compiler.add_node(clearcoat_normal_offset,
+ subsurface_radius_offset,
+ subsurface_ior_offset,
+ subsurface_anisotropy_offset);
float3 ss_default = get_float3(subsurface_color_in->socket_type);
@@ -2953,6 +2962,8 @@ void PrincipledBsdfNode::compile(SVMCompiler &compiler)
input("Metallic"),
input("Subsurface"),
input("Subsurface Radius"),
+ input("Subsurface IOR"),
+ input("Subsurface Anisotropy"),
input("Specular"),
input("Roughness"),
input("Specular Tint"),
@@ -3048,16 +3059,16 @@ NODE_DEFINE(SubsurfaceScatteringNode)
SOCKET_IN_NORMAL(normal, "Normal", zero_float3(), SocketType::LINK_NORMAL);
SOCKET_IN_FLOAT(surface_mix_weight, "SurfaceMixWeight", 0.0f, SocketType::SVM_INTERNAL);
- static NodeEnum falloff_enum;
- falloff_enum.insert("cubic", CLOSURE_BSSRDF_CUBIC_ID);
- falloff_enum.insert("gaussian", CLOSURE_BSSRDF_GAUSSIAN_ID);
- falloff_enum.insert("burley", CLOSURE_BSSRDF_BURLEY_ID);
- falloff_enum.insert("random_walk", CLOSURE_BSSRDF_RANDOM_WALK_ID);
- SOCKET_ENUM(falloff, "Falloff", falloff_enum, CLOSURE_BSSRDF_BURLEY_ID);
+ static NodeEnum method_enum;
+ method_enum.insert("random_walk_fixed_radius", CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID);
+ method_enum.insert("random_walk", CLOSURE_BSSRDF_RANDOM_WALK_ID);
+ SOCKET_ENUM(method, "Method", method_enum, CLOSURE_BSSRDF_RANDOM_WALK_ID);
+
SOCKET_IN_FLOAT(scale, "Scale", 0.01f);
SOCKET_IN_VECTOR(radius, "Radius", make_float3(0.1f, 0.1f, 0.1f));
- SOCKET_IN_FLOAT(sharpness, "Sharpness", 0.0f);
- SOCKET_IN_FLOAT(texture_blur, "Texture Blur", 1.0f);
+
+ SOCKET_IN_FLOAT(subsurface_ior, "IOR", 1.4f);
+ SOCKET_IN_FLOAT(subsurface_anisotropy, "Anisotropy", 0.0f);
SOCKET_OUT_CLOSURE(BSSRDF, "BSSRDF");
@@ -3066,20 +3077,19 @@ NODE_DEFINE(SubsurfaceScatteringNode)
SubsurfaceScatteringNode::SubsurfaceScatteringNode() : BsdfNode(get_node_type())
{
- closure = falloff;
+ closure = method;
}
void SubsurfaceScatteringNode::compile(SVMCompiler &compiler)
{
- closure = falloff;
- BsdfNode::compile(
- compiler, input("Scale"), input("Texture Blur"), input("Radius"), input("Sharpness"));
+ closure = method;
+ BsdfNode::compile(compiler, input("Scale"), input("IOR"), input("Radius"), input("Anisotropy"));
}
void SubsurfaceScatteringNode::compile(OSLCompiler &compiler)
{
- closure = falloff;
- compiler.parameter(this, "falloff");
+ closure = method;
+ compiler.parameter(this, "method");
compiler.add(this, "node_subsurface_scattering");
}
@@ -3786,20 +3796,6 @@ void GeometryNode::compile(OSLCompiler &compiler)
compiler.add(this, "node_geometry");
}
-int GeometryNode::get_group()
-{
- ShaderOutput *out;
- int result = ShaderNode::get_group();
-
- /* Backfacing uses NODE_LIGHT_PATH */
- out = output("Backfacing");
- if (!out->links.empty()) {
- result = max(result, NODE_GROUP_LEVEL_1);
- }
-
- return result;
-}
-
/* TextureCoordinate */
NODE_DEFINE(TextureCoordinateNode)
@@ -5926,33 +5922,33 @@ NODE_DEFINE(OutputAOVNode)
OutputAOVNode::OutputAOVNode() : ShaderNode(get_node_type())
{
special_type = SHADER_SPECIAL_TYPE_OUTPUT_AOV;
- slot = -1;
+ offset = -1;
}
void OutputAOVNode::simplify_settings(Scene *scene)
{
- slot = scene->film->get_aov_offset(scene, name.string(), is_color);
- if (slot == -1) {
- slot = scene->film->get_aov_offset(scene, name.string(), is_color);
+ offset = scene->film->get_aov_offset(scene, name.string(), is_color);
+ if (offset == -1) {
+ offset = scene->film->get_aov_offset(scene, name.string(), is_color);
}
- if (slot == -1 || is_color) {
+ if (offset == -1 || is_color) {
input("Value")->disconnect();
}
- if (slot == -1 || !is_color) {
+ if (offset == -1 || !is_color) {
input("Color")->disconnect();
}
}
void OutputAOVNode::compile(SVMCompiler &compiler)
{
- assert(slot >= 0);
+ assert(offset >= 0);
if (is_color) {
- compiler.add_node(NODE_AOV_COLOR, compiler.stack_assign(input("Color")), slot);
+ compiler.add_node(NODE_AOV_COLOR, compiler.stack_assign(input("Color")), offset);
}
else {
- compiler.add_node(NODE_AOV_VALUE, compiler.stack_assign(input("Value")), slot);
+ compiler.add_node(NODE_AOV_VALUE, compiler.stack_assign(input("Value")), offset);
}
}
diff --git a/intern/cycles/render/nodes.h b/intern/cycles/render/nodes.h
index 3013e9b1866..22bdb06b059 100644
--- a/intern/cycles/render/nodes.h
+++ b/intern/cycles/render/nodes.h
@@ -143,10 +143,6 @@ class EnvironmentTextureNode : public ImageSlotTextureNode {
{
return true;
}
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
virtual bool equals(const ShaderNode &other)
{
@@ -170,11 +166,6 @@ class SkyTextureNode : public TextureNode {
public:
SHADER_NODE_CLASS(SkyTextureNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
-
NODE_SOCKET_API(NodeSkyType, sky_type)
NODE_SOCKET_API(float3, sun_direction)
NODE_SOCKET_API(float, turbidity)
@@ -224,18 +215,13 @@ class OutputAOVNode : public ShaderNode {
NODE_SOCKET_API(ustring, name)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_4;
- }
-
/* Don't allow output node de-duplication. */
virtual bool equals(const ShaderNode & /*other*/)
{
return false;
}
- int slot;
+ int offset;
bool is_color;
};
@@ -243,11 +229,6 @@ class GradientTextureNode : public TextureNode {
public:
SHADER_NODE_CLASS(GradientTextureNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
-
NODE_SOCKET_API(NodeGradientType, gradient_type)
NODE_SOCKET_API(float3, vector)
};
@@ -269,19 +250,14 @@ class VoronoiTextureNode : public TextureNode {
public:
SHADER_NODE_CLASS(VoronoiTextureNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
-
virtual int get_feature()
{
int result = ShaderNode::get_feature();
if (dimensions == 4) {
- result |= NODE_FEATURE_VORONOI_EXTRA;
+ result |= KERNEL_FEATURE_NODE_VORONOI_EXTRA;
}
else if (dimensions >= 2 && feature == NODE_VORONOI_SMOOTH_F1) {
- result |= NODE_FEATURE_VORONOI_EXTRA;
+ result |= KERNEL_FEATURE_NODE_VORONOI_EXTRA;
}
return result;
}
@@ -301,11 +277,6 @@ class MusgraveTextureNode : public TextureNode {
public:
SHADER_NODE_CLASS(MusgraveTextureNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
-
NODE_SOCKET_API(int, dimensions)
NODE_SOCKET_API(NodeMusgraveType, musgrave_type)
NODE_SOCKET_API(float, w)
@@ -322,11 +293,6 @@ class WaveTextureNode : public TextureNode {
public:
SHADER_NODE_CLASS(WaveTextureNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
-
NODE_SOCKET_API(NodeWaveType, wave_type)
NODE_SOCKET_API(NodeWaveBandsDirection, bands_direction)
NODE_SOCKET_API(NodeWaveRingsDirection, rings_direction)
@@ -345,11 +311,6 @@ class MagicTextureNode : public TextureNode {
public:
SHADER_NODE_CLASS(MagicTextureNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
-
NODE_SOCKET_API(int, depth)
NODE_SOCKET_API(float3, vector)
NODE_SOCKET_API(float, scale)
@@ -364,11 +325,6 @@ class CheckerTextureNode : public TextureNode {
NODE_SOCKET_API(float3, color1)
NODE_SOCKET_API(float3, color2)
NODE_SOCKET_API(float, scale)
-
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
};
class BrickTextureNode : public TextureNode {
@@ -390,20 +346,11 @@ class BrickTextureNode : public TextureNode {
NODE_SOCKET_API(float, brick_width)
NODE_SOCKET_API(float, row_height)
NODE_SOCKET_API(float3, vector)
-
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
};
class PointDensityTextureNode : public ShaderNode {
public:
SHADER_NODE_NO_CLONE_CLASS(PointDensityTextureNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_4;
- }
~PointDensityTextureNode();
ShaderNode *clone(ShaderGraph *graph) const;
@@ -443,10 +390,6 @@ class IESLightNode : public TextureNode {
~IESLightNode();
ShaderNode *clone(ShaderGraph *graph) const;
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
NODE_SOCKET_API(ustring, filename)
NODE_SOCKET_API(ustring, ies)
@@ -464,10 +407,6 @@ class IESLightNode : public TextureNode {
class WhiteNoiseTextureNode : public ShaderNode {
public:
SHADER_NODE_CLASS(WhiteNoiseTextureNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
NODE_SOCKET_API(int, dimensions)
NODE_SOCKET_API(float3, vector)
@@ -477,10 +416,6 @@ class WhiteNoiseTextureNode : public ShaderNode {
class MappingNode : public ShaderNode {
public:
SHADER_NODE_CLASS(MappingNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
void constant_fold(const ConstantFolder &folder);
NODE_SOCKET_API(float3, vector)
@@ -546,6 +481,11 @@ class BsdfBaseNode : public ShaderNode {
return false;
}
+ virtual int get_feature()
+ {
+ return ShaderNode::get_feature() | KERNEL_FEATURE_NODE_BSDF;
+ }
+
protected:
ClosureType closure;
};
@@ -606,6 +546,8 @@ class PrincipledBsdfNode : public BsdfBaseNode {
ShaderInput *metallic,
ShaderInput *subsurface,
ShaderInput *subsurface_radius,
+ ShaderInput *subsurface_ior,
+ ShaderInput *subsurface_anisotropy,
ShaderInput *specular,
ShaderInput *roughness,
ShaderInput *specular_tint,
@@ -622,6 +564,8 @@ class PrincipledBsdfNode : public BsdfBaseNode {
NODE_SOCKET_API(float3, base_color)
NODE_SOCKET_API(float3, subsurface_color)
NODE_SOCKET_API(float3, subsurface_radius)
+ NODE_SOCKET_API(float, subsurface_ior)
+ NODE_SOCKET_API(float, subsurface_anisotropy)
NODE_SOCKET_API(float, metallic)
NODE_SOCKET_API(float, subsurface)
NODE_SOCKET_API(float, specular)
@@ -758,14 +702,14 @@ class SubsurfaceScatteringNode : public BsdfNode {
bool has_bssrdf_bump();
ClosureType get_closure_type()
{
- return falloff;
+ return method;
}
NODE_SOCKET_API(float, scale)
NODE_SOCKET_API(float3, radius)
- NODE_SOCKET_API(float, sharpness)
- NODE_SOCKET_API(float, texture_blur)
- NODE_SOCKET_API(ClosureType, falloff)
+ NODE_SOCKET_API(float, subsurface_ior)
+ NODE_SOCKET_API(float, subsurface_anisotropy)
+ NODE_SOCKET_API(ClosureType, method)
};
class EmissionNode : public ShaderNode {
@@ -782,6 +726,11 @@ class EmissionNode : public ShaderNode {
return true;
}
+ virtual int get_feature()
+ {
+ return ShaderNode::get_feature() | KERNEL_FEATURE_NODE_EMISSION;
+ }
+
NODE_SOCKET_API(float3, color)
NODE_SOCKET_API(float, strength)
NODE_SOCKET_API(float, surface_mix_weight)
@@ -792,6 +741,11 @@ class BackgroundNode : public ShaderNode {
SHADER_NODE_CLASS(BackgroundNode)
void constant_fold(const ConstantFolder &folder);
+ virtual int get_feature()
+ {
+ return ShaderNode::get_feature() | KERNEL_FEATURE_NODE_EMISSION;
+ }
+
NODE_SOCKET_API(float3, color)
NODE_SOCKET_API(float, strength)
NODE_SOCKET_API(float, surface_mix_weight)
@@ -800,10 +754,6 @@ class BackgroundNode : public ShaderNode {
class HoldoutNode : public ShaderNode {
public:
SHADER_NODE_CLASS(HoldoutNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
virtual ClosureType get_closure_type()
{
return CLOSURE_HOLDOUT_ID;
@@ -821,13 +771,9 @@ class AmbientOcclusionNode : public ShaderNode {
{
return true;
}
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
- virtual bool has_raytrace()
+ virtual int get_feature()
{
- return true;
+ return KERNEL_FEATURE_NODE_RAYTRACE;
}
NODE_SOCKET_API(float3, color)
@@ -845,13 +791,9 @@ class VolumeNode : public ShaderNode {
SHADER_NODE_BASE_CLASS(VolumeNode)
void compile(SVMCompiler &compiler, ShaderInput *param1, ShaderInput *param2);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
virtual int get_feature()
{
- return ShaderNode::get_feature() | NODE_FEATURE_VOLUME;
+ return ShaderNode::get_feature() | KERNEL_FEATURE_NODE_VOLUME;
}
virtual ClosureType get_closure_type()
{
@@ -1013,10 +955,6 @@ class UVMapNode : public ShaderNode {
{
return true;
}
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
NODE_SOCKET_API(ustring, attribute)
NODE_SOCKET_API(bool, from_dupli)
@@ -1025,10 +963,6 @@ class UVMapNode : public ShaderNode {
class LightPathNode : public ShaderNode {
public:
SHADER_NODE_CLASS(LightPathNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
};
class LightFalloffNode : public ShaderNode {
@@ -1038,10 +972,6 @@ class LightFalloffNode : public ShaderNode {
{
return true;
}
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
NODE_SOCKET_API(float, strength)
NODE_SOCKET_API(float, smooth)
@@ -1050,10 +980,6 @@ class LightFalloffNode : public ShaderNode {
class ObjectInfoNode : public ShaderNode {
public:
SHADER_NODE_CLASS(ObjectInfoNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
};
class ParticleInfoNode : public ShaderNode {
@@ -1064,10 +990,6 @@ class ParticleInfoNode : public ShaderNode {
{
return true;
}
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
};
class HairInfoNode : public ShaderNode {
@@ -1083,13 +1005,9 @@ class HairInfoNode : public ShaderNode {
{
return true;
}
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
virtual int get_feature()
{
- return ShaderNode::get_feature() | NODE_FEATURE_HAIR;
+ return ShaderNode::get_feature() | KERNEL_FEATURE_NODE_HAIR;
}
};
@@ -1168,10 +1086,6 @@ class InvertNode : public ShaderNode {
public:
SHADER_NODE_CLASS(InvertNode)
void constant_fold(const ConstantFolder &folder);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(float, fac)
NODE_SOCKET_API(float3, color)
@@ -1182,11 +1096,6 @@ class MixNode : public ShaderNode {
SHADER_NODE_CLASS(MixNode)
void constant_fold(const ConstantFolder &folder);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
-
NODE_SOCKET_API(NodeMix, mix_type)
NODE_SOCKET_API(bool, use_clamp)
NODE_SOCKET_API(float3, color1)
@@ -1198,10 +1107,6 @@ class CombineRGBNode : public ShaderNode {
public:
SHADER_NODE_CLASS(CombineRGBNode)
void constant_fold(const ConstantFolder &folder);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(float, r)
NODE_SOCKET_API(float, g)
@@ -1212,10 +1117,6 @@ class CombineHSVNode : public ShaderNode {
public:
SHADER_NODE_CLASS(CombineHSVNode)
void constant_fold(const ConstantFolder &folder);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(float, h)
NODE_SOCKET_API(float, s)
@@ -1226,10 +1127,6 @@ class CombineXYZNode : public ShaderNode {
public:
SHADER_NODE_CLASS(CombineXYZNode)
void constant_fold(const ConstantFolder &folder);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(float, x)
NODE_SOCKET_API(float, y)
@@ -1240,10 +1137,6 @@ class GammaNode : public ShaderNode {
public:
SHADER_NODE_CLASS(GammaNode)
void constant_fold(const ConstantFolder &folder);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
NODE_SOCKET_API(float3, color)
NODE_SOCKET_API(float, gamma)
@@ -1253,10 +1146,6 @@ class BrightContrastNode : public ShaderNode {
public:
SHADER_NODE_CLASS(BrightContrastNode)
void constant_fold(const ConstantFolder &folder);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
NODE_SOCKET_API(float3, color)
NODE_SOCKET_API(float, bright)
@@ -1267,10 +1156,6 @@ class SeparateRGBNode : public ShaderNode {
public:
SHADER_NODE_CLASS(SeparateRGBNode)
void constant_fold(const ConstantFolder &folder);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(float3, color)
};
@@ -1279,10 +1164,6 @@ class SeparateHSVNode : public ShaderNode {
public:
SHADER_NODE_CLASS(SeparateHSVNode)
void constant_fold(const ConstantFolder &folder);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(float3, color)
};
@@ -1291,10 +1172,6 @@ class SeparateXYZNode : public ShaderNode {
public:
SHADER_NODE_CLASS(SeparateXYZNode)
void constant_fold(const ConstantFolder &folder);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(float3, vector)
};
@@ -1333,10 +1210,6 @@ class CameraNode : public ShaderNode {
{
return true;
}
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
};
class FresnelNode : public ShaderNode {
@@ -1346,10 +1219,6 @@ class FresnelNode : public ShaderNode {
{
return true;
}
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
NODE_SOCKET_API(float3, normal)
NODE_SOCKET_API(float, IOR)
@@ -1362,10 +1231,6 @@ class LayerWeightNode : public ShaderNode {
{
return true;
}
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
NODE_SOCKET_API(float3, normal)
NODE_SOCKET_API(float, blend)
@@ -1378,10 +1243,6 @@ class WireframeNode : public ShaderNode {
{
return true;
}
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(float, size)
NODE_SOCKET_API(bool, use_pixel_size)
@@ -1390,10 +1251,6 @@ class WireframeNode : public ShaderNode {
class WavelengthNode : public ShaderNode {
public:
SHADER_NODE_CLASS(WavelengthNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(float, wavelength)
};
@@ -1402,10 +1259,6 @@ class BlackbodyNode : public ShaderNode {
public:
SHADER_NODE_CLASS(BlackbodyNode)
void constant_fold(const ConstantFolder &folder);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(float, temperature)
};
@@ -1413,10 +1266,6 @@ class BlackbodyNode : public ShaderNode {
class MapRangeNode : public ShaderNode {
public:
SHADER_NODE_CLASS(MapRangeNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
void expand(ShaderGraph *graph);
NODE_SOCKET_API(float, value)
@@ -1433,10 +1282,6 @@ class ClampNode : public ShaderNode {
public:
SHADER_NODE_CLASS(ClampNode)
void constant_fold(const ConstantFolder &folder);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(float, value)
NODE_SOCKET_API(float, min)
NODE_SOCKET_API(float, max)
@@ -1446,10 +1291,6 @@ class ClampNode : public ShaderNode {
class MathNode : public ShaderNode {
public:
SHADER_NODE_CLASS(MathNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
void expand(ShaderGraph *graph);
void constant_fold(const ConstantFolder &folder);
@@ -1463,10 +1304,6 @@ class MathNode : public ShaderNode {
class NormalNode : public ShaderNode {
public:
SHADER_NODE_CLASS(NormalNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
NODE_SOCKET_API(float3, direction)
NODE_SOCKET_API(float3, normal)
@@ -1475,10 +1312,6 @@ class NormalNode : public ShaderNode {
class VectorMathNode : public ShaderNode {
public:
SHADER_NODE_CLASS(VectorMathNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
void constant_fold(const ConstantFolder &folder);
NODE_SOCKET_API(float3, vector1)
@@ -1492,10 +1325,6 @@ class VectorRotateNode : public ShaderNode {
public:
SHADER_NODE_CLASS(VectorRotateNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(NodeVectorRotateType, rotate_type)
NODE_SOCKET_API(bool, invert)
NODE_SOCKET_API(float3, vector)
@@ -1509,11 +1338,6 @@ class VectorTransformNode : public ShaderNode {
public:
SHADER_NODE_CLASS(VectorTransformNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
-
NODE_SOCKET_API(NodeVectorTransformType, transform_type)
NODE_SOCKET_API(NodeVectorTransformConvertSpace, convert_from)
NODE_SOCKET_API(NodeVectorTransformConvertSpace, convert_to)
@@ -1530,7 +1354,7 @@ class BumpNode : public ShaderNode {
}
virtual int get_feature()
{
- return NODE_FEATURE_BUMP;
+ return KERNEL_FEATURE_NODE_BUMP;
}
NODE_SOCKET_API(bool, invert)
@@ -1549,11 +1373,6 @@ class CurvesNode : public ShaderNode {
explicit CurvesNode(const NodeType *node_type);
SHADER_NODE_BASE_CLASS(CurvesNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
-
NODE_SOCKET_API_ARRAY(array<float3>, curves)
NODE_SOCKET_API(float, min_x)
NODE_SOCKET_API(float, max_x)
@@ -1583,10 +1402,6 @@ class RGBRampNode : public ShaderNode {
public:
SHADER_NODE_CLASS(RGBRampNode)
void constant_fold(const ConstantFolder &folder);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
NODE_SOCKET_API_ARRAY(array<float3>, ramp)
NODE_SOCKET_API_ARRAY(array<float>, ramp_alpha)
@@ -1656,10 +1471,6 @@ class NormalMapNode : public ShaderNode {
{
return true;
}
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(NodeNormalMapSpace, space)
NODE_SOCKET_API(ustring, attribute)
@@ -1680,10 +1491,6 @@ class TangentNode : public ShaderNode {
{
return true;
}
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(NodeTangentDirectionType, direction_type)
NODE_SOCKET_API(NodeTangentAxis, axis)
@@ -1698,13 +1505,9 @@ class BevelNode : public ShaderNode {
{
return true;
}
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
- virtual bool has_raytrace()
+ virtual int get_feature()
{
- return true;
+ return KERNEL_FEATURE_NODE_RAYTRACE;
}
NODE_SOCKET_API(float, radius)
@@ -1718,7 +1521,7 @@ class DisplacementNode : public ShaderNode {
void constant_fold(const ConstantFolder &folder);
virtual int get_feature()
{
- return NODE_FEATURE_BUMP;
+ return KERNEL_FEATURE_NODE_BUMP;
}
NODE_SOCKET_API(NodeNormalMapSpace, space)
@@ -1739,7 +1542,7 @@ class VectorDisplacementNode : public ShaderNode {
void constant_fold(const ConstantFolder &folder);
virtual int get_feature()
{
- return NODE_FEATURE_BUMP;
+ return KERNEL_FEATURE_NODE_BUMP;
}
NODE_SOCKET_API(NodeNormalMapSpace, space)
diff --git a/intern/cycles/render/object.cpp b/intern/cycles/render/object.cpp
index c88d94fe4c2..4637f8fe989 100644
--- a/intern/cycles/render/object.cpp
+++ b/intern/cycles/render/object.cpp
@@ -216,6 +216,10 @@ void Object::tag_update(Scene *scene)
if (use_holdout_is_modified()) {
flag |= ObjectManager::HOLDOUT_MODIFIED;
}
+
+ if (is_shadow_catcher_is_modified()) {
+ scene->tag_shadow_catcher_modified();
+ }
}
if (geometry) {
@@ -273,14 +277,7 @@ bool Object::is_traceable() const
uint Object::visibility_for_tracing() const
{
- uint trace_visibility = visibility;
- if (is_shadow_catcher) {
- trace_visibility &= ~PATH_RAY_SHADOW_NON_CATCHER;
- }
- else {
- trace_visibility &= ~PATH_RAY_SHADOW_CATCHER;
- }
- return trace_visibility;
+ return SHADOW_CATCHER_OBJECT_VISIBILITY(is_shadow_catcher, visibility & PATH_RAY_ALL_VISIBILITY);
}
float Object::compute_volume_step_size() const
@@ -680,7 +677,7 @@ void ObjectManager::device_update(Device *device,
/* prepare for static BVH building */
/* todo: do before to support getting object level coords? */
- if (scene->params.bvh_type == SceneParams::BVH_STATIC) {
+ if (scene->params.bvh_type == BVH_TYPE_STATIC) {
scoped_callback_timer timer([scene](double time) {
if (scene->update_stats) {
scene->update_stats->object.times.add_entry(
@@ -932,6 +929,11 @@ void ObjectManager::tag_update(Scene *scene, uint32_t flag)
}
scene->light_manager->tag_update(scene, LightManager::OBJECT_MANAGER);
+
+ /* Integrator's shadow catcher settings depend on object visibility settings. */
+ if (flag & (OBJECT_ADDED | OBJECT_REMOVED | OBJECT_MODIFIED)) {
+ scene->integrator->tag_update(scene, Integrator::OBJECT_MANAGER);
+ }
}
bool ObjectManager::need_update() const
diff --git a/intern/cycles/render/osl.cpp b/intern/cycles/render/osl.cpp
index 7dc79f48145..d28b222c10e 100644
--- a/intern/cycles/render/osl.cpp
+++ b/intern/cycles/render/osl.cpp
@@ -113,7 +113,7 @@ void OSLShaderManager::device_update_specific(Device *device,
scene->image_manager->set_osl_texture_system((void *)ts);
/* create shaders */
- OSLGlobals *og = (OSLGlobals *)device->osl_memory();
+ OSLGlobals *og = (OSLGlobals *)device->get_cpu_osl_memory();
Shader *background_shader = scene->background->get_shader(scene);
foreach (Shader *shader, scene->shaders) {
@@ -174,7 +174,7 @@ void OSLShaderManager::device_update_specific(Device *device,
void OSLShaderManager::device_free(Device *device, DeviceScene *dscene, Scene *scene)
{
- OSLGlobals *og = (OSLGlobals *)device->osl_memory();
+ OSLGlobals *og = (OSLGlobals *)device->get_cpu_osl_memory();
device_free_common(device, dscene, scene);
@@ -257,25 +257,36 @@ void OSLShaderManager::shading_system_init()
/* our own ray types */
static const char *raytypes[] = {
- "camera", /* PATH_RAY_CAMERA */
- "reflection", /* PATH_RAY_REFLECT */
- "refraction", /* PATH_RAY_TRANSMIT */
- "diffuse", /* PATH_RAY_DIFFUSE */
- "glossy", /* PATH_RAY_GLOSSY */
- "singular", /* PATH_RAY_SINGULAR */
- "transparent", /* PATH_RAY_TRANSPARENT */
-
- "shadow", /* PATH_RAY_SHADOW_OPAQUE_NON_CATCHER */
- "shadow", /* PATH_RAY_SHADOW_OPAQUE_CATCHER */
- "shadow", /* PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER */
- "shadow", /* PATH_RAY_SHADOW_TRANSPARENT_CATCHER */
-
- "__unused__", "volume_scatter", /* PATH_RAY_VOLUME_SCATTER */
- "__unused__",
-
- "__unused__", "diffuse_ancestor", /* PATH_RAY_DIFFUSE_ANCESTOR */
- "__unused__", "__unused__", "__unused__", "__unused__",
- "__unused__", "__unused__", "__unused__",
+ "camera", /* PATH_RAY_CAMERA */
+ "reflection", /* PATH_RAY_REFLECT */
+ "refraction", /* PATH_RAY_TRANSMIT */
+ "diffuse", /* PATH_RAY_DIFFUSE */
+ "glossy", /* PATH_RAY_GLOSSY */
+ "singular", /* PATH_RAY_SINGULAR */
+ "transparent", /* PATH_RAY_TRANSPARENT */
+ "volume_scatter", /* PATH_RAY_VOLUME_SCATTER */
+
+ "shadow", /* PATH_RAY_SHADOW_OPAQUE */
+ "shadow", /* PATH_RAY_SHADOW_TRANSPARENT */
+
+ "__unused__", /* PATH_RAY_NODE_UNALIGNED */
+ "__unused__", /* PATH_RAY_MIS_SKIP */
+
+ "diffuse_ancestor", /* PATH_RAY_DIFFUSE_ANCESTOR */
+
+ "__unused__", /* PATH_RAY_SINGLE_PASS_DONE */
+ "__unused__", /* PATH_RAY_TRANSPARENT_BACKGROUND */
+ "__unused__", /* PATH_RAY_TERMINATE_IMMEDIATE */
+ "__unused__", /* PATH_RAY_TERMINATE_AFTER_TRANSPARENT */
+ "__unused__", /* PATH_RAY_EMISSION */
+ "__unused__", /* PATH_RAY_SUBSURFACE */
+ "__unused__", /* PATH_RAY_DENOISING_FEATURES */
+ "__unused__", /* PATH_RAY_REFLECT_PASS */
+ "__unused__", /* PATH_RAY_TRANSMISSION_PASS */
+ "__unused__", /* PATH_RAY_VOLUME_PASS */
+ "__unused__", /* PATH_RAY_SHADOW_FOR_LIGHT */
+ "__unused__", /* PATH_RAY_SHADOW_CATCHER_HIT */
+ "__unused__", /* PATH_RAY_SHADOW_CATCHER_PASS */
};
const int nraytypes = sizeof(raytypes) / sizeof(raytypes[0]);
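As an aside, the per-entry comments above suggest that the string at index i names the ray type whose PATH_RAY_* flag equals (1 << i). Under that assumption (not stated explicitly in this patch), mapping a single-bit flag to its OSL ray type name is a simple bit scan:

  /* Illustrative sketch, not part of the patch: assumes bit index == array index. */
  const uint flag = PATH_RAY_DIFFUSE_ANCESTOR;
  int index = 0;
  while (!(flag & (1u << index))) {
    index++;
  }
  const char *raytype_name = raytypes[index]; /* "diffuse_ancestor" under this assumption. */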
@@ -758,7 +769,8 @@ void OSLCompiler::add(ShaderNode *node, const char *name, bool isfilepath)
current_shader->has_surface_bssrdf = true;
current_shader->has_bssrdf_bump = true; /* can't detect yet */
}
- current_shader->has_bump = true; /* can't detect yet */
+ current_shader->has_bump = true; /* can't detect yet */
+ current_shader->has_surface_raytrace = true; /* can't detect yet */
}
if (node->has_spatial_varying()) {
@@ -1054,6 +1066,8 @@ void OSLCompiler::generate_nodes(const ShaderNodeSet &nodes)
current_shader->has_surface_emission = true;
if (node->has_surface_transparent())
current_shader->has_surface_transparent = true;
+ if (node->get_feature() & KERNEL_FEATURE_NODE_RAYTRACE)
+ current_shader->has_surface_raytrace = true;
if (node->has_spatial_varying())
current_shader->has_surface_spatial_varying = true;
if (node->has_surface_bssrdf()) {
diff --git a/intern/cycles/render/pass.cpp b/intern/cycles/render/pass.cpp
new file mode 100644
index 00000000000..27ad7c0db97
--- /dev/null
+++ b/intern/cycles/render/pass.cpp
@@ -0,0 +1,427 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "render/pass.h"
+
+#include "util/util_algorithm.h"
+#include "util/util_logging.h"
+
+CCL_NAMESPACE_BEGIN
+
+const char *pass_type_as_string(const PassType type)
+{
+ const int type_int = static_cast<int>(type);
+
+ const NodeEnum *type_enum = Pass::get_type_enum();
+
+ if (!type_enum->exists(type_int)) {
+ LOG(DFATAL) << "Unhandled pass type " << static_cast<int>(type) << ", not supposed to happen.";
+ return "UNKNOWN";
+ }
+
+ return (*type_enum)[type_int].c_str();
+}
+
+const char *pass_mode_as_string(PassMode mode)
+{
+ switch (mode) {
+ case PassMode::NOISY:
+ return "NOISY";
+ case PassMode::DENOISED:
+ return "DENOISED";
+ }
+
+ LOG(DFATAL) << "Unhandled pass mode " << static_cast<int>(mode) << ", should never happen.";
+ return "UNKNOWN";
+}
+
+std::ostream &operator<<(std::ostream &os, PassMode mode)
+{
+ os << pass_mode_as_string(mode);
+ return os;
+}
+
+const NodeEnum *Pass::get_type_enum()
+{
+ static NodeEnum pass_type_enum;
+
+ if (pass_type_enum.empty()) {
+
+ /* Light Passes. */
+ pass_type_enum.insert("combined", PASS_COMBINED);
+ pass_type_enum.insert("emission", PASS_EMISSION);
+ pass_type_enum.insert("background", PASS_BACKGROUND);
+ pass_type_enum.insert("ao", PASS_AO);
+ pass_type_enum.insert("shadow", PASS_SHADOW);
+ pass_type_enum.insert("diffuse", PASS_DIFFUSE);
+ pass_type_enum.insert("diffuse_direct", PASS_DIFFUSE_DIRECT);
+ pass_type_enum.insert("diffuse_indirect", PASS_DIFFUSE_INDIRECT);
+ pass_type_enum.insert("glossy", PASS_GLOSSY);
+ pass_type_enum.insert("glossy_direct", PASS_GLOSSY_DIRECT);
+ pass_type_enum.insert("glossy_indirect", PASS_GLOSSY_INDIRECT);
+ pass_type_enum.insert("transmission", PASS_TRANSMISSION);
+ pass_type_enum.insert("transmission_direct", PASS_TRANSMISSION_DIRECT);
+ pass_type_enum.insert("transmission_indirect", PASS_TRANSMISSION_INDIRECT);
+ pass_type_enum.insert("volume", PASS_VOLUME);
+ pass_type_enum.insert("volume_direct", PASS_VOLUME_DIRECT);
+ pass_type_enum.insert("volume_indirect", PASS_VOLUME_INDIRECT);
+
+ /* Data passes. */
+ pass_type_enum.insert("depth", PASS_DEPTH);
+ pass_type_enum.insert("position", PASS_POSITION);
+ pass_type_enum.insert("normal", PASS_NORMAL);
+ pass_type_enum.insert("roughness", PASS_ROUGHNESS);
+ pass_type_enum.insert("uv", PASS_UV);
+ pass_type_enum.insert("object_id", PASS_OBJECT_ID);
+ pass_type_enum.insert("material_id", PASS_MATERIAL_ID);
+ pass_type_enum.insert("motion", PASS_MOTION);
+ pass_type_enum.insert("motion_weight", PASS_MOTION_WEIGHT);
+ pass_type_enum.insert("render_time", PASS_RENDER_TIME);
+ pass_type_enum.insert("cryptomatte", PASS_CRYPTOMATTE);
+ pass_type_enum.insert("aov_color", PASS_AOV_COLOR);
+ pass_type_enum.insert("aov_value", PASS_AOV_VALUE);
+ pass_type_enum.insert("adaptive_aux_buffer", PASS_ADAPTIVE_AUX_BUFFER);
+ pass_type_enum.insert("sample_count", PASS_SAMPLE_COUNT);
+ pass_type_enum.insert("diffuse_color", PASS_DIFFUSE_COLOR);
+ pass_type_enum.insert("glossy_color", PASS_GLOSSY_COLOR);
+ pass_type_enum.insert("transmission_color", PASS_TRANSMISSION_COLOR);
+ pass_type_enum.insert("mist", PASS_MIST);
+ pass_type_enum.insert("denoising_normal", PASS_DENOISING_NORMAL);
+ pass_type_enum.insert("denoising_albedo", PASS_DENOISING_ALBEDO);
+
+ pass_type_enum.insert("shadow_catcher", PASS_SHADOW_CATCHER);
+ pass_type_enum.insert("shadow_catcher_sample_count", PASS_SHADOW_CATCHER_SAMPLE_COUNT);
+ pass_type_enum.insert("shadow_catcher_matte", PASS_SHADOW_CATCHER_MATTE);
+
+ pass_type_enum.insert("bake_primitive", PASS_BAKE_PRIMITIVE);
+ pass_type_enum.insert("bake_differential", PASS_BAKE_DIFFERENTIAL);
+ }
+
+ return &pass_type_enum;
+}
+
+const NodeEnum *Pass::get_mode_enum()
+{
+ static NodeEnum pass_mode_enum;
+
+ if (pass_mode_enum.empty()) {
+ pass_mode_enum.insert("noisy", static_cast<int>(PassMode::NOISY));
+ pass_mode_enum.insert("denoised", static_cast<int>(PassMode::DENOISED));
+ }
+
+ return &pass_mode_enum;
+}
+
+NODE_DEFINE(Pass)
+{
+ NodeType *type = NodeType::add("pass", create);
+
+ const NodeEnum *pass_type_enum = get_type_enum();
+ const NodeEnum *pass_mode_enum = get_mode_enum();
+
+ SOCKET_ENUM(type, "Type", *pass_type_enum, PASS_COMBINED);
+ SOCKET_ENUM(mode, "Mode", *pass_mode_enum, static_cast<int>(PassMode::DENOISED));
+ SOCKET_STRING(name, "Name", ustring());
+ SOCKET_BOOLEAN(include_albedo, "Include Albedo", false);
+
+ return type;
+}
+
+Pass::Pass() : Node(get_node_type()), is_auto_(false)
+{
+}
+
+PassInfo Pass::get_info() const
+{
+ return get_info(type, include_albedo);
+}
+
+bool Pass::is_written() const
+{
+ return get_info().is_written;
+}
+
+PassInfo Pass::get_info(const PassType type, const bool include_albedo)
+{
+ PassInfo pass_info;
+
+ pass_info.use_filter = true;
+ pass_info.use_exposure = false;
+ pass_info.divide_type = PASS_NONE;
+ pass_info.use_compositing = false;
+ pass_info.use_denoising_albedo = true;
+
+ switch (type) {
+ case PASS_NONE:
+ pass_info.num_components = 0;
+ break;
+ case PASS_COMBINED:
+ pass_info.num_components = 4;
+ pass_info.use_exposure = true;
+ pass_info.support_denoise = true;
+ break;
+ case PASS_DEPTH:
+ pass_info.num_components = 1;
+ pass_info.use_filter = false;
+ break;
+ case PASS_MIST:
+ pass_info.num_components = 1;
+ break;
+ case PASS_POSITION:
+ pass_info.num_components = 3;
+ break;
+ case PASS_NORMAL:
+ pass_info.num_components = 3;
+ break;
+ case PASS_ROUGHNESS:
+ pass_info.num_components = 1;
+ break;
+ case PASS_UV:
+ pass_info.num_components = 3;
+ break;
+ case PASS_MOTION:
+ pass_info.num_components = 4;
+ pass_info.divide_type = PASS_MOTION_WEIGHT;
+ break;
+ case PASS_MOTION_WEIGHT:
+ pass_info.num_components = 1;
+ break;
+ case PASS_OBJECT_ID:
+ case PASS_MATERIAL_ID:
+ pass_info.num_components = 1;
+ pass_info.use_filter = false;
+ break;
+
+ case PASS_EMISSION:
+ case PASS_BACKGROUND:
+ pass_info.num_components = 3;
+ pass_info.use_exposure = true;
+ break;
+ case PASS_AO:
+ pass_info.num_components = 3;
+ break;
+ case PASS_SHADOW:
+ pass_info.num_components = 3;
+ pass_info.use_exposure = false;
+ break;
+ case PASS_RENDER_TIME:
+ /* This pass is handled entirely on the host side. */
+ pass_info.num_components = 0;
+ break;
+
+ case PASS_DIFFUSE_COLOR:
+ case PASS_GLOSSY_COLOR:
+ case PASS_TRANSMISSION_COLOR:
+ pass_info.num_components = 3;
+ break;
+ case PASS_DIFFUSE:
+ pass_info.num_components = 3;
+ pass_info.use_exposure = true;
+ pass_info.direct_type = PASS_DIFFUSE_DIRECT;
+ pass_info.indirect_type = PASS_DIFFUSE_INDIRECT;
+ pass_info.divide_type = (!include_albedo) ? PASS_DIFFUSE_COLOR : PASS_NONE;
+ pass_info.use_compositing = true;
+ pass_info.is_written = false;
+ break;
+ case PASS_DIFFUSE_DIRECT:
+ case PASS_DIFFUSE_INDIRECT:
+ pass_info.num_components = 3;
+ pass_info.use_exposure = true;
+ pass_info.divide_type = (!include_albedo) ? PASS_DIFFUSE_COLOR : PASS_NONE;
+ pass_info.use_compositing = true;
+ break;
+ case PASS_GLOSSY:
+ pass_info.num_components = 3;
+ pass_info.use_exposure = true;
+ pass_info.direct_type = PASS_GLOSSY_DIRECT;
+ pass_info.indirect_type = PASS_GLOSSY_INDIRECT;
+ pass_info.divide_type = (!include_albedo) ? PASS_GLOSSY_COLOR : PASS_NONE;
+ pass_info.use_compositing = true;
+ pass_info.is_written = false;
+ break;
+ case PASS_GLOSSY_DIRECT:
+ case PASS_GLOSSY_INDIRECT:
+ pass_info.num_components = 3;
+ pass_info.use_exposure = true;
+ pass_info.divide_type = (!include_albedo) ? PASS_GLOSSY_COLOR : PASS_NONE;
+ pass_info.use_compositing = true;
+ break;
+ case PASS_TRANSMISSION:
+ pass_info.num_components = 3;
+ pass_info.use_exposure = true;
+ pass_info.direct_type = PASS_TRANSMISSION_DIRECT;
+ pass_info.indirect_type = PASS_TRANSMISSION_INDIRECT;
+ pass_info.divide_type = (!include_albedo) ? PASS_TRANSMISSION_COLOR : PASS_NONE;
+ pass_info.use_compositing = true;
+ pass_info.is_written = false;
+ break;
+ case PASS_TRANSMISSION_DIRECT:
+ case PASS_TRANSMISSION_INDIRECT:
+ pass_info.num_components = 3;
+ pass_info.use_exposure = true;
+ pass_info.divide_type = (!include_albedo) ? PASS_TRANSMISSION_COLOR : PASS_NONE;
+ pass_info.use_compositing = true;
+ break;
+ case PASS_VOLUME:
+ pass_info.num_components = 3;
+ pass_info.use_exposure = true;
+ pass_info.direct_type = PASS_VOLUME_DIRECT;
+ pass_info.indirect_type = PASS_VOLUME_INDIRECT;
+ pass_info.use_compositing = true;
+ pass_info.is_written = false;
+ break;
+ case PASS_VOLUME_DIRECT:
+ case PASS_VOLUME_INDIRECT:
+ pass_info.num_components = 3;
+ pass_info.use_exposure = true;
+ break;
+
+ case PASS_CRYPTOMATTE:
+ pass_info.num_components = 4;
+ break;
+
+ case PASS_DENOISING_NORMAL:
+ pass_info.num_components = 3;
+ break;
+ case PASS_DENOISING_ALBEDO:
+ pass_info.num_components = 3;
+ break;
+
+ case PASS_SHADOW_CATCHER:
+ pass_info.num_components = 3;
+ pass_info.use_exposure = true;
+ pass_info.use_compositing = true;
+ pass_info.use_denoising_albedo = false;
+ pass_info.support_denoise = true;
+ break;
+ case PASS_SHADOW_CATCHER_SAMPLE_COUNT:
+ pass_info.num_components = 1;
+ break;
+ case PASS_SHADOW_CATCHER_MATTE:
+ pass_info.num_components = 4;
+ pass_info.use_exposure = true;
+ pass_info.support_denoise = true;
+ /* Without the shadow catcher approximation, compositing is not needed.
+ * Since we don't know here whether the approximation is used or not, leave the decision up to
+ * the caller, which will know that. */
+ break;
+
+ case PASS_ADAPTIVE_AUX_BUFFER:
+ pass_info.num_components = 4;
+ break;
+ case PASS_SAMPLE_COUNT:
+ pass_info.num_components = 1;
+ pass_info.use_exposure = false;
+ break;
+
+ case PASS_AOV_COLOR:
+ pass_info.num_components = 3;
+ break;
+ case PASS_AOV_VALUE:
+ pass_info.num_components = 1;
+ break;
+
+ case PASS_BAKE_PRIMITIVE:
+ case PASS_BAKE_DIFFERENTIAL:
+ pass_info.num_components = 4;
+ pass_info.use_exposure = false;
+ pass_info.use_filter = false;
+ break;
+
+ case PASS_CATEGORY_LIGHT_END:
+ case PASS_CATEGORY_DATA_END:
+ case PASS_CATEGORY_BAKE_END:
+ case PASS_NUM:
+ LOG(DFATAL) << "Unexpected pass type is used " << type;
+ pass_info.num_components = 0;
+ break;
+ }
+
+ return pass_info;
+}
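For reference, a minimal sketch (not part of the patch) of how a consumer of PassInfo might branch on the fields filled in above; num_samples and exposure are hypothetical values supplied by the caller:

  const PassInfo info = Pass::get_info(PASS_DIFFUSE_DIRECT, /*include_albedo*/ false);
  float scale = 1.0f / num_samples;
  if (info.use_exposure) {
    scale *= exposure;
  }
  if (info.divide_type != PASS_NONE) {
    /* Stored pixels still need division by the divide_type pass
     * (here PASS_DIFFUSE_COLOR) before they are presentable. */
  }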
+
+bool Pass::contains(const vector<Pass *> &passes, PassType type)
+{
+ for (const Pass *pass : passes) {
+ if (pass->get_type() != type) {
+ continue;
+ }
+
+ return true;
+ }
+
+ return false;
+}
+
+const Pass *Pass::find(const vector<Pass *> &passes, const string &name)
+{
+ for (const Pass *pass : passes) {
+ if (pass->get_name() == name) {
+ return pass;
+ }
+ }
+
+ return nullptr;
+}
+
+const Pass *Pass::find(const vector<Pass *> &passes, PassType type, PassMode mode)
+{
+ for (const Pass *pass : passes) {
+ if (pass->get_type() != type || pass->get_mode() != mode) {
+ continue;
+ }
+
+ return pass;
+ }
+
+ return nullptr;
+}
+
+int Pass::get_offset(const vector<Pass *> &passes, const Pass *pass)
+{
+ int pass_offset = 0;
+
+ for (const Pass *current_pass : passes) {
+ /* Note that the pass name is allowed to be empty, which is why type and mode are checked in addition to the name. */
+ if (current_pass->get_type() == pass->get_type() &&
+ current_pass->get_mode() == pass->get_mode() &&
+ current_pass->get_name() == pass->get_name()) {
+ if (current_pass->is_written()) {
+ return pass_offset;
+ }
+ else {
+ return PASS_UNUSED;
+ }
+ }
+ if (current_pass->is_written()) {
+ pass_offset += current_pass->get_info().num_components;
+ }
+ }
+
+ return PASS_UNUSED;
+}
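A minimal usage sketch (not part of the patch), assuming a Scene whose passes vector is declared as in the scene.h changes further below:

  const Pass *combined = Pass::find(scene->passes, PASS_COMBINED, PassMode::DENOISED);
  if (combined != nullptr) {
    const int offset = Pass::get_offset(scene->passes, combined);
    if (offset != PASS_UNUSED) {
      /* Pixels of this pass start at buffer + offset, with
       * combined->get_info().num_components floats per pixel. */
    }
  }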
+
+std::ostream &operator<<(std::ostream &os, const Pass &pass)
+{
+ os << "type: " << pass_type_as_string(pass.get_type());
+ os << ", name: \"" << pass.get_name() << "\"";
+ os << ", mode: " << pass.get_mode();
+ os << ", is_written: " << string_from_bool(pass.is_written());
+
+ return os;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/render/pass.h b/intern/cycles/render/pass.h
new file mode 100644
index 00000000000..82230c62cb0
--- /dev/null
+++ b/intern/cycles/render/pass.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <ostream> // NOLINT
+
+#include "util/util_string.h"
+#include "util/util_vector.h"
+
+#include "kernel/kernel_types.h"
+
+#include "graph/node.h"
+
+CCL_NAMESPACE_BEGIN
+
+const char *pass_type_as_string(const PassType type);
+
+enum class PassMode {
+ NOISY,
+ DENOISED,
+};
+const char *pass_mode_as_string(PassMode mode);
+std::ostream &operator<<(std::ostream &os, PassMode mode);
+
+struct PassInfo {
+ int num_components = -1;
+ bool use_filter = false;
+ bool use_exposure = false;
+ bool is_written = true;
+ PassType divide_type = PASS_NONE;
+ PassType direct_type = PASS_NONE;
+ PassType indirect_type = PASS_NONE;
+
+ /* Reading the pass can not happen directly and needs some sort of compositing (for example,
+ * light passes due to divide_type, or the shadow catcher pass). */
+ bool use_compositing = false;
+
+ /* Used to disable albedo pass for denoising.
+ * Light and shadow catcher passes should not have discontinuity in the denoised result based on
+ * the underlying albedo. */
+ bool use_denoising_albedo = true;
+
+ /* Pass supports denoising. */
+ bool support_denoise = false;
+};
+
+class Pass : public Node {
+ public:
+ NODE_DECLARE
+
+ NODE_SOCKET_API(PassType, type)
+ NODE_SOCKET_API(PassMode, mode)
+ NODE_SOCKET_API(ustring, name)
+ NODE_SOCKET_API(bool, include_albedo)
+
+ Pass();
+
+ PassInfo get_info() const;
+
+ /* The pass is written by the render pipeline (kernel or denoiser). If the pass is written it
+ * will have pixels allocated in a RenderBuffer. Passes which are not written do not have their
+ * pixels allocated to save memory. */
+ bool is_written() const;
+
+ protected:
+ /* The pass has been created automatically as a requirement of various rendering functionality
+ * (such as adaptive sampling). */
+ bool is_auto_;
+
+ public:
+ static const NodeEnum *get_type_enum();
+ static const NodeEnum *get_mode_enum();
+
+ static PassInfo get_info(PassType type, const bool include_albedo = false);
+
+ static bool contains(const vector<Pass *> &passes, PassType type);
+
+ /* Returns nullptr if there is no pass with the given name or type+mode. */
+ static const Pass *find(const vector<Pass *> &passes, const string &name);
+ static const Pass *find(const vector<Pass *> &passes,
+ PassType type,
+ PassMode mode = PassMode::NOISY);
+
+ /* Returns PASS_UNUSED if there is no corresponding pass. */
+ static int get_offset(const vector<Pass *> &passes, const Pass *pass);
+
+ friend class Film;
+};
+
+std::ostream &operator<<(std::ostream &os, const Pass &pass);
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/render/scene.cpp b/intern/cycles/render/scene.cpp
index c4e7d2c79d6..a4b030190dc 100644
--- a/intern/cycles/render/scene.cpp
+++ b/intern/cycles/render/scene.cpp
@@ -163,12 +163,15 @@ void Scene::free_memory(bool final)
delete p;
foreach (Light *l, lights)
delete l;
+ foreach (Pass *p, passes)
+ delete p;
geometry.clear();
objects.clear();
lights.clear();
particle_systems.clear();
procedurals.clear();
+ passes.clear();
if (device) {
camera->device_free(device, &dscene, this);
@@ -253,7 +256,6 @@ void Scene::device_update(Device *device_, Progress &progress)
* - Camera may be used for adaptive subdivision.
* - Displacement shader must have all shader data available.
* - Light manager needs lookup tables and final mesh data to compute emission CDF.
- * - Film needs light manager to run for use_light_visibility
* - Lookup tables are done a second time to handle film tables
*/
@@ -469,88 +471,110 @@ void Scene::enable_update_stats()
}
}
-DeviceRequestedFeatures Scene::get_requested_device_features()
+void Scene::update_kernel_features()
{
- DeviceRequestedFeatures requested_features;
+ if (!need_update()) {
+ return;
+ }
- shader_manager->get_requested_features(this, &requested_features);
+ /* These features are not being tweaked as often as shaders,
+ * so selective handling could be done for the viewport as well. */
+ uint kernel_features = shader_manager->get_kernel_features(this);
- /* This features are not being tweaked as often as shaders,
- * so could be done selective magic for the viewport as well.
- */
bool use_motion = need_motion() == Scene::MotionType::MOTION_BLUR;
- requested_features.use_hair = false;
- requested_features.use_hair_thick = (params.hair_shape == CURVE_THICK);
- requested_features.use_object_motion = false;
- requested_features.use_camera_motion = use_motion && camera->use_motion();
+ kernel_features |= KERNEL_FEATURE_PATH_TRACING;
+ if (params.hair_shape == CURVE_THICK) {
+ kernel_features |= KERNEL_FEATURE_HAIR_THICK;
+ }
+ if (use_motion && camera->use_motion()) {
+ kernel_features |= KERNEL_FEATURE_CAMERA_MOTION;
+ }
foreach (Object *object, objects) {
Geometry *geom = object->get_geometry();
if (use_motion) {
- requested_features.use_object_motion |= object->use_motion() | geom->get_use_motion_blur();
- requested_features.use_camera_motion |= geom->get_use_motion_blur();
+ if (object->use_motion() || geom->get_use_motion_blur()) {
+ kernel_features |= KERNEL_FEATURE_OBJECT_MOTION;
+ }
+ if (geom->get_use_motion_blur()) {
+ kernel_features |= KERNEL_FEATURE_CAMERA_MOTION;
+ }
}
if (object->get_is_shadow_catcher()) {
- requested_features.use_shadow_tricks = true;
+ kernel_features |= KERNEL_FEATURE_SHADOW_CATCHER;
}
if (geom->is_mesh()) {
Mesh *mesh = static_cast<Mesh *>(geom);
#ifdef WITH_OPENSUBDIV
if (mesh->get_subdivision_type() != Mesh::SUBDIVISION_NONE) {
- requested_features.use_patch_evaluation = true;
+ kernel_features |= KERNEL_FEATURE_PATCH_EVALUATION;
}
#endif
- requested_features.use_true_displacement |= mesh->has_true_displacement();
}
else if (geom->is_hair()) {
- requested_features.use_hair = true;
+ kernel_features |= KERNEL_FEATURE_HAIR;
}
}
- requested_features.use_background_light = light_manager->has_background_light(this);
-
- requested_features.use_baking = bake_manager->get_baking();
- requested_features.use_integrator_branched = (integrator->get_method() ==
- Integrator::BRANCHED_PATH);
- if (film->get_denoising_data_pass()) {
- requested_features.use_denoising = true;
- requested_features.use_shadow_tricks = true;
+ if (bake_manager->get_baking()) {
+ kernel_features |= KERNEL_FEATURE_BAKING;
}
- return requested_features;
-}
+ kernel_features |= film->get_kernel_features(this);
-bool Scene::update(Progress &progress, bool &kernel_switch_needed)
-{
- /* update scene */
- if (need_update()) {
- /* Update max_closures. */
- KernelIntegrator *kintegrator = &dscene.data.integrator;
- if (params.background) {
- kintegrator->max_closures = get_max_closure_count();
- }
- else {
- /* Currently viewport render is faster with higher max_closures, needs investigating. */
- kintegrator->max_closures = MAX_CLOSURE;
- }
-
- /* Load render kernels, before device update where we upload data to the GPU. */
- bool new_kernels_needed = load_kernels(progress, false);
-
- progress.set_status("Updating Scene");
- MEM_GUARDED_CALL(&progress, device_update, device, progress);
+ dscene.data.kernel_features = kernel_features;
- DeviceKernelStatus kernel_switch_status = device->get_active_kernel_switch_state();
- kernel_switch_needed = kernel_switch_status == DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE ||
- kernel_switch_status == DEVICE_KERNEL_FEATURE_KERNEL_INVALID;
- if (new_kernels_needed || kernel_switch_needed) {
- progress.set_kernel_status("Compiling render kernels");
- device->wait_for_availability(loaded_kernel_features);
- progress.set_kernel_status("");
- }
+ /* Currently viewport render is faster with higher max_closures, needs investigating. */
+ const uint max_closures = (params.background) ? get_max_closure_count() : MAX_CLOSURE;
+ dscene.data.max_closures = max_closures;
+ dscene.data.max_shaders = shaders.size();
+}
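Once update_kernel_features() has run, the accumulated bitmask can be tested wherever the device data is visible; a small sketch (not part of the patch):

  if (dscene.data.kernel_features & KERNEL_FEATURE_SHADOW_CATCHER) {
    /* Shadow catcher support has to be compiled into the kernels for this scene. */
  }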
- return true;
+bool Scene::update(Progress &progress)
+{
+ if (!need_update()) {
+ return false;
}
- return false;
+
+ /* Load render kernels, before device update where we upload data to the GPU. */
+ load_kernels(progress, false);
+
+ /* Upload scene data to the GPU. */
+ progress.set_status("Updating Scene");
+ MEM_GUARDED_CALL(&progress, device_update, device, progress);
+
+ return true;
+}
+
+static void log_kernel_features(const uint features)
+{
+ VLOG(2) << "Requested features:\n";
+ VLOG(2) << "Use BSDF " << string_from_bool(features & KERNEL_FEATURE_NODE_BSDF) << "\n";
+ VLOG(2) << "Use Principled BSDF " << string_from_bool(features & KERNEL_FEATURE_PRINCIPLED)
+ << "\n";
+ VLOG(2) << "Use Emission " << string_from_bool(features & KERNEL_FEATURE_NODE_EMISSION) << "\n";
+ VLOG(2) << "Use Volume " << string_from_bool(features & KERNEL_FEATURE_NODE_VOLUME) << "\n";
+ VLOG(2) << "Use Hair " << string_from_bool(features & KERNEL_FEATURE_NODE_HAIR) << "\n";
+ VLOG(2) << "Use Bump " << string_from_bool(features & KERNEL_FEATURE_NODE_BUMP) << "\n";
+ VLOG(2) << "Use Voronoi " << string_from_bool(features & KERNEL_FEATURE_NODE_VORONOI_EXTRA)
+ << "\n";
+ VLOG(2) << "Use Shader Raytrace " << string_from_bool(features & KERNEL_FEATURE_NODE_RAYTRACE)
+ << "\n";
+ VLOG(2) << "Use Transparent " << string_from_bool(features & KERNEL_FEATURE_TRANSPARENT) << "\n";
+ VLOG(2) << "Use Denoising " << string_from_bool(features & KERNEL_FEATURE_DENOISING) << "\n";
+ VLOG(2) << "Use Path Tracing " << string_from_bool(features & KERNEL_FEATURE_PATH_TRACING)
+ << "\n";
+ VLOG(2) << "Use Hair " << string_from_bool(features & KERNEL_FEATURE_HAIR) << "\n";
+ VLOG(2) << "Use Object Motion " << string_from_bool(features & KERNEL_FEATURE_OBJECT_MOTION)
+ << "\n";
+ VLOG(2) << "Use Camera Motion " << string_from_bool(features & KERNEL_FEATURE_CAMERA_MOTION)
+ << "\n";
+ VLOG(2) << "Use Baking " << string_from_bool(features & KERNEL_FEATURE_BAKING) << "\n";
+ VLOG(2) << "Use Subsurface " << string_from_bool(features & KERNEL_FEATURE_SUBSURFACE) << "\n";
+ VLOG(2) << "Use Volume " << string_from_bool(features & KERNEL_FEATURE_VOLUME) << "\n";
+ VLOG(2) << "Use Patch Evaluation "
+ << string_from_bool(features & KERNEL_FEATURE_PATCH_EVALUATION) << "\n";
+ VLOG(2) << "Use Shadow Catcher " << string_from_bool(features & KERNEL_FEATURE_SHADOW_CATCHER)
+ << "\n";
}
bool Scene::load_kernels(Progress &progress, bool lock_scene)
@@ -560,15 +584,15 @@ bool Scene::load_kernels(Progress &progress, bool lock_scene)
scene_lock = thread_scoped_lock(mutex);
}
- DeviceRequestedFeatures requested_features = get_requested_device_features();
+ const uint kernel_features = dscene.data.kernel_features;
- if (!kernels_loaded || loaded_kernel_features.modified(requested_features)) {
+ if (!kernels_loaded || loaded_kernel_features != kernel_features) {
progress.set_status("Loading render kernels (may take a few minutes the first time)");
scoped_timer timer;
- VLOG(2) << "Requested features:\n" << requested_features;
- if (!device->load_kernels(requested_features)) {
+ log_kernel_features(kernel_features);
+ if (!device->load_kernels(kernel_features)) {
string message = device->error_message();
if (message.empty())
message = "Failed loading render kernel, see console for errors";
@@ -580,7 +604,7 @@ bool Scene::load_kernels(Progress &progress, bool lock_scene)
}
kernels_loaded = true;
- loaded_kernel_features = requested_features;
+ loaded_kernel_features = kernel_features;
return true;
}
return false;
@@ -618,6 +642,28 @@ int Scene::get_max_closure_count()
return max_closure_global;
}
+bool Scene::has_shadow_catcher()
+{
+ if (shadow_catcher_modified_) {
+ has_shadow_catcher_ = false;
+ for (Object *object : objects) {
+ if (object->get_is_shadow_catcher()) {
+ has_shadow_catcher_ = true;
+ break;
+ }
+ }
+
+ shadow_catcher_modified_ = false;
+ }
+
+ return has_shadow_catcher_;
+}
+
+void Scene::tag_shadow_catcher_modified()
+{
+ shadow_catcher_modified_ = true;
+}
+
template<> Light *Scene::create_node<Light>()
{
Light *node = new Light();
@@ -694,6 +740,15 @@ template<> AlembicProcedural *Scene::create_node<AlembicProcedural>()
#endif
}
+template<> Pass *Scene::create_node<Pass>()
+{
+ Pass *node = new Pass();
+ node->set_owner(this);
+ passes.push_back(node);
+ film->tag_modified();
+ return node;
+}
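A usage sketch (not part of the patch): requesting an extra denoised AOV color pass from the scene. The set_* calls are assumed to be generated by the NODE_SOCKET_API declarations in pass.h:

  Pass *aov = scene->create_node<Pass>();
  aov->set_type(PASS_AOV_COLOR);
  aov->set_mode(PassMode::DENOISED);
  aov->set_name(ustring("my_aov"));
  /* The specialization above already tags the film as modified, so the next
   * Scene::update() will account for the new pass. */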
+
template<typename T> void delete_node_from_array(vector<T> &nodes, T node)
{
for (size_t i = 0; i < nodes.size(); ++i) {
@@ -779,6 +834,12 @@ template<> void Scene::delete_node_impl(AlembicProcedural *node)
#endif
}
+template<> void Scene::delete_node_impl(Pass *node)
+{
+ delete_node_from_array(passes, node);
+ film->tag_modified();
+}
+
template<typename T>
static void remove_nodes_in_set(const set<T *> &nodes_set,
vector<T *> &nodes_array,
@@ -842,4 +903,10 @@ template<> void Scene::delete_nodes(const set<Procedural *> &nodes, const NodeOw
procedural_manager->tag_update();
}
+template<> void Scene::delete_nodes(const set<Pass *> &nodes, const NodeOwner *owner)
+{
+ remove_nodes_in_set(nodes, passes, owner);
+ film->tag_modified();
+}
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/render/scene.h b/intern/cycles/render/scene.h
index 7d8a6774381..cf4a3ba6b12 100644
--- a/intern/cycles/render/scene.h
+++ b/intern/cycles/render/scene.h
@@ -128,7 +128,7 @@ class DeviceScene {
device_vector<float> lookup_table;
/* integrator */
- device_vector<uint> sample_pattern_lut;
+ device_vector<float> sample_pattern_lut;
/* ies lights */
device_vector<float> ies_lights;
@@ -142,27 +142,6 @@ class DeviceScene {
class SceneParams {
public:
- /* Type of BVH, in terms whether it is supported dynamic updates of meshes
- * or whether modifying geometry requires full BVH rebuild.
- */
- enum BVHType {
- /* BVH supports dynamic updates of geometry.
- *
- * Faster for updating BVH tree when doing modifications in viewport,
- * but slower for rendering.
- */
- BVH_DYNAMIC = 0,
- /* BVH tree is calculated for specific scene, updates in geometry
- * requires full tree rebuild.
- *
- * Slower to update BVH tree when modifying objects in viewport, also
- * slower to build final BVH tree but gives best possible render speed.
- */
- BVH_STATIC = 1,
-
- BVH_NUM_TYPES,
- };
-
ShadingSystem shadingsystem;
/* Requested BVH layout.
@@ -186,7 +165,7 @@ class SceneParams {
{
shadingsystem = SHADINGSYSTEM_SVM;
bvh_layout = BVH_LAYOUT_BVH2;
- bvh_type = BVH_DYNAMIC;
+ bvh_type = BVH_TYPE_DYNAMIC;
use_bvh_spatial_split = false;
use_bvh_unaligned_nodes = true;
num_bvh_time_steps = 0;
@@ -196,7 +175,7 @@ class SceneParams {
background = true;
}
- bool modified(const SceneParams &params)
+ bool modified(const SceneParams &params) const
{
return !(shadingsystem == params.shadingsystem && bvh_layout == params.bvh_layout &&
bvh_type == params.bvh_type &&
@@ -236,7 +215,7 @@ class Scene : public NodeOwner {
vector<Shader *> shaders;
vector<Light *> lights;
vector<ParticleSystem *> particle_systems;
- vector<Pass> passes;
+ vector<Pass *> passes;
vector<Procedural *> procedurals;
/* data managers */
@@ -291,7 +270,11 @@ class Scene : public NodeOwner {
void enable_update_stats();
- bool update(Progress &progress, bool &kernel_switch_needed);
+ void update_kernel_features();
+ bool update(Progress &progress);
+
+ bool has_shadow_catcher();
+ void tag_shadow_catcher_modified();
/* This function is used to create a node of a specified type instead of
* calling 'new', and sets the scene as the owner of the node.
@@ -348,13 +331,12 @@ class Scene : public NodeOwner {
void free_memory(bool final);
bool kernels_loaded;
- DeviceRequestedFeatures loaded_kernel_features;
+ uint loaded_kernel_features;
bool load_kernels(Progress &progress, bool lock_scene = true);
- /* ** Split kernel routines ** */
-
- DeviceRequestedFeatures get_requested_device_features();
+ bool has_shadow_catcher_ = false;
+ bool shadow_catcher_modified_ = true;
/* Maximum number of closure during session lifetime. */
int max_closure_global;
@@ -384,6 +366,8 @@ template<> Shader *Scene::create_node<Shader>();
template<> AlembicProcedural *Scene::create_node<AlembicProcedural>();
+template<> Pass *Scene::create_node<Pass>();
+
template<> void Scene::delete_node_impl(Light *node);
template<> void Scene::delete_node_impl(Mesh *node);
@@ -404,6 +388,8 @@ template<> void Scene::delete_node_impl(Procedural *node);
template<> void Scene::delete_node_impl(AlembicProcedural *node);
+template<> void Scene::delete_node_impl(Pass *node);
+
template<> void Scene::delete_nodes(const set<Light *> &nodes, const NodeOwner *owner);
template<> void Scene::delete_nodes(const set<Geometry *> &nodes, const NodeOwner *owner);
@@ -416,6 +402,8 @@ template<> void Scene::delete_nodes(const set<Shader *> &nodes, const NodeOwner
template<> void Scene::delete_nodes(const set<Procedural *> &nodes, const NodeOwner *owner);
+template<> void Scene::delete_nodes(const set<Pass *> &nodes, const NodeOwner *owner);
+
CCL_NAMESPACE_END
#endif /* __SCENE_H__ */
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index 1b91c49f0ea..47eeffd97fe 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -17,10 +17,15 @@
#include <limits.h>
#include <string.h>
+#include "device/cpu/device.h"
#include "device/device.h"
+#include "integrator/pass_accessor_cpu.h"
+#include "integrator/path_trace.h"
+#include "render/background.h"
#include "render/bake.h"
#include "render/buffers.h"
#include "render/camera.h"
+#include "render/gpu_display.h"
#include "render/graph.h"
#include "render/integrator.h"
#include "render/light.h"
@@ -39,70 +44,63 @@
CCL_NAMESPACE_BEGIN
-/* Note about preserve_tile_device option for tile manager:
- * progressive refine and viewport rendering does requires tiles to
- * always be allocated for the same device
- */
-Session::Session(const SessionParams &params_)
- : params(params_),
- tile_manager(params.progressive,
- params.samples,
- params.tile_size,
- params.start_resolution,
- params.background == false || params.progressive_refine,
- params.background,
- params.tile_order,
- max(params.device.multi_devices.size(), 1),
- params.pixel_size),
- stats(),
- profiler()
+Session::Session(const SessionParams &params_, const SceneParams &scene_params)
+ : params(params_), render_scheduler_(tile_manager_, params)
{
- device_use_gl_ = ((params.device.type != DEVICE_CPU) && !params.background);
-
TaskScheduler::init(params.threads);
- session_thread_ = NULL;
- scene = NULL;
-
- reset_time_ = 0.0;
- last_update_time_ = 0.0;
+ session_thread_ = nullptr;
delayed_reset_.do_reset = false;
- delayed_reset_.samples = 0;
-
- display_outdated_ = false;
- gpu_draw_ready_ = false;
- gpu_need_display_buffer_update_ = false;
pause_ = false;
cancel_ = false;
new_work_added_ = false;
- buffers = NULL;
- display = NULL;
+ device = Device::create(params.device, stats, profiler);
- /* Validate denoising parameters. */
- set_denoising(params.denoising);
+ scene = new Scene(scene_params, device);
- /* Create CPU/GPU devices. */
- device = Device::create(params.device, stats, profiler, params.background);
-
- if (!device->error_message().empty()) {
- progress.set_error(device->error_message());
- return;
- }
+ /* Configure path tracer. */
+ path_trace_ = make_unique<PathTrace>(
+ device, scene->film, &scene->dscene, render_scheduler_, tile_manager_);
+ path_trace_->set_progress(&progress);
+ path_trace_->tile_buffer_update_cb = [&]() {
+ if (!update_render_tile_cb) {
+ return;
+ }
+ update_render_tile_cb();
+ };
+ path_trace_->tile_buffer_write_cb = [&]() {
+ if (!write_render_tile_cb) {
+ return;
+ }
+ write_render_tile_cb();
+ };
+ path_trace_->tile_buffer_read_cb = [&]() -> bool {
+ if (!read_render_tile_cb) {
+ return false;
+ }
+ read_render_tile_cb();
+ return true;
+ };
+ path_trace_->progress_update_cb = [&]() { update_status_time(); };
- /* Create buffers for interactive rendering. */
- if (!(params.background && !params.write_render_cb)) {
- buffers = new RenderBuffers(device);
- display = new DisplayBuffer(device, params.display_buffer_linear);
- }
+ tile_manager_.full_buffer_written_cb = [&](string_view filename) {
+ if (!full_buffer_written_cb) {
+ return;
+ }
+ full_buffer_written_cb(filename);
+ };
}
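For context, a sketch (not part of the patch) of how a host application might wire the session-level callbacks that the lambdas above forward to; the callback signatures are inferred from how they are invoked here:

  session->full_buffer_written_cb = [](string_view filename) {
    /* Post-process or archive the full frame stored at `filename`. */
  };
  session->update_render_tile_cb = []() {
    /* Push the partially rendered result to the host UI. */
  };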
Session::~Session()
{
cancel();
+ /* TODO(sergey): Bring the passes in viewport back.
+ * It is unclear why such an exception is needed, though. */
+#if 0
if (buffers && params.write_render_cb) {
/* Copy to display buffer and write out image if requested */
delete display;
@@ -116,12 +114,14 @@ Session::~Session()
uchar4 *pixels = display->rgba_byte.copy_from_device(0, w, h);
params.write_render_cb((uchar *)pixels, w, h, 4);
}
+#endif
- /* clean up */
- tile_manager.device_free();
+ /* Make sure the path tracer is destroyed before the device. This is needed because destruction
+ * might need to access the device to free device memory. */
+ /* TODO(sergey): Convert device to be unique_ptr, and rely on C++ to destruct objects in the
+ * pre-defined order. */
+ path_trace_.reset();
- delete buffers;
- delete display;
delete scene;
delete device;
@@ -135,15 +135,16 @@ void Session::start()
}
}
-void Session::cancel()
+void Session::cancel(bool quick)
{
+ if (quick && path_trace_) {
+ path_trace_->cancel();
+ }
+
if (session_thread_) {
/* wait for session thread to end */
progress.set_cancel("Exiting");
- gpu_need_display_buffer_update_ = false;
- gpu_need_display_buffer_update_cond_.notify_all();
-
{
thread_scoped_lock pause_lock(pause_mutex_);
pause_ = false;
@@ -157,570 +158,43 @@ void Session::cancel()
bool Session::ready_to_reset()
{
- double dt = time_dt() - reset_time_;
-
- if (!display_outdated_)
- return (dt > params.reset_timeout);
- else
- return (dt > params.cancel_timeout);
+ return path_trace_->ready_to_reset();
}
-/* GPU Session */
-
-void Session::reset_gpu(BufferParams &buffer_params, int samples)
+void Session::run_main_render_loop()
{
- thread_scoped_lock pause_lock(pause_mutex_);
-
- /* block for buffer access and reset immediately. we can't do this
- * in the thread, because we need to allocate an OpenGL buffer, and
- * that only works in the main thread */
- thread_scoped_lock display_lock(display_mutex_);
- thread_scoped_lock buffers_lock(buffers_mutex_);
+ path_trace_->clear_gpu_display();
- display_outdated_ = true;
- reset_time_ = time_dt();
+ while (true) {
+ RenderWork render_work = run_update_for_next_iteration();
- reset_(buffer_params, samples);
-
- gpu_need_display_buffer_update_ = false;
- gpu_need_display_buffer_update_cond_.notify_all();
-
- new_work_added_ = true;
-
- pause_cond_.notify_all();
-}
-
-bool Session::draw_gpu(BufferParams &buffer_params, DeviceDrawParams &draw_params)
-{
- /* block for buffer access */
- thread_scoped_lock display_lock(display_mutex_);
-
- /* first check we already rendered something */
- if (gpu_draw_ready_) {
- /* then verify the buffers have the expected size, so we don't
- * draw previous results in a resized window */
- if (buffer_params.width == display->params.width &&
- buffer_params.height == display->params.height) {
- /* for CUDA we need to do tone-mapping still, since we can
- * only access GL buffers from the main thread. */
- if (gpu_need_display_buffer_update_) {
- thread_scoped_lock buffers_lock(buffers_mutex_);
- copy_to_display_buffer(tile_manager.state.sample);
- gpu_need_display_buffer_update_ = false;
- gpu_need_display_buffer_update_cond_.notify_all();
+ if (!render_work) {
+ if (VLOG_IS_ON(2)) {
+ double total_time, render_time;
+ progress.get_time(total_time, render_time);
+ VLOG(2) << "Rendering in main loop is done in " << render_time << " seconds.";
+ VLOG(2) << path_trace_->full_report();
}
- display->draw(device, draw_params);
-
- if (display_outdated_ && (time_dt() - reset_time_) > params.text_timeout)
- return false;
-
- return true;
- }
- }
-
- return false;
-}
-
-void Session::run_gpu()
-{
- bool tiles_written = false;
-
- reset_time_ = time_dt();
- last_update_time_ = time_dt();
- last_display_time_ = last_update_time_;
-
- progress.set_render_start_time();
-
- while (!progress.get_cancel()) {
- const bool no_tiles = !run_update_for_next_iteration();
-
- if (no_tiles) {
if (params.background) {
- /* if no work left and in background mode, we can stop immediately */
+ /* if no work left and in background mode, we can stop immediately. */
progress.set_status("Finished");
break;
}
}
- if (run_wait_for_work(no_tiles)) {
- continue;
- }
-
- if (progress.get_cancel()) {
- break;
- }
-
- if (!no_tiles) {
- if (!device->error_message().empty())
- progress.set_error(device->error_message());
-
- if (progress.get_cancel())
- break;
-
- /* buffers mutex is locked entirely while rendering each
- * sample, and released/reacquired on each iteration to allow
- * reset and draw in between */
- thread_scoped_lock buffers_lock(buffers_mutex_);
-
- /* update status and timing */
- update_status_time();
-
- /* render */
- bool delayed_denoise = false;
- const bool need_denoise = render_need_denoise(delayed_denoise);
- render(need_denoise);
-
- device->task_wait();
-
- if (!device->error_message().empty())
- progress.set_cancel(device->error_message());
-
- /* update status and timing */
- update_status_time();
-
- gpu_need_display_buffer_update_ = !delayed_denoise;
- gpu_draw_ready_ = true;
- progress.set_update();
-
- /* wait for until display buffer is updated */
- if (!params.background) {
- while (gpu_need_display_buffer_update_) {
- if (progress.get_cancel())
- break;
-
- gpu_need_display_buffer_update_cond_.wait(buffers_lock);
- }
- }
-
- if (!device->error_message().empty())
- progress.set_error(device->error_message());
-
- tiles_written = update_progressive_refine(progress.get_cancel());
-
- if (progress.get_cancel())
- break;
- }
- }
-
- if (!tiles_written)
- update_progressive_refine(true);
-}
-
-/* CPU Session */
-
-void Session::reset_cpu(BufferParams &buffer_params, int samples)
-{
- thread_scoped_lock reset_lock(delayed_reset_.mutex);
- thread_scoped_lock pause_lock(pause_mutex_);
-
- display_outdated_ = true;
- reset_time_ = time_dt();
-
- delayed_reset_.params = buffer_params;
- delayed_reset_.samples = samples;
- delayed_reset_.do_reset = true;
- device->task_cancel();
-
- pause_cond_.notify_all();
-}
-
-bool Session::draw_cpu(BufferParams &buffer_params, DeviceDrawParams &draw_params)
-{
- thread_scoped_lock display_lock(display_mutex_);
-
- /* first check we already rendered something */
- if (display->draw_ready()) {
- /* then verify the buffers have the expected size, so we don't
- * draw previous results in a resized window */
- if (buffer_params.width == display->params.width &&
- buffer_params.height == display->params.height) {
- display->draw(device, draw_params);
-
- if (display_outdated_ && (time_dt() - reset_time_) > params.text_timeout)
- return false;
-
- return true;
- }
- }
-
- return false;
-}
-
-bool Session::steal_tile(RenderTile &rtile, Device *tile_device, thread_scoped_lock &tile_lock)
-{
- /* Devices that can get their tiles stolen don't steal tiles themselves.
- * Additionally, if there are no stealable tiles in flight, give up here. */
- if (tile_device->info.type == DEVICE_CPU || stealable_tiles_ == 0) {
- return false;
- }
-
- /* Wait until no other thread is trying to steal a tile. */
- while (tile_stealing_state_ != NOT_STEALING && stealable_tiles_ > 0) {
- /* Someone else is currently trying to get a tile.
- * Wait on the condition variable and try later. */
- tile_steal_cond_.wait(tile_lock);
- }
- /* If another thread stole the last stealable tile in the meantime, give up. */
- if (stealable_tiles_ == 0) {
- return false;
- }
-
- /* There are stealable tiles in flight, so signal that one should be released. */
- tile_stealing_state_ = WAITING_FOR_TILE;
-
- /* Wait until a device notices the signal and releases its tile. */
- while (tile_stealing_state_ != GOT_TILE && stealable_tiles_ > 0) {
- tile_steal_cond_.wait(tile_lock);
- }
- /* If the last stealable tile finished on its own, give up. */
- if (tile_stealing_state_ != GOT_TILE) {
- tile_stealing_state_ = NOT_STEALING;
- return false;
- }
-
- /* Successfully stole a tile, now move it to the new device. */
- rtile = stolen_tile_;
- rtile.buffers->buffer.move_device(tile_device);
- rtile.buffer = rtile.buffers->buffer.device_pointer;
- rtile.stealing_state = RenderTile::NO_STEALING;
- rtile.num_samples -= (rtile.sample - rtile.start_sample);
- rtile.start_sample = rtile.sample;
-
- tile_stealing_state_ = NOT_STEALING;
-
- /* Poke any threads which might be waiting for NOT_STEALING above. */
- tile_steal_cond_.notify_one();
-
- return true;
-}
-
-bool Session::get_tile_stolen()
-{
- /* If tile_stealing_state is WAITING_FOR_TILE, atomically set it to RELEASING_TILE
- * and return true. */
- TileStealingState expected = WAITING_FOR_TILE;
- return tile_stealing_state_.compare_exchange_weak(expected, RELEASING_TILE);
-}
-
-bool Session::acquire_tile(RenderTile &rtile, Device *tile_device, uint tile_types)
-{
- if (progress.get_cancel()) {
- if (params.progressive_refine == false) {
- /* for progressive refine current sample should be finished for all tiles */
- return false;
- }
- }
-
- thread_scoped_lock tile_lock(tile_mutex_);
-
- /* get next tile from manager */
- Tile *tile;
- int device_num = device->device_number(tile_device);
-
- while (!tile_manager.next_tile(tile, device_num, tile_types)) {
- /* Can only steal tiles on devices that support rendering
- * This is because denoising tiles cannot be stolen (see below)
- */
- if ((tile_types & (RenderTile::PATH_TRACE | RenderTile::BAKE)) &&
- steal_tile(rtile, tile_device, tile_lock)) {
- return true;
- }
-
- /* Wait for denoising tiles to become available */
- if ((tile_types & RenderTile::DENOISE) && !progress.get_cancel() && tile_manager.has_tiles()) {
- denoising_cond_.wait(tile_lock);
- continue;
- }
-
- return false;
- }
-
- /* fill render tile */
- rtile.x = tile_manager.state.buffer.full_x + tile->x;
- rtile.y = tile_manager.state.buffer.full_y + tile->y;
- rtile.w = tile->w;
- rtile.h = tile->h;
- rtile.start_sample = tile_manager.state.sample;
- rtile.num_samples = tile_manager.state.num_samples;
- rtile.resolution = tile_manager.state.resolution_divider;
- rtile.tile_index = tile->index;
- rtile.stealing_state = RenderTile::NO_STEALING;
-
- if (tile->state == Tile::DENOISE) {
- rtile.task = RenderTile::DENOISE;
- }
- else {
- if (tile_device->info.type == DEVICE_CPU) {
- stealable_tiles_++;
- rtile.stealing_state = RenderTile::CAN_BE_STOLEN;
- }
-
- if (read_bake_tile_cb) {
- rtile.task = RenderTile::BAKE;
- }
- else {
- rtile.task = RenderTile::PATH_TRACE;
- }
- }
-
- tile_lock.unlock();
-
- /* in case of a permanent buffer, return it, otherwise we will allocate
- * a new temporary buffer */
- if (buffers) {
- tile_manager.state.buffer.get_offset_stride(rtile.offset, rtile.stride);
-
- rtile.buffer = buffers->buffer.device_pointer;
- rtile.buffers = buffers;
-
- device->map_tile(tile_device, rtile);
-
- /* Reset copy state, since buffer contents change after the tile was acquired */
- buffers->map_neighbor_copied = false;
-
- /* This hack ensures that the copy in 'MultiDevice::map_neighbor_tiles' accounts
- * for the buffer resolution divider. */
- buffers->buffer.data_width = (buffers->params.width * buffers->params.get_passes_size()) /
- tile_manager.state.resolution_divider;
- buffers->buffer.data_height = buffers->params.height / tile_manager.state.resolution_divider;
-
- return true;
- }
-
- if (tile->buffers == NULL) {
- /* fill buffer parameters */
- BufferParams buffer_params = tile_manager.params;
- buffer_params.full_x = rtile.x;
- buffer_params.full_y = rtile.y;
- buffer_params.width = rtile.w;
- buffer_params.height = rtile.h;
-
- /* allocate buffers */
- tile->buffers = new RenderBuffers(tile_device);
- tile->buffers->reset(buffer_params);
- }
- else if (tile->buffers->buffer.device != tile_device) {
- /* Move buffer to current tile device again in case it was stolen before.
- * Not needed for denoising since that already handles mapping of tiles and
- * neighbors to its own device. */
- if (rtile.task != RenderTile::DENOISE) {
- tile->buffers->buffer.move_device(tile_device);
- }
- }
-
- tile->buffers->map_neighbor_copied = false;
-
- tile->buffers->params.get_offset_stride(rtile.offset, rtile.stride);
-
- rtile.buffer = tile->buffers->buffer.device_pointer;
- rtile.buffers = tile->buffers;
- rtile.sample = tile_manager.state.sample;
-
- if (read_bake_tile_cb) {
- /* This will read any passes needed as input for baking. */
- if (tile_manager.state.sample == tile_manager.range_start_sample) {
- {
- thread_scoped_lock tile_lock(tile_mutex_);
- read_bake_tile_cb(rtile);
- }
- rtile.buffers->buffer.copy_to_device();
- }
- }
- else {
- /* This will tag tile as IN PROGRESS in blender-side render pipeline,
- * which is needed to highlight currently rendering tile before first
- * sample was processed for it. */
- update_tile_sample(rtile);
- }
-
- return true;
-}
-
-void Session::update_tile_sample(RenderTile &rtile)
-{
- thread_scoped_lock tile_lock(tile_mutex_);
-
- if (update_render_tile_cb) {
- if (params.progressive_refine == false) {
- /* todo: optimize this by making it thread safe and removing lock */
-
- update_render_tile_cb(rtile, true);
- }
- }
-
- update_status_time();
-}
-
-void Session::release_tile(RenderTile &rtile, const bool need_denoise)
-{
- thread_scoped_lock tile_lock(tile_mutex_);
-
- if (rtile.stealing_state != RenderTile::NO_STEALING) {
- stealable_tiles_--;
- if (rtile.stealing_state == RenderTile::WAS_STOLEN) {
- /* If the tile is being stolen, don't release it here - the new device will pick up where
- * the old one left off. */
-
- assert(tile_stealing_state_ == RELEASING_TILE);
- assert(rtile.sample < rtile.start_sample + rtile.num_samples);
-
- tile_stealing_state_ = GOT_TILE;
- stolen_tile_ = rtile;
- tile_steal_cond_.notify_all();
- return;
- }
- else if (stealable_tiles_ == 0) {
- /* If this was the last stealable tile, wake up any threads still waiting for one. */
- tile_steal_cond_.notify_all();
- }
- }
-
- progress.add_finished_tile(rtile.task == RenderTile::DENOISE);
-
- bool delete_tile;
-
- if (tile_manager.finish_tile(rtile.tile_index, need_denoise, delete_tile)) {
- /* Finished tile pixels write. */
- if (write_render_tile_cb && params.progressive_refine == false) {
- write_render_tile_cb(rtile);
- }
-
- if (delete_tile) {
- delete rtile.buffers;
- tile_manager.state.tiles[rtile.tile_index].buffers = NULL;
- }
- }
- else {
- /* In progress tile pixels update. */
- if (update_render_tile_cb && params.progressive_refine == false) {
- update_render_tile_cb(rtile, false);
- }
- }
-
- update_status_time();
-
- /* Notify denoising thread that a tile was finished. */
- denoising_cond_.notify_all();
-}
-
-void Session::map_neighbor_tiles(RenderTileNeighbors &neighbors, Device *tile_device)
-{
- thread_scoped_lock tile_lock(tile_mutex_);
-
- const int4 image_region = make_int4(
- tile_manager.state.buffer.full_x,
- tile_manager.state.buffer.full_y,
- tile_manager.state.buffer.full_x + tile_manager.state.buffer.width,
- tile_manager.state.buffer.full_y + tile_manager.state.buffer.height);
-
- RenderTile &center_tile = neighbors.tiles[RenderTileNeighbors::CENTER];
-
- if (!tile_manager.schedule_denoising) {
- /* Fix up tile slices with overlap. */
- if (tile_manager.slice_overlap != 0) {
- int y = max(center_tile.y - tile_manager.slice_overlap, image_region.y);
- center_tile.h = min(center_tile.y + center_tile.h + tile_manager.slice_overlap,
- image_region.w) -
- y;
- center_tile.y = y;
- }
-
- /* Tiles are not being denoised individually, which means the entire image is processed. */
- neighbors.set_bounds_from_center();
- }
- else {
- int center_idx = center_tile.tile_index;
- assert(tile_manager.state.tiles[center_idx].state == Tile::DENOISE);
-
- for (int dy = -1, i = 0; dy <= 1; dy++) {
- for (int dx = -1; dx <= 1; dx++, i++) {
- RenderTile &rtile = neighbors.tiles[i];
- int nindex = tile_manager.get_neighbor_index(center_idx, i);
- if (nindex >= 0) {
- Tile *tile = &tile_manager.state.tiles[nindex];
-
- rtile.x = image_region.x + tile->x;
- rtile.y = image_region.y + tile->y;
- rtile.w = tile->w;
- rtile.h = tile->h;
-
- if (buffers) {
- tile_manager.state.buffer.get_offset_stride(rtile.offset, rtile.stride);
-
- rtile.buffer = buffers->buffer.device_pointer;
- rtile.buffers = buffers;
- }
- else {
- assert(tile->buffers);
- tile->buffers->params.get_offset_stride(rtile.offset, rtile.stride);
-
- rtile.buffer = tile->buffers->buffer.device_pointer;
- rtile.buffers = tile->buffers;
- }
- }
- else {
- int px = center_tile.x + dx * params.tile_size.x;
- int py = center_tile.y + dy * params.tile_size.y;
-
- rtile.x = clamp(px, image_region.x, image_region.z);
- rtile.y = clamp(py, image_region.y, image_region.w);
- rtile.w = rtile.h = 0;
-
- rtile.buffer = (device_ptr)NULL;
- rtile.buffers = NULL;
- }
- }
- }
- }
-
- assert(center_tile.buffers);
- device->map_neighbor_tiles(tile_device, neighbors);
-
- /* The denoised result is written back to the original tile. */
- neighbors.target = center_tile;
-}
-
-void Session::unmap_neighbor_tiles(RenderTileNeighbors &neighbors, Device *tile_device)
-{
- thread_scoped_lock tile_lock(tile_mutex_);
- device->unmap_neighbor_tiles(tile_device, neighbors);
-}
-
-void Session::run_cpu()
-{
- bool tiles_written = false;
-
- last_update_time_ = time_dt();
- last_display_time_ = last_update_time_;
-
- while (!progress.get_cancel()) {
- const bool no_tiles = !run_update_for_next_iteration();
- bool need_copy_to_display_buffer = false;
-
- if (no_tiles) {
- if (params.background) {
- /* if no work left and in background mode, we can stop immediately */
- progress.set_status("Finished");
+ const bool did_cancel = progress.get_cancel();
+ if (did_cancel) {
+ render_scheduler_.render_work_reschedule_on_cancel(render_work);
+ if (!render_work) {
break;
}
}
-
- if (run_wait_for_work(no_tiles)) {
+ else if (run_wait_for_work(render_work)) {
continue;
}
- if (progress.get_cancel()) {
- break;
- }
-
- if (!no_tiles) {
- if (!device->error_message().empty())
- progress.set_error(device->error_message());
-
- if (progress.get_cancel())
- break;
-
+ {
/* buffers mutex is locked entirely while rendering each
* sample, and released/reacquired on each iteration to allow
* reset and draw in between */
@@ -730,49 +204,25 @@ void Session::run_cpu()
update_status_time();
/* render */
- bool delayed_denoise = false;
- const bool need_denoise = render_need_denoise(delayed_denoise);
- render(need_denoise);
+ path_trace_->render(render_work);
/* update status and timing */
update_status_time();
- if (!params.background)
- need_copy_to_display_buffer = !delayed_denoise;
-
- if (!device->error_message().empty())
- progress.set_error(device->error_message());
- }
-
- device->task_wait();
-
- {
- thread_scoped_lock reset_lock(delayed_reset_.mutex);
- thread_scoped_lock buffers_lock(buffers_mutex_);
- thread_scoped_lock display_lock(display_mutex_);
-
- if (delayed_reset_.do_reset) {
- /* reset rendering if request from main thread */
- delayed_reset_.do_reset = false;
- reset_(delayed_reset_.params, delayed_reset_.samples);
- }
- else if (need_copy_to_display_buffer) {
- /* Only copy to display_buffer if we do not reset, we don't
- * want to show the result of an incomplete sample */
- copy_to_display_buffer(tile_manager.state.sample);
+ if (device->have_error()) {
+ const string &error_message = device->error_message();
+ progress.set_error(error_message);
+ progress.set_cancel(error_message);
+ break;
}
-
- if (!device->error_message().empty())
- progress.set_error(device->error_message());
-
- tiles_written = update_progressive_refine(progress.get_cancel());
}
progress.set_update();
- }
- if (!tiles_written)
- update_progressive_refine(true);
+ if (did_cancel) {
+ break;
+ }
+ }
}
void Session::run()
@@ -789,10 +239,7 @@ void Session::run()
/* reset number of rendered samples */
progress.reset_sample();
- if (device_use_gl_)
- run_gpu();
- else
- run_cpu();
+ run_main_render_loop();
}
profiler.stop();
@@ -804,31 +251,92 @@ void Session::run()
progress.set_update();
}
-bool Session::run_update_for_next_iteration()
+RenderWork Session::run_update_for_next_iteration()
{
+ RenderWork render_work;
+
thread_scoped_lock scene_lock(scene->mutex);
thread_scoped_lock reset_lock(delayed_reset_.mutex);
+ bool have_tiles = true;
+ bool switched_to_new_tile = false;
+
if (delayed_reset_.do_reset) {
thread_scoped_lock buffers_lock(buffers_mutex_);
- reset_(delayed_reset_.params, delayed_reset_.samples);
- delayed_reset_.do_reset = false;
+ do_delayed_reset();
+
+ /* After reset make sure the tile manager is at the first big tile. */
+ have_tiles = tile_manager_.next();
+ switched_to_new_tile = true;
+ }
+
+ /* Update number of samples in the integrator.
+ * Ideally this would happen once in `Session::set_samples()`, but the initial configuration
+ * done when the Session is created does not go through `set_samples()`. */
+ scene->integrator->set_aa_samples(params.samples);
+
+ /* Update denoiser settings. */
+ {
+ const DenoiseParams denoise_params = scene->integrator->get_denoise_params();
+ path_trace_->set_denoiser_params(denoise_params);
+ }
+
+ /* Update adaptive sampling. */
+ {
+ const AdaptiveSampling adaptive_sampling = scene->integrator->get_adaptive_sampling();
+ path_trace_->set_adaptive_sampling(adaptive_sampling);
}
- const bool have_tiles = tile_manager.next();
+ render_scheduler_.set_num_samples(params.samples);
+ render_scheduler_.set_time_limit(params.time_limit);
+
+ while (have_tiles) {
+ render_work = render_scheduler_.get_render_work();
+ if (render_work) {
+ break;
+ }
- if (have_tiles) {
+ progress.add_finished_tile(false);
+
+ have_tiles = tile_manager_.next();
+ if (have_tiles) {
+ render_scheduler_.reset_for_next_tile();
+ switched_to_new_tile = true;
+ }
+ }
+
+ if (render_work) {
scoped_timer update_timer;
- if (update_scene()) {
+
+ if (switched_to_new_tile) {
+ BufferParams tile_params = buffer_params_;
+
+ const Tile &tile = tile_manager_.get_current_tile();
+ tile_params.width = tile.width;
+ tile_params.height = tile.height;
+ tile_params.full_x = tile.x + buffer_params_.full_x;
+ tile_params.full_y = tile.y + buffer_params_.full_y;
+ tile_params.full_width = buffer_params_.full_width;
+ tile_params.full_height = buffer_params_.full_height;
+ tile_params.update_offset_stride();
+
+ path_trace_->reset(buffer_params_, tile_params);
+ }
+
+ const int resolution = render_work.resolution_divider;
+ const int width = max(1, buffer_params_.full_width / resolution);
+ const int height = max(1, buffer_params_.full_height / resolution);
+
+ if (update_scene(width, height)) {
profiler.reset(scene->shaders.size(), scene->objects.size());
}
progress.add_skip_time(update_timer, params.background);
}
- return have_tiles;
+ return render_work;
}
-bool Session::run_wait_for_work(bool no_tiles)
+bool Session::run_wait_for_work(const RenderWork &render_work)
{
/* In an offline rendering there is no pause, and no tiles will mean the job is fully done. */
if (params.background) {
@@ -837,19 +345,20 @@ bool Session::run_wait_for_work(bool no_tiles)
thread_scoped_lock pause_lock(pause_mutex_);
- if (!pause_ && !no_tiles) {
+ if (!pause_ && render_work) {
/* Rendering is not paused and there is work to be done. No need to wait for anything. */
return false;
}
- update_status_time(pause_, no_tiles);
+ const bool no_work = !render_work;
+ update_status_time(pause_, no_work);
/* Only leave the loop when rendering is not paused. But even if the current render is un-paused
* but there is nothing to render keep waiting until new work is added. */
while (!cancel_) {
scoped_timer pause_timer;
- if (!pause_ && (!no_tiles || new_work_added_ || delayed_reset_.do_reset)) {
+ if (!pause_ && (render_work || new_work_added_ || delayed_reset_.do_reset)) {
break;
}
@@ -860,52 +369,89 @@ bool Session::run_wait_for_work(bool no_tiles)
progress.add_skip_time(pause_timer, params.background);
}
- update_status_time(pause_, no_tiles);
+ update_status_time(pause_, no_work);
progress.set_update();
}
new_work_added_ = false;
- return no_tiles;
+ return no_work;
}
-bool Session::draw(BufferParams &buffer_params, DeviceDrawParams &draw_params)
+void Session::draw()
{
- if (device_use_gl_)
- return draw_gpu(buffer_params, draw_params);
- else
- return draw_cpu(buffer_params, draw_params);
+ path_trace_->draw();
}
-void Session::reset_(BufferParams &buffer_params, int samples)
+int2 Session::get_effective_tile_size() const
{
- if (buffers && buffer_params.modified(tile_manager.params)) {
- gpu_draw_ready_ = false;
- buffers->reset(buffer_params);
- if (display) {
- display->reset(buffer_params);
- }
+ /* No support yet for baking with tiles. */
+ if (!params.use_auto_tile || scene->bake_manager->get_baking()) {
+ return make_int2(buffer_params_.width, buffer_params_.height);
}
- tile_manager.reset(buffer_params, samples);
- stealable_tiles_ = 0;
- tile_stealing_state_ = NOT_STEALING;
- progress.reset_sample();
+ /* TODO(sergey): Take available memory into account, and if there is enough memory do not
+ * tile, preferring optimal performance. */
+
+ return make_int2(params.tile_size, params.tile_size);
+}
+
+void Session::do_delayed_reset()
+{
+ if (!delayed_reset_.do_reset) {
+ return;
+ }
+ delayed_reset_.do_reset = false;
+
+ params = delayed_reset_.session_params;
+ buffer_params_ = delayed_reset_.buffer_params;
+
+ /* Store parameters used for buffer access outside of the scene graph. */
+ buffer_params_.samples = params.samples;
+ buffer_params_.exposure = scene->film->get_exposure();
+ buffer_params_.use_approximate_shadow_catcher =
+ scene->film->get_use_approximate_shadow_catcher();
+ buffer_params_.use_transparent_background = scene->background->get_transparent();
- bool show_progress = params.background || tile_manager.get_num_effective_samples() != INT_MAX;
- progress.set_total_pixel_samples(show_progress ? tile_manager.state.total_pixel_samples : 0);
+ /* Tile and work scheduling. */
+ tile_manager_.reset_scheduling(buffer_params_, get_effective_tile_size());
+ render_scheduler_.reset(buffer_params_, params.samples);
- if (!params.background)
+ /* Passes. */
+ /* When multiple tiles are used the SAMPLE_COUNT pass is used to keep track of possible partial
+ * tile results. It is safe to use the generic update function here, which only checks for
+ * changes, since a change in tile settings re-creates the session, which ensures the film is
+ * fully updated on tile changes. */
+ scene->film->update_passes(scene, tile_manager_.has_multiple_tiles());
+
+ /* Update for new state of scene and passes. */
+ buffer_params_.update_passes(scene->passes);
+ tile_manager_.update(buffer_params_, scene);
+
+ /* Progress. */
+ progress.reset_sample();
+ progress.set_total_pixel_samples(buffer_params_.width * buffer_params_.height * params.samples);
+
+ if (!params.background) {
progress.set_start_time();
+ }
progress.set_render_start_time();
}
-void Session::reset(BufferParams &buffer_params, int samples)
+void Session::reset(const SessionParams &session_params, const BufferParams &buffer_params)
{
- if (device_use_gl_)
- reset_gpu(buffer_params, samples);
- else
- reset_cpu(buffer_params, samples);
+ {
+ thread_scoped_lock reset_lock(delayed_reset_.mutex);
+ thread_scoped_lock pause_lock(pause_mutex_);
+
+ delayed_reset_.do_reset = true;
+ delayed_reset_.session_params = session_params;
+ delayed_reset_.buffer_params = buffer_params;
+
+ path_trace_->cancel();
+ }
+
+ pause_cond_.notify_all();
}
void Session::set_samples(int samples)
@@ -915,7 +461,22 @@ void Session::set_samples(int samples)
}
params.samples = samples;
- tile_manager.set_samples(samples);
+
+ {
+ thread_scoped_lock pause_lock(pause_mutex_);
+ new_work_added_ = true;
+ }
+
+ pause_cond_.notify_all();
+}
+
+void Session::set_time_limit(double time_limit)
+{
+ if (time_limit == params.time_limit) {
+ return;
+ }
+
+ params.time_limit = time_limit;
{
thread_scoped_lock pause_lock(pause_mutex_);
@@ -948,38 +509,9 @@ void Session::set_pause(bool pause)
}
}
-void Session::set_denoising(const DenoiseParams &denoising)
+void Session::set_gpu_display(unique_ptr<GPUDisplay> gpu_display)
{
- bool need_denoise = denoising.need_denoising_task();
-
- /* Lock buffers so no denoising operation is triggered while the settings are changed here. */
- thread_scoped_lock buffers_lock(buffers_mutex_);
- params.denoising = denoising;
-
- if (!(params.device.denoisers & denoising.type)) {
- if (need_denoise) {
- progress.set_error("Denoiser type not supported by compute device");
- }
-
- params.denoising.use = false;
- need_denoise = false;
- }
-
- // TODO(pmours): Query the required overlap value for denoising from the device?
- tile_manager.slice_overlap = need_denoise && !params.background ? 64 : 0;
-
- /* Schedule per tile denoising for final renders if we are either denoising or
- * need prefiltered passes for the native denoiser. */
- tile_manager.schedule_denoising = need_denoise && !buffers;
-}
-
-void Session::set_denoising_start_sample(int sample)
-{
- if (sample != params.denoising.start_sample) {
- params.denoising.start_sample = sample;
-
- pause_cond_.notify_all();
- }
+ path_trace_->set_gpu_display(move(gpu_display));
}
void Session::wait()
@@ -989,81 +521,67 @@ void Session::wait()
delete session_thread_;
}
- session_thread_ = NULL;
+ session_thread_ = nullptr;
}
-bool Session::update_scene()
+bool Session::update_scene(int width, int height)
{
- /* update camera if dimensions changed for progressive render. the camera
+ /* Update camera if dimensions changed for progressive render. The camera
* knows nothing about progressive or cropped rendering, it just gets the
- * image dimensions passed in */
+ * image dimensions passed in. */
Camera *cam = scene->camera;
- int width = tile_manager.state.buffer.full_width;
- int height = tile_manager.state.buffer.full_height;
- int resolution = tile_manager.state.resolution_divider;
-
- cam->set_screen_size_and_resolution(width, height, resolution);
+ cam->set_screen_size(width, height);
- /* number of samples is needed by multi jittered
- * sampling pattern and by baking */
- Integrator *integrator = scene->integrator;
- BakeManager *bake_manager = scene->bake_manager;
+ /* First detect which kernel features are used and allocate working memory.
+ * This helps estimate how much device memory is available for the scene and
+ * how much we need to allocate on the host instead. */
+ scene->update_kernel_features();
- if (integrator->get_sampling_pattern() != SAMPLING_PATTERN_SOBOL || bake_manager->get_baking()) {
- integrator->set_aa_samples(tile_manager.num_samples);
- }
+ path_trace_->load_kernels();
+ path_trace_->alloc_work_memory();
- bool kernel_switch_needed = false;
- if (scene->update(progress, kernel_switch_needed)) {
- if (kernel_switch_needed) {
- reset(tile_manager.params, params.samples);
- }
+ if (scene->update(progress)) {
return true;
}
+
return false;
}
+static string status_append(const string &status, const string &suffix)
+{
+ string prefix = status;
+ if (!prefix.empty()) {
+ prefix += ", ";
+ }
+ return prefix + suffix;
+}
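
For illustration, here is a standalone sketch of how the helper above composes the sub-status
text; it reimplements the same logic with std::string so it can be run on its own (the strings
shown are example values, not actual Cycles output):

  #include <cstdio>
  #include <string>

  /* Same behaviour as status_append(): prepend ", " only when something is already there. */
  static std::string status_append_sketch(const std::string &status, const std::string &suffix)
  {
    std::string prefix = status;
    if (!prefix.empty()) {
      prefix += ", ";
    }
    return prefix + suffix;
  }

  int main()
  {
    std::string substatus;
    substatus = status_append_sketch(substatus, "Rendered 3/12 Tiles");
    substatus = status_append_sketch(substatus, "Sample 64/128");
    std::printf("%s\n", substatus.c_str()); /* Prints: Rendered 3/12 Tiles, Sample 64/128 */
    return 0;
  }
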
+
void Session::update_status_time(bool show_pause, bool show_done)
{
- int progressive_sample = tile_manager.state.sample;
- int num_samples = tile_manager.get_num_effective_samples();
+ string status, substatus;
- int tile = progress.get_rendered_tiles();
- int num_tiles = tile_manager.state.num_tiles;
+ const int current_tile = progress.get_rendered_tiles();
+ const int num_tiles = tile_manager_.get_num_tiles();
- /* update status */
- string status, substatus;
+ const int current_sample = progress.get_current_sample();
+ const int num_samples = render_scheduler_.get_num_samples();
- if (!params.progressive) {
- const bool is_cpu = params.device.type == DEVICE_CPU;
- const bool rendering_finished = (tile == num_tiles);
- const bool is_last_tile = (tile + 1) == num_tiles;
-
- substatus = string_printf("Rendered %d/%d Tiles", tile, num_tiles);
-
- if (!rendering_finished && (device->show_samples() || (is_cpu && is_last_tile))) {
- /* Some devices automatically support showing the sample number:
- * - CUDADevice
- * - OpenCLDevice when using the megakernel (the split kernel renders multiple
- * samples at the same time, so the current sample isn't really defined)
- * - CPUDevice when using one thread
- * For these devices, the current sample is always shown.
- *
- * The other option is when the last tile is currently being rendered by the CPU.
- */
- substatus += string_printf(", Sample %d/%d", progress.get_current_sample(), num_samples);
- }
- if (params.denoising.use && params.denoising.type != DENOISER_OPENIMAGEDENOISE) {
- substatus += string_printf(", Denoised %d tiles", progress.get_denoised_tiles());
- }
- else if (params.denoising.store_passes && params.denoising.type == DENOISER_NLM) {
- substatus += string_printf(", Prefiltered %d tiles", progress.get_denoised_tiles());
- }
+ /* Tile. */
+ if (tile_manager_.has_multiple_tiles()) {
+ substatus = status_append(substatus,
+ string_printf("Rendered %d/%d Tiles", current_tile, num_tiles));
}
- else if (tile_manager.num_samples == Integrator::MAX_SAMPLES)
- substatus = string_printf("Path Tracing Sample %d", progressive_sample + 1);
- else
- substatus = string_printf("Path Tracing Sample %d/%d", progressive_sample + 1, num_samples);
+
+ /* Sample. */
+ if (num_samples == Integrator::MAX_SAMPLES) {
+ substatus = status_append(substatus, string_printf("Sample %d", current_sample));
+ }
+ else {
+ substatus = status_append(substatus,
+ string_printf("Sample %d/%d", current_sample, num_samples));
+ }
+
+ /* TODO(sergey): Denoising status from the path trace. */
if (show_pause) {
status = "Rendering Paused";
@@ -1080,210 +598,122 @@ void Session::update_status_time(bool show_pause, bool show_done)
progress.set_status(status, substatus);
}
-bool Session::render_need_denoise(bool &delayed)
+void Session::device_free()
{
- delayed = false;
-
- /* Not supported yet for baking. */
- if (read_bake_tile_cb) {
- return false;
- }
-
- /* Denoising enabled? */
- if (!params.denoising.need_denoising_task()) {
- return false;
- }
-
- if (params.background) {
- /* Background render, only denoise when rendering the last sample. */
- return tile_manager.done();
- }
-
- /* Viewport render. */
-
- /* It can happen that denoising was already enabled, but the scene still needs an update. */
- if (scene->film->is_modified() || !scene->film->get_denoising_data_offset()) {
- return false;
- }
+ scene->device_free();
+ path_trace_->device_free();
+}
- /* Immediately denoise when we reach the start sample or last sample. */
- const int num_samples_finished = tile_manager.state.sample + 1;
- if (num_samples_finished == params.denoising.start_sample ||
- num_samples_finished == params.samples) {
- return true;
+void Session::collect_statistics(RenderStats *render_stats)
+{
+ scene->collect_statistics(render_stats);
+ if (params.use_profiling && (params.device.type == DEVICE_CPU)) {
+ render_stats->collect_profiling(scene, profiler);
}
+}
- /* Do not denoise until the sample at which denoising should start is reached. */
- if (num_samples_finished < params.denoising.start_sample) {
- return false;
- }
+/* --------------------------------------------------------------------
+ * Tile and tile pixels access.
+ */
- /* Avoid excessive denoising in viewport after reaching a certain amount of samples. */
- delayed = (tile_manager.state.sample >= 20 &&
- (time_dt() - last_display_time_) < params.progressive_update_timeout);
- return !delayed;
+bool Session::has_multiple_render_tiles() const
+{
+ return tile_manager_.has_multiple_tiles();
}
-void Session::render(bool need_denoise)
+int2 Session::get_render_tile_size() const
{
- if (buffers && tile_manager.state.sample == tile_manager.range_start_sample) {
- /* Clear buffers. */
- buffers->zero();
- }
-
- if (tile_manager.state.buffer.width == 0 || tile_manager.state.buffer.height == 0) {
- return; /* Avoid empty launches. */
- }
+ return path_trace_->get_render_tile_size();
+}
- /* Add path trace task. */
- DeviceTask task(DeviceTask::RENDER);
-
- task.acquire_tile = function_bind(&Session::acquire_tile, this, _2, _1, _3);
- task.release_tile = function_bind(&Session::release_tile, this, _1, need_denoise);
- task.map_neighbor_tiles = function_bind(&Session::map_neighbor_tiles, this, _1, _2);
- task.unmap_neighbor_tiles = function_bind(&Session::unmap_neighbor_tiles, this, _1, _2);
- task.get_cancel = function_bind(&Progress::get_cancel, &this->progress);
- task.update_tile_sample = function_bind(&Session::update_tile_sample, this, _1);
- task.update_progress_sample = function_bind(&Progress::add_samples, &this->progress, _1, _2);
- task.get_tile_stolen = function_bind(&Session::get_tile_stolen, this);
- task.need_finish_queue = params.progressive_refine;
- task.integrator_branched = scene->integrator->get_method() == Integrator::BRANCHED_PATH;
-
- task.adaptive_sampling.use = (scene->integrator->get_sampling_pattern() ==
- SAMPLING_PATTERN_PMJ) &&
- scene->dscene.data.film.pass_adaptive_aux_buffer;
- task.adaptive_sampling.min_samples = scene->dscene.data.integrator.adaptive_min_samples;
- task.adaptive_sampling.adaptive_step = scene->dscene.data.integrator.adaptive_step;
-
- /* Acquire render tiles by default. */
- task.tile_types = RenderTile::PATH_TRACE;
-
- if (need_denoise) {
- task.denoising = params.denoising;
-
- task.pass_stride = scene->film->get_pass_stride();
- task.target_pass_stride = task.pass_stride;
- task.pass_denoising_data = scene->film->get_denoising_data_offset();
- task.pass_denoising_clean = scene->film->get_denoising_clean_offset();
-
- task.denoising_from_render = true;
-
- if (tile_manager.schedule_denoising) {
- /* Acquire denoising tiles during rendering. */
- task.tile_types |= RenderTile::DENOISE;
- }
- else {
- assert(buffers);
-
- /* Schedule rendering and wait for it to finish. */
- device->task_add(task);
- device->task_wait();
-
- /* Then run denoising on the whole image at once. */
- task.type = DeviceTask::DENOISE_BUFFER;
- task.x = tile_manager.state.buffer.full_x;
- task.y = tile_manager.state.buffer.full_y;
- task.w = tile_manager.state.buffer.width;
- task.h = tile_manager.state.buffer.height;
- task.buffer = buffers->buffer.device_pointer;
- task.sample = tile_manager.state.sample;
- task.num_samples = tile_manager.state.num_samples;
- tile_manager.state.buffer.get_offset_stride(task.offset, task.stride);
- task.buffers = buffers;
- }
- }
+int2 Session::get_render_tile_offset() const
+{
+ return path_trace_->get_render_tile_offset();
+}
- device->task_add(task);
+string_view Session::get_render_tile_layer() const
+{
+ const BufferParams &buffer_params = path_trace_->get_render_tile_params();
+ return buffer_params.layer;
}
-void Session::copy_to_display_buffer(int sample)
+string_view Session::get_render_tile_view() const
{
- /* add film conversion task */
- DeviceTask task(DeviceTask::FILM_CONVERT);
-
- task.x = tile_manager.state.buffer.full_x;
- task.y = tile_manager.state.buffer.full_y;
- task.w = tile_manager.state.buffer.width;
- task.h = tile_manager.state.buffer.height;
- task.rgba_byte = display->rgba_byte.device_pointer;
- task.rgba_half = display->rgba_half.device_pointer;
- task.buffer = buffers->buffer.device_pointer;
- task.sample = sample;
- tile_manager.state.buffer.get_offset_stride(task.offset, task.stride);
-
- if (task.w > 0 && task.h > 0) {
- device->task_add(task);
- device->task_wait();
-
- /* set display to new size */
- display->draw_set(task.w, task.h);
-
- last_display_time_ = time_dt();
- }
+ const BufferParams &buffer_params = path_trace_->get_render_tile_params();
+ return buffer_params.view;
+}
- display_outdated_ = false;
+bool Session::copy_render_tile_from_device()
+{
+ return path_trace_->copy_render_tile_from_device();
}
-bool Session::update_progressive_refine(bool cancel)
+bool Session::get_render_tile_pixels(const string &pass_name, int num_components, float *pixels)
{
- int sample = tile_manager.state.sample + 1;
- bool write = sample == tile_manager.num_samples || cancel;
+ /* NOTE: The code relies on the fact that the session is fully updated and no scene/buffer modification
+ * is happening while this function runs. */
- double current_time = time_dt();
+ const BufferParams &buffer_params = path_trace_->get_render_tile_params();
- if (current_time - last_update_time_ < params.progressive_update_timeout) {
- /* If last sample was processed, we need to write buffers anyway. */
- if (!write && sample != 1)
- return false;
+ const BufferPass *pass = buffer_params.find_pass(pass_name);
+ if (pass == nullptr) {
+ return false;
}
- if (params.progressive_refine) {
- foreach (Tile &tile, tile_manager.state.tiles) {
- if (!tile.buffers) {
- continue;
- }
-
- RenderTile rtile;
- rtile.x = tile_manager.state.buffer.full_x + tile.x;
- rtile.y = tile_manager.state.buffer.full_y + tile.y;
- rtile.w = tile.w;
- rtile.h = tile.h;
- rtile.sample = sample;
- rtile.buffers = tile.buffers;
-
- if (write) {
- if (write_render_tile_cb)
- write_render_tile_cb(rtile);
- }
- else {
- if (update_render_tile_cb)
- update_render_tile_cb(rtile, true);
- }
+ const bool has_denoised_result = path_trace_->has_denoised_result();
+ if (pass->mode == PassMode::DENOISED && !has_denoised_result) {
+ pass = buffer_params.find_pass(pass->type);
+ if (pass == nullptr) {
+ /* Happens when denoised result pass is requested but is never written by the kernel. */
+ return false;
}
}
- last_update_time_ = current_time;
+ pass = buffer_params.get_actual_display_pass(pass);
+
+ const float exposure = buffer_params.exposure;
+ const int num_samples = path_trace_->get_num_render_tile_samples();
- return write;
+ PassAccessor::PassAccessInfo pass_access_info(*pass);
+ pass_access_info.use_approximate_shadow_catcher = buffer_params.use_approximate_shadow_catcher;
+ pass_access_info.use_approximate_shadow_catcher_background =
+ pass_access_info.use_approximate_shadow_catcher && !buffer_params.use_transparent_background;
+
+ const PassAccessorCPU pass_accessor(pass_access_info, exposure, num_samples);
+ const PassAccessor::Destination destination(pixels, num_components);
+
+ return path_trace_->get_render_tile_pixels(pass_accessor, destination);
}
-void Session::device_free()
+bool Session::set_render_tile_pixels(const string &pass_name,
+ int num_components,
+ const float *pixels)
{
- scene->device_free();
+ /* NOTE: The code relies on the fact that the session is fully updated and no scene/buffer modification
+ * is happening while this function runs. */
+
+ const BufferPass *pass = buffer_params_.find_pass(pass_name);
+ if (!pass) {
+ return false;
+ }
+
+ const float exposure = scene->film->get_exposure();
+ const int num_samples = render_scheduler_.get_num_rendered_samples();
- tile_manager.device_free();
+ const PassAccessor::PassAccessInfo pass_access_info(*pass);
+ PassAccessorCPU pass_accessor(pass_access_info, exposure, num_samples);
+ PassAccessor::Source source(pixels, num_components);
- /* used from background render only, so no need to
- * re-create render/display buffers here
- */
+ return path_trace_->set_render_tile_pixels(pass_accessor, source);
}
-void Session::collect_statistics(RenderStats *render_stats)
+/* --------------------------------------------------------------------
+ * Full-frame on-disk storage.
+ */
+
+void Session::process_full_buffer_from_disk(string_view filename)
{
- scene->collect_statistics(render_stats);
- if (params.use_profiling && (params.device.type == DEVICE_CPU)) {
- render_stats->collect_profiling(scene, profiler);
- }
+ path_trace_->process_full_buffer_from_disk(filename);
}
CCL_NAMESPACE_END
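
The session.cpp changes above replace the per-device run_cpu()/run_gpu() loops with a single
scheduler-driven loop. The following is a simplified, self-contained sketch of that control flow;
the type and method names (SchedulerSketch, get_render_work, render_work_reschedule_on_cancel) are
illustrative stand-ins, not the exact Cycles API:

  #include <cstdio>

  /* Illustrative stand-ins; not the actual Cycles classes. */
  struct RenderWork {
    int num_samples = 0;
    explicit operator bool() const { return num_samples > 0; }
  };

  class SchedulerSketch {
   public:
    explicit SchedulerSketch(int total_samples) : remaining_(total_samples) {}

    /* Hand out work in chunks of at most 16 samples, like a render scheduler would. */
    RenderWork get_render_work()
    {
      RenderWork work;
      work.num_samples = remaining_ < 16 ? remaining_ : 16;
      remaining_ -= work.num_samples;
      return work;
    }

    /* On cancel, drop any work that has not started yet. */
    void render_work_reschedule_on_cancel(RenderWork &work)
    {
      remaining_ = 0;
      work.num_samples = 0;
    }

   private:
    int remaining_;
  };

  int main()
  {
    SchedulerSketch scheduler(40);
    bool cancelled = false; /* A real session would query Progress::get_cancel(). */

    while (true) {
      RenderWork work = scheduler.get_render_work();

      if (cancelled) {
        scheduler.render_work_reschedule_on_cancel(work);
        if (!work) {
          break;
        }
      }
      else if (!work) {
        break; /* All samples rendered. */
      }

      std::printf("render %d samples\n", work.num_samples); /* path_trace_->render(work) */
    }
    return 0;
  }
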
diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h
index 05025c10f9c..5623604bfe8 100644
--- a/intern/cycles/render/session.h
+++ b/intern/cycles/render/session.h
@@ -18,6 +18,7 @@
#define __SESSION_H__
#include "device/device.h"
+#include "integrator/render_scheduler.h"
#include "render/buffers.h"
#include "render/shader.h"
#include "render/stats.h"
@@ -26,6 +27,7 @@
#include "util/util_progress.h"
#include "util/util_stats.h"
#include "util/util_thread.h"
+#include "util/util_unique_ptr.h"
#include "util/util_vector.h"
CCL_NAMESPACE_BEGIN
@@ -33,41 +35,35 @@ CCL_NAMESPACE_BEGIN
class BufferParams;
class Device;
class DeviceScene;
-class DeviceRequestedFeatures;
-class DisplayBuffer;
+class PathTrace;
class Progress;
+class GPUDisplay;
class RenderBuffers;
class Scene;
+class SceneParams;
/* Session Parameters */
class SessionParams {
public:
DeviceInfo device;
+
+ bool headless;
bool background;
- bool progressive_refine;
- bool progressive;
bool experimental;
int samples;
- int2 tile_size;
- TileOrder tile_order;
- int start_resolution;
- int denoising_start_sample;
int pixel_size;
int threads;
- bool adaptive_sampling;
-
- bool use_profiling;
- bool display_buffer_linear;
+ /* Limit in seconds for how long path tracing is allowed to happen.
+ * Zero means no limit is applied. */
+ double time_limit;
- DenoiseParams denoising;
+ bool use_profiling;
- double cancel_timeout;
- double reset_timeout;
- double text_timeout;
- double progressive_update_timeout;
+ bool use_auto_tile;
+ int tile_size;
ShadingSystem shadingsystem;
@@ -75,50 +71,32 @@ class SessionParams {
SessionParams()
{
+ headless = false;
background = false;
- progressive_refine = false;
- progressive = false;
experimental = false;
samples = 1024;
- tile_size = make_int2(64, 64);
- start_resolution = INT_MAX;
- denoising_start_sample = 0;
pixel_size = 1;
threads = 0;
- adaptive_sampling = false;
+ time_limit = 0.0;
use_profiling = false;
- display_buffer_linear = false;
-
- cancel_timeout = 0.1;
- reset_timeout = 0.1;
- text_timeout = 1.0;
- progressive_update_timeout = 1.0;
+ use_auto_tile = true;
+ tile_size = 2048;
shadingsystem = SHADINGSYSTEM_SVM;
- tile_order = TILE_CENTER;
}
- bool modified(const SessionParams &params)
+ bool modified(const SessionParams &params) const
{
/* Modified means we have to recreate the session, any parameter changes
* that can be handled by an existing Session are omitted. */
- return !(device == params.device && background == params.background &&
- progressive_refine == params.progressive_refine &&
- progressive == params.progressive && experimental == params.experimental &&
- tile_size == params.tile_size && start_resolution == params.start_resolution &&
+ return !(device == params.device && headless == params.headless &&
+ background == params.background && experimental == params.experimental &&
pixel_size == params.pixel_size && threads == params.threads &&
- adaptive_sampling == params.adaptive_sampling &&
- use_profiling == params.use_profiling &&
- display_buffer_linear == params.display_buffer_linear &&
- cancel_timeout == params.cancel_timeout && reset_timeout == params.reset_timeout &&
- text_timeout == params.text_timeout &&
- progressive_update_timeout == params.progressive_update_timeout &&
- tile_order == params.tile_order && shadingsystem == params.shadingsystem &&
- denoising.type == params.denoising.type &&
- (denoising.use == params.denoising.use || (device.denoisers & denoising.type)));
+ use_profiling == params.use_profiling && shadingsystem == params.shadingsystem &&
+ use_auto_tile == params.use_auto_tile && tile_size == params.tile_size);
}
};
@@ -131,34 +109,41 @@ class Session {
public:
Device *device;
Scene *scene;
- RenderBuffers *buffers;
- DisplayBuffer *display;
Progress progress;
SessionParams params;
- TileManager tile_manager;
Stats stats;
Profiler profiler;
- function<void(RenderTile &)> write_render_tile_cb;
- function<void(RenderTile &, bool)> update_render_tile_cb;
- function<void(RenderTile &)> read_bake_tile_cb;
+ function<void(void)> write_render_tile_cb;
+ function<void(void)> update_render_tile_cb;
+ function<void(void)> read_render_tile_cb;
+
+ /* Callback invoked by the tile manager whenever the on-disk tile storage file is closed after
+ * writing. Allows an engine integration to keep track of those files without worrying about
+ * transferring the information when it needs to re-create the session during rendering. */
+ function<void(string_view)> full_buffer_written_cb;
- explicit Session(const SessionParams &params);
+ explicit Session(const SessionParams &params, const SceneParams &scene_params);
~Session();
void start();
- void cancel();
- bool draw(BufferParams &params, DeviceDrawParams &draw_params);
+
+ /* When quick cancel is requested, path tracing is cancelled as soon as possible, without waiting
+ * for the buffer to be uniformly sampled. */
+ void cancel(bool quick = false);
+
+ void draw();
void wait();
bool ready_to_reset();
- void reset(BufferParams &params, int samples);
+ void reset(const SessionParams &session_params, const BufferParams &buffer_params);
+
void set_pause(bool pause);
+
void set_samples(int samples);
- void set_denoising(const DenoiseParams &denoising);
- void set_denoising_start_sample(int sample);
+ void set_time_limit(double time_limit);
- bool update_scene();
+ void set_gpu_display(unique_ptr<GPUDisplay> gpu_display);
void device_free();
@@ -168,83 +153,95 @@ class Session {
void collect_statistics(RenderStats *stats);
- protected:
- struct DelayedReset {
- thread_mutex mutex;
- bool do_reset;
- BufferParams params;
- int samples;
- } delayed_reset_;
+ /* --------------------------------------------------------------------
+ * Tile and tile pixels access.
+ */
- void run();
+ bool has_multiple_render_tiles() const;
- bool run_update_for_next_iteration();
- bool run_wait_for_work(bool no_tiles);
+ /* Get size and offset (relative to the buffer's full x/y) of the currently rendering tile. */
+ int2 get_render_tile_size() const;
+ int2 get_render_tile_offset() const;
- void update_status_time(bool show_pause = false, bool show_done = false);
+ string_view get_render_tile_layer() const;
+ string_view get_render_tile_view() const;
- void render(bool use_denoise);
- void copy_to_display_buffer(int sample);
+ bool copy_render_tile_from_device();
- void reset_(BufferParams &params, int samples);
+ bool get_render_tile_pixels(const string &pass_name, int num_components, float *pixels);
+ bool set_render_tile_pixels(const string &pass_name, int num_components, const float *pixels);
- void run_cpu();
- bool draw_cpu(BufferParams &params, DeviceDrawParams &draw_params);
- void reset_cpu(BufferParams &params, int samples);
+ /* --------------------------------------------------------------------
+ * Full-frame on-disk storage.
+ */
- void run_gpu();
- bool draw_gpu(BufferParams &params, DeviceDrawParams &draw_params);
- void reset_gpu(BufferParams &params, int samples);
+ /* Read the given full-frame file from disk, perform the needed processing, and write the
+ * result to the host software via the write callback. */
+ void process_full_buffer_from_disk(string_view filename);
- bool render_need_denoise(bool &delayed);
+ protected:
+ struct DelayedReset {
+ thread_mutex mutex;
+ bool do_reset;
+ SessionParams session_params;
+ BufferParams buffer_params;
+ } delayed_reset_;
- bool steal_tile(RenderTile &tile, Device *tile_device, thread_scoped_lock &tile_lock);
- bool get_tile_stolen();
- bool acquire_tile(RenderTile &tile, Device *tile_device, uint tile_types);
- void update_tile_sample(RenderTile &tile);
- void release_tile(RenderTile &tile, const bool need_denoise);
+ void run();
- void map_neighbor_tiles(RenderTileNeighbors &neighbors, Device *tile_device);
- void unmap_neighbor_tiles(RenderTileNeighbors &neighbors, Device *tile_device);
+ /* Update for the new iteration of the main loop in the run implementation (run_main_render_loop).
+ *
+ * Will take care of the following things:
+ * - Delayed reset
+ * - Scene update
+ * - Tile manager advance
+ * - Render scheduler work request
+ *
+ * The updates are done in a proper order with proper locking around them, which guarantees
+ * that the device-side scene and render buffers are always in a consistent state.
+ *
+ * Returns render work which is to be rendered next. */
+ RenderWork run_update_for_next_iteration();
+
+ /* Wait for rendering to be unpaused, or for new tiles to arrive for rendering.
+ * Returns true if a new main render loop iteration is required after this function call.
+ *
+ * The `render_work` is the work which was scheduled by the render scheduler right before
+ * checking the pause. */
+ bool run_wait_for_work(const RenderWork &render_work);
+
+ void run_main_render_loop();
+
+ bool update_scene(int width, int height);
- bool device_use_gl_;
+ void update_status_time(bool show_pause = false, bool show_done = false);
- thread *session_thread_;
+ void do_delayed_reset();
- volatile bool display_outdated_;
+ int2 get_effective_tile_size() const;
- volatile bool gpu_draw_ready_;
- volatile bool gpu_need_display_buffer_update_;
- thread_condition_variable gpu_need_display_buffer_update_cond_;
+ thread *session_thread_;
- bool pause_;
- bool cancel_;
- bool new_work_added_;
+ bool pause_ = false;
+ bool cancel_ = false;
+ bool new_work_added_ = false;
thread_condition_variable pause_cond_;
thread_mutex pause_mutex_;
thread_mutex tile_mutex_;
thread_mutex buffers_mutex_;
- thread_mutex display_mutex_;
- thread_condition_variable denoising_cond_;
- thread_condition_variable tile_steal_cond_;
-
- double reset_time_;
- double last_update_time_;
- double last_display_time_;
-
- RenderTile stolen_tile_;
- typedef enum {
- NOT_STEALING, /* There currently is no tile stealing in progress. */
- WAITING_FOR_TILE, /* A device is waiting for another device to release a tile. */
- RELEASING_TILE, /* A device has releasing a stealable tile. */
- GOT_TILE /* A device has released a stealable tile, which is now stored in stolen_tile. */
- } TileStealingState;
- std::atomic<TileStealingState> tile_stealing_state_;
- int stealable_tiles_;
-
- /* progressive refine */
- bool update_progressive_refine(bool cancel);
+
+ TileManager tile_manager_;
+ BufferParams buffer_params_;
+
+ /* Render scheduler is used to get work to be rendered with the current big tile. */
+ RenderScheduler render_scheduler_;
+
+ /* Path tracer object.
+ *
+ * For interactive viewport rendering this is a single full-frame path tracer.
+ * For offline rendering it is a path tracer for the current big tile. */
+ unique_ptr<PathTrace> path_trace_;
};
CCL_NAMESPACE_END
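
The DelayedReset struct above implements a common hand-off pattern: the caller records the
requested parameters under a mutex and sets a flag, and the render thread applies them at a safe
point of its own loop (do_delayed_reset() in session.cpp). A small self-contained sketch of the
same pattern, with placeholder types that are not part of the actual Cycles code:

  #include <mutex>

  struct ParamsSketch {
    int samples = 0;
  };

  class DelayedResetSketch {
   public:
    /* Called from the main/UI thread: only record the request and return immediately. */
    void request(const ParamsSketch &params)
    {
      std::lock_guard<std::mutex> lock(mutex_);
      do_reset_ = true;
      params_ = params;
    }

    /* Called from the render thread at a point where no rendering is in flight. */
    bool apply_if_requested(ParamsSketch &active_params)
    {
      std::lock_guard<std::mutex> lock(mutex_);
      if (!do_reset_) {
        return false;
      }
      do_reset_ = false;
      active_params = params_;
      return true;
    }

   private:
    std::mutex mutex_;
    bool do_reset_ = false;
    ParamsSketch params_;
  };

  int main()
  {
    DelayedResetSketch delayed_reset;
    ParamsSketch requested;
    ParamsSketch active;

    requested.samples = 128;
    delayed_reset.request(requested);         /* e.g. from Session::reset(). */
    delayed_reset.apply_if_requested(active); /* e.g. from do_delayed_reset(). */
    return 0;
  }
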
diff --git a/intern/cycles/render/shader.cpp b/intern/cycles/render/shader.cpp
index 59b60904746..f6b23606e58 100644
--- a/intern/cycles/render/shader.cpp
+++ b/intern/cycles/render/shader.cpp
@@ -203,6 +203,7 @@ Shader::Shader() : Node(get_node_type())
has_surface = false;
has_surface_transparent = false;
has_surface_emission = false;
+ has_surface_raytrace = false;
has_surface_bssrdf = false;
has_volume = false;
has_displacement = false;
@@ -485,7 +486,7 @@ void ShaderManager::device_update(Device *device,
device_update_specific(device, dscene, scene, progress);
}
-void ShaderManager::device_update_common(Device *device,
+void ShaderManager::device_update_common(Device * /*device*/,
DeviceScene *dscene,
Scene *scene,
Progress & /*progress*/)
@@ -508,6 +509,8 @@ void ShaderManager::device_update_common(Device *device,
flag |= SD_HAS_EMISSION;
if (shader->has_surface_transparent && shader->get_use_transparent_shadow())
flag |= SD_HAS_TRANSPARENT_SHADOW;
+ if (shader->has_surface_raytrace)
+ flag |= SD_HAS_RAYTRACE;
if (shader->has_volume) {
flag |= SD_HAS_VOLUME;
has_volumes = true;
@@ -528,12 +531,10 @@ void ShaderManager::device_update_common(Device *device,
flag |= SD_NEED_VOLUME_ATTRIBUTES;
if (shader->has_bssrdf_bump)
flag |= SD_HAS_BSSRDF_BUMP;
- if (device->info.has_volume_decoupled) {
- if (shader->get_volume_sampling_method() == VOLUME_SAMPLING_EQUIANGULAR)
- flag |= SD_VOLUME_EQUIANGULAR;
- if (shader->get_volume_sampling_method() == VOLUME_SAMPLING_MULTIPLE_IMPORTANCE)
- flag |= SD_VOLUME_MIS;
- }
+ if (shader->get_volume_sampling_method() == VOLUME_SAMPLING_EQUIANGULAR)
+ flag |= SD_VOLUME_EQUIANGULAR;
+ if (shader->get_volume_sampling_method() == VOLUME_SAMPLING_MULTIPLE_IMPORTANCE)
+ flag |= SD_VOLUME_MIS;
if (shader->get_volume_interpolation_method() == VOLUME_INTERPOLATION_CUBIC)
flag |= SD_VOLUME_CUBIC;
if (shader->has_bump)
@@ -682,39 +683,35 @@ void ShaderManager::add_default(Scene *scene)
}
}
-void ShaderManager::get_requested_graph_features(ShaderGraph *graph,
- DeviceRequestedFeatures *requested_features)
+uint ShaderManager::get_graph_kernel_features(ShaderGraph *graph)
{
+ uint kernel_features = 0;
+
foreach (ShaderNode *node, graph->nodes) {
- requested_features->max_nodes_group = max(requested_features->max_nodes_group,
- node->get_group());
- requested_features->nodes_features |= node->get_feature();
+ kernel_features |= node->get_feature();
if (node->special_type == SHADER_SPECIAL_TYPE_CLOSURE) {
BsdfBaseNode *bsdf_node = static_cast<BsdfBaseNode *>(node);
if (CLOSURE_IS_VOLUME(bsdf_node->get_closure_type())) {
- requested_features->nodes_features |= NODE_FEATURE_VOLUME;
+ kernel_features |= KERNEL_FEATURE_NODE_VOLUME;
}
else if (CLOSURE_IS_PRINCIPLED(bsdf_node->get_closure_type())) {
- requested_features->use_principled = true;
+ kernel_features |= KERNEL_FEATURE_PRINCIPLED;
}
}
if (node->has_surface_bssrdf()) {
- requested_features->use_subsurface = true;
+ kernel_features |= KERNEL_FEATURE_SUBSURFACE;
}
if (node->has_surface_transparent()) {
- requested_features->use_transparent = true;
- }
- if (node->has_raytrace()) {
- requested_features->use_shader_raytrace = true;
+ kernel_features |= KERNEL_FEATURE_TRANSPARENT;
}
}
+
+ return kernel_features;
}
-void ShaderManager::get_requested_features(Scene *scene,
- DeviceRequestedFeatures *requested_features)
+uint ShaderManager::get_kernel_features(Scene *scene)
{
- requested_features->max_nodes_group = NODE_GROUP_LEVEL_0;
- requested_features->nodes_features = 0;
+ uint kernel_features = KERNEL_FEATURE_NODE_BSDF | KERNEL_FEATURE_NODE_EMISSION;
for (int i = 0; i < scene->shaders.size(); i++) {
Shader *shader = scene->shaders[i];
if (!shader->reference_count()) {
@@ -722,21 +719,22 @@ void ShaderManager::get_requested_features(Scene *scene,
}
/* Gather requested features from all the nodes from the graph nodes. */
- get_requested_graph_features(shader->graph, requested_features);
+ kernel_features |= get_graph_kernel_features(shader->graph);
ShaderNode *output_node = shader->graph->output();
if (output_node->input("Displacement")->link != NULL) {
- requested_features->nodes_features |= NODE_FEATURE_BUMP;
+ kernel_features |= KERNEL_FEATURE_NODE_BUMP;
if (shader->get_displacement_method() == DISPLACE_BOTH) {
- requested_features->nodes_features |= NODE_FEATURE_BUMP_STATE;
- requested_features->max_nodes_group = max(requested_features->max_nodes_group,
- NODE_GROUP_LEVEL_1);
+ kernel_features |= KERNEL_FEATURE_NODE_BUMP_STATE;
}
}
/* On top of volume nodes, also check if we need volume sampling because
- * e.g. an Emission node would slip through the NODE_FEATURE_VOLUME check */
- if (shader->has_volume)
- requested_features->use_volume |= true;
+ * e.g. an Emission node would slip through the KERNEL_FEATURE_NODE_VOLUME check */
+ if (shader->has_volume) {
+ kernel_features |= KERNEL_FEATURE_VOLUME;
+ }
}
+
+ return kernel_features;
}
void ShaderManager::free_memory()
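
The get_kernel_features() refactor above replaces the DeviceRequestedFeatures struct with a plain
bitmask of KERNEL_FEATURE_* flags. A small self-contained sketch of the idea, with invented flag
names and values (the real constants live in the kernel headers):

  #include <cstdint>
  #include <cstdio>

  /* Invented example flags; not the real KERNEL_FEATURE_* values. */
  enum : uint32_t {
    FEATURE_BSDF = 1u << 0,
    FEATURE_EMISSION = 1u << 1,
    FEATURE_VOLUME = 1u << 2,
    FEATURE_SUBSURFACE = 1u << 3,
  };

  static uint32_t gather_features(bool any_volume_shader, bool any_bssrdf_node)
  {
    /* Start from the features that every scene needs, then OR in per-shader ones. */
    uint32_t features = FEATURE_BSDF | FEATURE_EMISSION;
    if (any_volume_shader) {
      features |= FEATURE_VOLUME;
    }
    if (any_bssrdf_node) {
      features |= FEATURE_SUBSURFACE;
    }
    return features;
  }

  int main()
  {
    const uint32_t features = gather_features(true, false);
    std::printf("needs volume kernels: %d\n", (features & FEATURE_VOLUME) != 0); /* Prints 1. */
    return 0;
  }
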
diff --git a/intern/cycles/render/shader.h b/intern/cycles/render/shader.h
index c65cac351a4..5f9adea3949 100644
--- a/intern/cycles/render/shader.h
+++ b/intern/cycles/render/shader.h
@@ -38,7 +38,6 @@ CCL_NAMESPACE_BEGIN
class Device;
class DeviceScene;
-class DeviceRequestedFeatures;
class Mesh;
class Progress;
class Scene;
@@ -117,6 +116,7 @@ class Shader : public Node {
bool has_surface;
bool has_surface_emission;
bool has_surface_transparent;
+ bool has_surface_raytrace;
bool has_volume;
bool has_displacement;
bool has_surface_bssrdf;
@@ -216,7 +216,7 @@ class ShaderManager {
static void add_default(Scene *scene);
/* Selective nodes compilation. */
- void get_requested_features(Scene *scene, DeviceRequestedFeatures *requested_features);
+ uint get_kernel_features(Scene *scene);
static void free_memory();
@@ -244,8 +244,7 @@ class ShaderManager {
size_t beckmann_table_offset;
- void get_requested_graph_features(ShaderGraph *graph,
- DeviceRequestedFeatures *requested_features);
+ uint get_graph_kernel_features(ShaderGraph *graph);
thread_spin_lock attribute_lock_;
diff --git a/intern/cycles/render/stats.cpp b/intern/cycles/render/stats.cpp
index 2c6273842e2..73eb7e21ff9 100644
--- a/intern/cycles/render/stats.cpp
+++ b/intern/cycles/render/stats.cpp
@@ -264,53 +264,34 @@ void RenderStats::collect_profiling(Scene *scene, Profiler &prof)
has_profiling = true;
kernel = NamedNestedSampleStats("Total render time", prof.get_event(PROFILING_UNKNOWN));
-
kernel.add_entry("Ray setup", prof.get_event(PROFILING_RAY_SETUP));
- kernel.add_entry("Result writing", prof.get_event(PROFILING_WRITE_RESULT));
-
- NamedNestedSampleStats &integrator = kernel.add_entry("Path integration",
- prof.get_event(PROFILING_PATH_INTEGRATE));
- integrator.add_entry("Scene intersection", prof.get_event(PROFILING_SCENE_INTERSECT));
- integrator.add_entry("Indirect emission", prof.get_event(PROFILING_INDIRECT_EMISSION));
- integrator.add_entry("Volumes", prof.get_event(PROFILING_VOLUME));
-
- NamedNestedSampleStats &shading = integrator.add_entry("Shading", 0);
- shading.add_entry("Shader Setup", prof.get_event(PROFILING_SHADER_SETUP));
- shading.add_entry("Shader Eval", prof.get_event(PROFILING_SHADER_EVAL));
- shading.add_entry("Shader Apply", prof.get_event(PROFILING_SHADER_APPLY));
- shading.add_entry("Ambient Occlusion", prof.get_event(PROFILING_AO));
- shading.add_entry("Subsurface", prof.get_event(PROFILING_SUBSURFACE));
-
- integrator.add_entry("Connect Light", prof.get_event(PROFILING_CONNECT_LIGHT));
- integrator.add_entry("Surface Bounce", prof.get_event(PROFILING_SURFACE_BOUNCE));
-
- NamedNestedSampleStats &intersection = kernel.add_entry("Intersection", 0);
- intersection.add_entry("Full Intersection", prof.get_event(PROFILING_INTERSECT));
- intersection.add_entry("Local Intersection", prof.get_event(PROFILING_INTERSECT_LOCAL));
- intersection.add_entry("Shadow All Intersection",
- prof.get_event(PROFILING_INTERSECT_SHADOW_ALL));
- intersection.add_entry("Volume Intersection", prof.get_event(PROFILING_INTERSECT_VOLUME));
- intersection.add_entry("Volume All Intersection",
- prof.get_event(PROFILING_INTERSECT_VOLUME_ALL));
-
- NamedNestedSampleStats &closure = kernel.add_entry("Closures", 0);
- closure.add_entry("Surface Closure Evaluation", prof.get_event(PROFILING_CLOSURE_EVAL));
- closure.add_entry("Surface Closure Sampling", prof.get_event(PROFILING_CLOSURE_SAMPLE));
- closure.add_entry("Volume Closure Evaluation", prof.get_event(PROFILING_CLOSURE_VOLUME_EVAL));
- closure.add_entry("Volume Closure Sampling", prof.get_event(PROFILING_CLOSURE_VOLUME_SAMPLE));
-
- NamedNestedSampleStats &denoising = kernel.add_entry("Denoising",
- prof.get_event(PROFILING_DENOISING));
- denoising.add_entry("Construct Transform",
- prof.get_event(PROFILING_DENOISING_CONSTRUCT_TRANSFORM));
- denoising.add_entry("Reconstruct", prof.get_event(PROFILING_DENOISING_RECONSTRUCT));
-
- NamedNestedSampleStats &prefilter = denoising.add_entry("Prefiltering", 0);
- prefilter.add_entry("Divide Shadow", prof.get_event(PROFILING_DENOISING_DIVIDE_SHADOW));
- prefilter.add_entry("Non-Local means", prof.get_event(PROFILING_DENOISING_NON_LOCAL_MEANS));
- prefilter.add_entry("Get Feature", prof.get_event(PROFILING_DENOISING_GET_FEATURE));
- prefilter.add_entry("Detect Outliers", prof.get_event(PROFILING_DENOISING_DETECT_OUTLIERS));
- prefilter.add_entry("Combine Halves", prof.get_event(PROFILING_DENOISING_COMBINE_HALVES));
+ kernel.add_entry("Intersect Closest", prof.get_event(PROFILING_INTERSECT_CLOSEST));
+ kernel.add_entry("Intersect Shadow", prof.get_event(PROFILING_INTERSECT_SHADOW));
+ kernel.add_entry("Intersect Subsurface", prof.get_event(PROFILING_INTERSECT_SUBSURFACE));
+ kernel.add_entry("Intersect Volume Stack", prof.get_event(PROFILING_INTERSECT_VOLUME_STACK));
+
+ NamedNestedSampleStats &surface = kernel.add_entry("Shade Surface", 0);
+ surface.add_entry("Setup", prof.get_event(PROFILING_SHADE_SURFACE_SETUP));
+ surface.add_entry("Shader Evaluation", prof.get_event(PROFILING_SHADE_SURFACE_EVAL));
+ surface.add_entry("Render Passes", prof.get_event(PROFILING_SHADE_SURFACE_PASSES));
+ surface.add_entry("Direct Light", prof.get_event(PROFILING_SHADE_SURFACE_DIRECT_LIGHT));
+ surface.add_entry("Indirect Light", prof.get_event(PROFILING_SHADE_SURFACE_INDIRECT_LIGHT));
+ surface.add_entry("Ambient Occlusion", prof.get_event(PROFILING_SHADE_SURFACE_AO));
+
+ NamedNestedSampleStats &volume = kernel.add_entry("Shade Volume", 0);
+ volume.add_entry("Setup", prof.get_event(PROFILING_SHADE_VOLUME_SETUP));
+ volume.add_entry("Integrate", prof.get_event(PROFILING_SHADE_VOLUME_INTEGRATE));
+ volume.add_entry("Direct Light", prof.get_event(PROFILING_SHADE_VOLUME_DIRECT_LIGHT));
+ volume.add_entry("Indirect Light", prof.get_event(PROFILING_SHADE_VOLUME_INDIRECT_LIGHT));
+
+ NamedNestedSampleStats &shadow = kernel.add_entry("Shade Shadow", 0);
+ shadow.add_entry("Setup", prof.get_event(PROFILING_SHADE_SHADOW_SETUP));
+ shadow.add_entry("Surface", prof.get_event(PROFILING_SHADE_SHADOW_SURFACE));
+ shadow.add_entry("Volume", prof.get_event(PROFILING_SHADE_SHADOW_VOLUME));
+
+ NamedNestedSampleStats &light = kernel.add_entry("Shade Light", 0);
+ light.add_entry("Setup", prof.get_event(PROFILING_SHADE_LIGHT_SETUP));
+ light.add_entry("Shader Evaluation", prof.get_event(PROFILING_SHADE_LIGHT_EVAL));
shaders.entries.clear();
foreach (Shader *shader, scene->shaders) {
diff --git a/intern/cycles/render/svm.cpp b/intern/cycles/render/svm.cpp
index dcb3976e15c..2379eb775a0 100644
--- a/intern/cycles/render/svm.cpp
+++ b/intern/cycles/render/svm.cpp
@@ -446,6 +446,8 @@ void SVMCompiler::generate_node(ShaderNode *node, ShaderNodeSet &done)
if (current_type == SHADER_TYPE_SURFACE) {
if (node->has_spatial_varying())
current_shader->has_surface_spatial_varying = true;
+ if (node->get_feature() & KERNEL_FEATURE_NODE_RAYTRACE)
+ current_shader->has_surface_raytrace = true;
}
else if (current_type == SHADER_TYPE_VOLUME) {
if (node->has_spatial_varying())
@@ -492,6 +494,13 @@ void SVMCompiler::generate_svm_nodes(const ShaderNodeSet &nodes, CompilerState *
void SVMCompiler::generate_closure_node(ShaderNode *node, CompilerState *state)
{
+ /* Skip generating closures that are not supported or needed for a particular
+ * type of shader. For example, a BSDF in a volume shader. */
+ const int node_feature = node->get_feature();
+ if ((state->node_feature_mask & node_feature) != node_feature) {
+ return;
+ }
+
/* execute dependencies for closure */
foreach (ShaderInput *in, node->inputs) {
if (in->link != NULL) {
@@ -555,7 +564,7 @@ void SVMCompiler::find_aov_nodes_and_dependencies(ShaderNodeSet &aov_nodes,
foreach (ShaderNode *node, graph->nodes) {
if (node->special_type == SHADER_SPECIAL_TYPE_OUTPUT_AOV) {
OutputAOVNode *aov_node = static_cast<OutputAOVNode *>(node);
- if (aov_node->slot >= 0) {
+ if (aov_node->offset >= 0) {
aov_nodes.insert(aov_node);
foreach (ShaderInput *in, node->inputs) {
if (in->link != NULL) {
@@ -785,17 +794,21 @@ void SVMCompiler::compile_type(Shader *shader, ShaderGraph *graph, ShaderType ty
case SHADER_TYPE_SURFACE: /* generate surface shader */
generate = true;
shader->has_surface = true;
+ state.node_feature_mask = KERNEL_FEATURE_NODE_MASK_SURFACE;
break;
case SHADER_TYPE_VOLUME: /* generate volume shader */
generate = true;
shader->has_volume = true;
+ state.node_feature_mask = KERNEL_FEATURE_NODE_MASK_VOLUME;
break;
case SHADER_TYPE_DISPLACEMENT: /* generate displacement shader */
generate = true;
shader->has_displacement = true;
+ state.node_feature_mask = KERNEL_FEATURE_NODE_MASK_DISPLACEMENT;
break;
case SHADER_TYPE_BUMP: /* generate bump shader */
generate = true;
+ state.node_feature_mask = KERNEL_FEATURE_NODE_MASK_BUMP;
break;
default:
break;
@@ -867,6 +880,7 @@ void SVMCompiler::compile(Shader *shader, array<int4> &svm_nodes, int index, Sum
shader->has_surface = false;
shader->has_surface_emission = false;
shader->has_surface_transparent = false;
+ shader->has_surface_raytrace = false;
shader->has_surface_bssrdf = false;
shader->has_bump = has_bump;
shader->has_bssrdf_bump = has_bump;
@@ -964,6 +978,7 @@ SVMCompiler::CompilerState::CompilerState(ShaderGraph *graph)
max_id = max(node->id, max_id);
}
nodes_done_flag.resize(max_id + 1, false);
+ node_feature_mask = 0;
}
CCL_NAMESPACE_END
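
The new node_feature_mask check in generate_closure_node() generates a closure only when every
feature bit the node requires is allowed by the mask of the shader type being compiled (surface,
volume, displacement or bump). A minimal sketch of that test, using invented bit values rather
than the real KERNEL_FEATURE_NODE_* constants:

  #include <cstdint>
  #include <cstdio>

  /* Invented example bits; the real masks are the KERNEL_FEATURE_NODE_MASK_* constants. */
  enum : uint32_t {
    NODE_FEATURE_BSDF = 1u << 0,
    NODE_FEATURE_VOLUME = 1u << 1,
  };

  /* Generate a node only when all of its required feature bits are allowed by the mask. */
  static bool node_allowed(uint32_t node_feature_mask, uint32_t node_features)
  {
    return (node_feature_mask & node_features) == node_features;
  }

  int main()
  {
    const uint32_t volume_shader_mask = NODE_FEATURE_VOLUME;
    std::printf("%d\n", node_allowed(volume_shader_mask, NODE_FEATURE_BSDF));   /* 0: skipped */
    std::printf("%d\n", node_allowed(volume_shader_mask, NODE_FEATURE_VOLUME)); /* 1: generated */
    return 0;
  }
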
diff --git a/intern/cycles/render/svm.h b/intern/cycles/render/svm.h
index d23ff3e2a47..0353c393ae4 100644
--- a/intern/cycles/render/svm.h
+++ b/intern/cycles/render/svm.h
@@ -192,6 +192,9 @@ class SVMCompiler {
* all areas to use this flags array.
*/
vector<bool> nodes_done_flag;
+
+ /* Node features that can be compiled. */
+ uint node_feature_mask;
};
void stack_clear_temporary(ShaderNode *node);
diff --git a/intern/cycles/render/tile.cpp b/intern/cycles/render/tile.cpp
index 375c9fd8e09..28910bffa7b 100644
--- a/intern/cycles/render/tile.cpp
+++ b/intern/cycles/render/tile.cpp
@@ -16,601 +16,559 @@
#include "render/tile.h"
+#include <atomic>
+
+#include "graph/node.h"
+#include "render/background.h"
+#include "render/film.h"
+#include "render/integrator.h"
+#include "render/scene.h"
#include "util/util_algorithm.h"
#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_path.h"
+#include "util/util_string.h"
+#include "util/util_system.h"
#include "util/util_types.h"
CCL_NAMESPACE_BEGIN
-namespace {
+/* --------------------------------------------------------------------
+ * Internal functions.
+ */
-class TileComparator {
- public:
- TileComparator(TileOrder order_, int2 center_, Tile *tiles_)
- : order(order_), center(center_), tiles(tiles_)
- {
- }
+static const char *ATTR_PASSES_COUNT = "cycles.passes.count";
+static const char *ATTR_PASS_SOCKET_PREFIX_FORMAT = "cycles.passes.%d.";
+static const char *ATTR_BUFFER_SOCKET_PREFIX = "cycles.buffer.";
+static const char *ATTR_DENOISE_SOCKET_PREFIX = "cycles.denoise.";
- bool operator()(int a, int b)
- {
- switch (order) {
- case TILE_CENTER: {
- float2 dist_a = make_float2(center.x - (tiles[a].x + tiles[a].w / 2),
- center.y - (tiles[a].y + tiles[a].h / 2));
- float2 dist_b = make_float2(center.x - (tiles[b].x + tiles[b].w / 2),
- center.y - (tiles[b].y + tiles[b].h / 2));
- return dot(dist_a, dist_a) < dot(dist_b, dist_b);
- }
- case TILE_LEFT_TO_RIGHT:
- return (tiles[a].x == tiles[b].x) ? (tiles[a].y < tiles[b].y) : (tiles[a].x < tiles[b].x);
- case TILE_RIGHT_TO_LEFT:
- return (tiles[a].x == tiles[b].x) ? (tiles[a].y < tiles[b].y) : (tiles[a].x > tiles[b].x);
- case TILE_TOP_TO_BOTTOM:
- return (tiles[a].y == tiles[b].y) ? (tiles[a].x < tiles[b].x) : (tiles[a].y > tiles[b].y);
- case TILE_BOTTOM_TO_TOP:
- default:
- return (tiles[a].y == tiles[b].y) ? (tiles[a].x < tiles[b].x) : (tiles[a].y < tiles[b].y);
+/* Global counter of TileManager object instances. */
+static std::atomic<uint64_t> g_instance_index = 0;
+
+/* Construct names of EXR channels so that the order of all channels matches the exact offsets
+ * of the given passes in the render buffers.
+ *
+ * Returns `std` datatypes so that the result can be assigned directly to OIIO's `ImageSpec`. */
+static std::vector<std::string> exr_channel_names_for_passes(const BufferParams &buffer_params)
+{
+ static const char *component_suffixes[] = {"R", "G", "B", "A"};
+
+ int pass_index = 0;
+ int num_channels = 0;
+ std::vector<std::string> channel_names;
+ for (const BufferPass &pass : buffer_params.passes) {
+ if (pass.offset == PASS_UNUSED) {
+ continue;
}
- }
- protected:
- TileOrder order;
- int2 center;
- Tile *tiles;
-};
+ const PassInfo pass_info = pass.get_info();
+ num_channels += pass_info.num_components;
-inline int2 hilbert_index_to_pos(int n, int d)
-{
- int2 r, xy = make_int2(0, 0);
- for (int s = 1; s < n; s *= 2) {
- r.x = (d >> 1) & 1;
- r.y = (d ^ r.x) & 1;
- if (!r.y) {
- if (r.x) {
- xy = make_int2(s - 1, s - 1) - xy;
- }
- swap(xy.x, xy.y);
+ /* EXR canonically expects the first part of channel names to be sorted alphabetically, which
+ * is not guaranteed to be the case with pass names. Assign a prefix based on the pass index
+ * with a fixed width to ensure ordering. This makes it possible to dump existing render
+ * buffer memory to disk and read it back without doing extra mapping. */
+ const string prefix = string_printf("%08d", pass_index);
+
+ const string channel_name_prefix = prefix + string(pass.name) + ".";
+
+ for (int i = 0; i < pass_info.num_components; ++i) {
+ channel_names.push_back(channel_name_prefix + component_suffixes[i]);
}
- xy += r * make_int2(s, s);
- d >>= 2;
+
+ ++pass_index;
}
- return xy;
+
+ return channel_names;
}
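As a concrete illustration of the fixed-width prefix scheme above, a hypothetical pass list of "Combined" (4 components) followed by "Depth" (1 component) produces channel names that sort in pass order. The sketch below uses only the standard library; BufferPass, PassInfo and string_printf are Cycles types and are not used here.

  #include <cstdio>
  #include <string>
  #include <utility>
  #include <vector>

  static std::vector<std::string> sketch_channel_names()
  {
    static const char *component_suffixes[] = {"R", "G", "B", "A"};
    /* Hypothetical passes: name and number of components. */
    const std::pair<std::string, int> passes[] = {{"Combined", 4}, {"Depth", 1}};

    std::vector<std::string> names;
    int pass_index = 0;
    for (const auto &pass : passes) {
      char prefix[16];
      std::snprintf(prefix, sizeof(prefix), "%08d", pass_index); /* Fixed-width prefix. */
      for (int i = 0; i < pass.second; ++i) {
        names.push_back(std::string(prefix) + pass.first + "." + component_suffixes[i]);
      }
      ++pass_index;
    }
    /* Yields: 00000000Combined.R ... 00000000Combined.A, 00000001Depth.R */
    return names;
  }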
-enum SpiralDirection {
- DIRECTION_UP,
- DIRECTION_LEFT,
- DIRECTION_DOWN,
- DIRECTION_RIGHT,
-};
-
-} /* namespace */
-
-TileManager::TileManager(bool progressive_,
- int num_samples_,
- int2 tile_size_,
- int start_resolution_,
- bool preserve_tile_device_,
- bool background_,
- TileOrder tile_order_,
- int num_devices_,
- int pixel_size_)
+inline string node_socket_attribute_name(const SocketType &socket, const string &attr_name_prefix)
{
- progressive = progressive_;
- tile_size = tile_size_;
- tile_order = tile_order_;
- start_resolution = start_resolution_;
- pixel_size = pixel_size_;
- slice_overlap = 0;
- num_samples = num_samples_;
- num_devices = num_devices_;
- preserve_tile_device = preserve_tile_device_;
- background = background_;
- schedule_denoising = false;
-
- range_start_sample = 0;
- range_num_samples = -1;
-
- BufferParams buffer_params;
- reset(buffer_params, 0);
+ return attr_name_prefix + string(socket.name);
}
-TileManager::~TileManager()
+template<typename ValidateValueFunc, typename GetValueFunc>
+static bool node_socket_generic_to_image_spec_atttributes(
+ ImageSpec *image_spec,
+ const Node *node,
+ const SocketType &socket,
+ const string &attr_name_prefix,
+ const ValidateValueFunc &validate_value_func,
+ const GetValueFunc &get_value_func)
{
+ if (!validate_value_func(node, socket)) {
+ return false;
+ }
+
+ image_spec->attribute(node_socket_attribute_name(socket, attr_name_prefix),
+ get_value_func(node, socket));
+
+ return true;
}
-void TileManager::device_free()
+static bool node_socket_to_image_spec_atttributes(ImageSpec *image_spec,
+ const Node *node,
+ const SocketType &socket,
+ const string &attr_name_prefix)
{
- if (schedule_denoising || progressive) {
- for (int i = 0; i < state.tiles.size(); i++) {
- delete state.tiles[i].buffers;
- state.tiles[i].buffers = NULL;
+ const string attr_name = node_socket_attribute_name(socket, attr_name_prefix);
+
+ switch (socket.type) {
+ case SocketType::ENUM: {
+ const ustring value = node->get_string(socket);
+
+ /* Validate that the node is consistent with the node type definition. */
+ const NodeEnum &enum_values = *socket.enum_values;
+ if (!enum_values.exists(value)) {
+ LOG(DFATAL) << "Node enum contains invalid value " << value;
+ return false;
+ }
+
+ image_spec->attribute(attr_name, value);
+
+ return true;
}
- }
- state.tiles.clear();
+ case SocketType::STRING:
+ image_spec->attribute(attr_name, node->get_string(socket));
+ return true;
+
+ case SocketType::INT:
+ image_spec->attribute(attr_name, node->get_int(socket));
+ return true;
+
+ case SocketType::FLOAT:
+ image_spec->attribute(attr_name, node->get_float(socket));
+ return true;
+
+ case SocketType::BOOLEAN:
+ image_spec->attribute(attr_name, node->get_bool(socket));
+ return true;
+
+ default:
+ LOG(DFATAL) << "Unhandled socket type " << socket.type << ", should never happen.";
+ return false;
+ }
}
-static int get_divider(int w, int h, int start_resolution)
+static bool node_socket_from_image_spec_atttributes(Node *node,
+ const SocketType &socket,
+ const ImageSpec &image_spec,
+ const string &attr_name_prefix)
{
- int divider = 1;
- if (start_resolution != INT_MAX) {
- while (w * h > start_resolution * start_resolution) {
- w = max(1, w / 2);
- h = max(1, h / 2);
+ const string attr_name = node_socket_attribute_name(socket, attr_name_prefix);
+
+ switch (socket.type) {
+ case SocketType::ENUM: {
+ /* TODO(sergey): Avoid construction of `ustring` by using `string_view` in the Node API. */
+ const ustring value(image_spec.get_string_attribute(attr_name, ""));
+
+ /* Validate that the node is consistent with the node type definition. */
+ const NodeEnum &enum_values = *socket.enum_values;
+ if (!enum_values.exists(value)) {
+ LOG(ERROR) << "Invalid enumerator value " << value;
+ return false;
+ }
- divider <<= 1;
+ node->set(socket, enum_values[value]);
+
+ return true;
}
+
+ case SocketType::STRING:
+ /* TODO(sergey): Avoid construction of `ustring` by using `string_view` in the Node API. */
+ node->set(socket, ustring(image_spec.get_string_attribute(attr_name, "")));
+ return true;
+
+ case SocketType::INT:
+ node->set(socket, image_spec.get_int_attribute(attr_name, 0));
+ return true;
+
+ case SocketType::FLOAT:
+ node->set(socket, image_spec.get_float_attribute(attr_name, 0));
+ return true;
+
+ case SocketType::BOOLEAN:
+ node->set(socket, static_cast<bool>(image_spec.get_int_attribute(attr_name, 0)));
+ return true;
+
+ default:
+ LOG(DFATAL) << "Unhandled socket type " << socket.type << ", should never happen.";
+ return false;
}
- return divider;
}
-void TileManager::reset(BufferParams &params_, int num_samples_)
+static bool node_to_image_spec_atttributes(ImageSpec *image_spec,
+ const Node *node,
+ const string &attr_name_prefix)
{
- params = params_;
-
- set_samples(num_samples_);
-
- state.buffer = BufferParams();
- state.sample = range_start_sample - 1;
- state.num_tiles = 0;
- state.num_samples = 0;
- state.resolution_divider = get_divider(params.width, params.height, start_resolution);
- state.render_tiles.clear();
- state.denoising_tiles.clear();
- device_free();
+ for (const SocketType &socket : node->type->inputs) {
+ if (!node_socket_to_image_spec_atttributes(image_spec, node, socket, attr_name_prefix)) {
+ return false;
+ }
+ }
+
+ return true;
}
-void TileManager::set_samples(int num_samples_)
+static bool node_from_image_spec_atttributes(Node *node,
+ const ImageSpec &image_spec,
+ const string &attr_name_prefix)
{
- num_samples = num_samples_;
+ for (const SocketType &socket : node->type->inputs) {
+ if (!node_socket_from_image_spec_atttributes(node, socket, image_spec, attr_name_prefix)) {
+ return false;
+ }
+ }
+
+ return true;
+}
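For reference, the round trip implemented above ultimately reduces to OpenImageIO's typed attribute accessors. A minimal OIIO-only sketch (Node and SocketType are Cycles types and are omitted; the attribute name below is hypothetical):

  #include <OpenImageIO/imageio.h>

  static bool sketch_attribute_round_trip()
  {
    OIIO::ImageSpec spec(16, 16, 4, OIIO::TypeDesc::FLOAT);
    spec.attribute("cycles.buffer.window_x", 42);                           /* Write. */
    const int value = spec.get_int_attribute("cycles.buffer.window_x", 0);  /* Read back. */
    return value == 42;
  }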
- /* No real progress indication is possible when using unlimited samples. */
- if (num_samples == INT_MAX) {
- state.total_pixel_samples = 0;
+static bool buffer_params_to_image_spec_atttributes(ImageSpec *image_spec,
+ const BufferParams &buffer_params)
+{
+ if (!node_to_image_spec_atttributes(image_spec, &buffer_params, ATTR_BUFFER_SOCKET_PREFIX)) {
+ return false;
}
- else {
- uint64_t pixel_samples = 0;
- /* While rendering in the viewport, the initial preview resolution is increased to the native
- * resolution before the actual rendering begins. Therefore, additional pixel samples will be
- * rendered. */
- int divider = max(get_divider(params.width, params.height, start_resolution) / 2, pixel_size);
- while (divider > pixel_size) {
- int image_w = max(1, params.width / divider);
- int image_h = max(1, params.height / divider);
- pixel_samples += image_w * image_h;
- divider >>= 1;
- }
- int image_w = max(1, params.width / divider);
- int image_h = max(1, params.height / divider);
- state.total_pixel_samples = pixel_samples +
- (uint64_t)get_num_effective_samples() * image_w * image_h;
- if (schedule_denoising) {
- state.total_pixel_samples += params.width * params.height;
+ /* Passes storage is not covered by the node sockets, so "expand" the loop manually. */
+
+ const int num_passes = buffer_params.passes.size();
+ image_spec->attribute(ATTR_PASSES_COUNT, num_passes);
+
+ for (int pass_index = 0; pass_index < num_passes; ++pass_index) {
+ const string attr_name_prefix = string_printf(ATTR_PASS_SOCKET_PREFIX_FORMAT, pass_index);
+
+ const BufferPass *pass = &buffer_params.passes[pass_index];
+ if (!node_to_image_spec_atttributes(image_spec, pass, attr_name_prefix)) {
+ return false;
}
}
+
+ return true;
}
-/* If sliced is false, splits image into tiles and assigns equal amount of tiles to every render
- * device. If sliced is true, slice image into as much pieces as how many devices are rendering
- * this image. */
-int TileManager::gen_tiles(bool sliced)
+static bool buffer_params_from_image_spec_atttributes(BufferParams *buffer_params,
+ const ImageSpec &image_spec)
{
- int resolution = state.resolution_divider;
- int image_w = max(1, params.width / resolution);
- int image_h = max(1, params.height / resolution);
- int2 center = make_int2(image_w / 2, image_h / 2);
-
- int num = preserve_tile_device || sliced ? min(image_h, num_devices) : 1;
- int slice_num = sliced ? num : 1;
- int tile_w = (tile_size.x >= image_w) ? 1 : divide_up(image_w, tile_size.x);
-
- device_free();
- state.render_tiles.clear();
- state.denoising_tiles.clear();
- state.render_tiles.resize(num);
- state.denoising_tiles.resize(num);
- state.tile_stride = tile_w;
- vector<list<int>>::iterator tile_list;
- tile_list = state.render_tiles.begin();
-
- if (tile_order == TILE_HILBERT_SPIRAL) {
- assert(!sliced && slice_overlap == 0);
-
- int tile_h = (tile_size.y >= image_h) ? 1 : divide_up(image_h, tile_size.y);
- state.tiles.resize(tile_w * tile_h);
-
- /* Size of blocks in tiles, must be a power of 2 */
- const int hilbert_size = (max(tile_size.x, tile_size.y) <= 12) ? 8 : 4;
-
- int tiles_per_device = divide_up(tile_w * tile_h, num);
- int cur_device = 0, cur_tiles = 0;
-
- int2 block_size = tile_size * make_int2(hilbert_size, hilbert_size);
- /* Number of blocks to fill the image */
- int blocks_x = (block_size.x >= image_w) ? 1 : divide_up(image_w, block_size.x);
- int blocks_y = (block_size.y >= image_h) ? 1 : divide_up(image_h, block_size.y);
- int n = max(blocks_x, blocks_y) | 0x1; /* Side length of the spiral (must be odd) */
- /* Offset of spiral (to keep it centered) */
- int2 offset = make_int2((image_w - n * block_size.x) / 2, (image_h - n * block_size.y) / 2);
- offset = (offset / tile_size) * tile_size; /* Round to tile border. */
-
- int2 block = make_int2(0, 0); /* Current block */
- SpiralDirection prev_dir = DIRECTION_UP, dir = DIRECTION_UP;
- for (int i = 0;;) {
- /* Generate the tiles in the current block. */
- for (int hilbert_index = 0; hilbert_index < hilbert_size * hilbert_size; hilbert_index++) {
- int2 tile, hilbert_pos = hilbert_index_to_pos(hilbert_size, hilbert_index);
- /* Rotate block according to spiral direction. */
- if (prev_dir == DIRECTION_UP && dir == DIRECTION_UP) {
- tile = make_int2(hilbert_pos.y, hilbert_pos.x);
- }
- else if (dir == DIRECTION_LEFT || prev_dir == DIRECTION_LEFT) {
- tile = hilbert_pos;
- }
- else if (dir == DIRECTION_DOWN) {
- tile = make_int2(hilbert_size - 1 - hilbert_pos.y, hilbert_size - 1 - hilbert_pos.x);
- }
- else {
- tile = make_int2(hilbert_size - 1 - hilbert_pos.x, hilbert_size - 1 - hilbert_pos.y);
- }
-
- int2 pos = block * block_size + tile * tile_size + offset;
- /* Only add tiles which are in the image (tiles outside of the image can be generated since
- * the spiral is always square). */
- if (pos.x >= 0 && pos.y >= 0 && pos.x < image_w && pos.y < image_h) {
- int w = min(tile_size.x, image_w - pos.x);
- int h = min(tile_size.y, image_h - pos.y);
- int2 ipos = pos / tile_size;
- int idx = ipos.y * tile_w + ipos.x;
- state.tiles[idx] = Tile(idx, pos.x, pos.y, w, h, cur_device, Tile::RENDER);
- tile_list->push_front(idx);
- cur_tiles++;
-
- if (cur_tiles == tiles_per_device) {
- tile_list++;
- cur_tiles = 0;
- cur_device++;
- }
- }
- }
+ if (!node_from_image_spec_atttributes(buffer_params, image_spec, ATTR_BUFFER_SOCKET_PREFIX)) {
+ return false;
+ }
- /* Stop as soon as the spiral has reached the center block. */
- if (block.x == (n - 1) / 2 && block.y == (n - 1) / 2)
- break;
-
- /* Advance to next block. */
- prev_dir = dir;
- switch (dir) {
- case DIRECTION_UP:
- block.y++;
- if (block.y == (n - i - 1)) {
- dir = DIRECTION_LEFT;
- }
- break;
- case DIRECTION_LEFT:
- block.x++;
- if (block.x == (n - i - 1)) {
- dir = DIRECTION_DOWN;
- }
- break;
- case DIRECTION_DOWN:
- block.y--;
- if (block.y == i) {
- dir = DIRECTION_RIGHT;
- }
- break;
- case DIRECTION_RIGHT:
- block.x--;
- if (block.x == i + 1) {
- dir = DIRECTION_UP;
- i++;
- }
- break;
- }
- }
- return tile_w * tile_h;
+ /* Passes storage is not covered by the node sockets, so "expand" the loop manually. */
+
+ const int num_passes = image_spec.get_int_attribute(ATTR_PASSES_COUNT, 0);
+ if (num_passes == 0) {
+ LOG(ERROR) << "Missing passes count attribute.";
+ return false;
}
- int idx = 0;
- for (int slice = 0; slice < slice_num; slice++) {
- int slice_y = (image_h / slice_num) * slice;
- int slice_h = (slice == slice_num - 1) ? image_h - slice * (image_h / slice_num) :
- image_h / slice_num;
+ for (int pass_index = 0; pass_index < num_passes; ++pass_index) {
+ const string attr_name_prefix = string_printf(ATTR_PASS_SOCKET_PREFIX_FORMAT, pass_index);
- if (slice_overlap != 0) {
- int slice_y_offset = max(slice_y - slice_overlap, 0);
- slice_h = min(slice_y + slice_h + slice_overlap, image_h) - slice_y_offset;
- slice_y = slice_y_offset;
- }
+ BufferPass pass;
- int tile_h = (tile_size.y >= slice_h) ? 1 : divide_up(slice_h, tile_size.y);
-
- int tiles_per_device = divide_up(tile_w * tile_h, num);
- int cur_device = 0, cur_tiles = 0;
-
- for (int tile_y = 0; tile_y < tile_h; tile_y++) {
- for (int tile_x = 0; tile_x < tile_w; tile_x++, idx++) {
- int x = tile_x * tile_size.x;
- int y = tile_y * tile_size.y;
- int w = (tile_x == tile_w - 1) ? image_w - x : tile_size.x;
- int h = (tile_y == tile_h - 1) ? slice_h - y : tile_size.y;
-
- state.tiles.push_back(
- Tile(idx, x, y + slice_y, w, h, sliced ? slice : cur_device, Tile::RENDER));
- tile_list->push_back(idx);
-
- if (!sliced) {
- cur_tiles++;
-
- if (cur_tiles == tiles_per_device) {
- /* Tiles are already generated in Bottom-to-Top order, so no sort is necessary in that
- * case. */
- if (tile_order != TILE_BOTTOM_TO_TOP) {
- tile_list->sort(TileComparator(tile_order, center, &state.tiles[0]));
- }
- tile_list++;
- cur_tiles = 0;
- cur_device++;
- }
- }
- }
- }
- if (sliced) {
- tile_list++;
+ if (!node_from_image_spec_atttributes(&pass, image_spec, attr_name_prefix)) {
+ return false;
}
+
+ buffer_params->passes.emplace_back(std::move(pass));
}
- return idx;
+ buffer_params->update_passes();
+
+ return true;
}
-void TileManager::gen_render_tiles()
+/* Configure image specification for the given buffer parameters and passes.
+ *
+ * Image channels will be strictly ordered to match the content of the corresponding buffer, and
+ * the metadata will be set so that the render buffers and passes can be reconstructed from it.
+ *
+ * If the tile size is different from (0, 0), the image specification will be configured to use
+ * the given tile size for tiled IO. */
+static bool configure_image_spec_from_buffer(ImageSpec *image_spec,
+ const BufferParams &buffer_params,
+ const int2 tile_size = make_int2(0, 0))
{
- /* Regenerate just the render tiles for progressive render. */
- foreach (Tile &tile, state.tiles) {
- tile.state = Tile::RENDER;
- state.render_tiles[tile.device].push_back(tile.index);
+ const std::vector<std::string> channel_names = exr_channel_names_for_passes(buffer_params);
+ const int num_channels = channel_names.size();
+
+ *image_spec = ImageSpec(
+ buffer_params.width, buffer_params.height, num_channels, TypeDesc::FLOAT);
+
+ image_spec->channelnames = move(channel_names);
+
+ if (!buffer_params_to_image_spec_atttributes(image_spec, buffer_params)) {
+ return false;
+ }
+
+ if (tile_size.x != 0 || tile_size.y != 0) {
+ DCHECK_GT(tile_size.x, 0);
+ DCHECK_GT(tile_size.y, 0);
+
+ image_spec->tile_width = tile_size.x;
+ image_spec->tile_height = tile_size.y;
}
+
+ return true;
}
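A reduced OIIO-only sketch of the tiled specification set up above (channel names and the Cycles metadata attributes are omitted; the dimensions are placeholders):

  #include <OpenImageIO/imageio.h>

  static OIIO::ImageSpec sketch_tiled_spec(int width, int height, int num_channels,
                                           int tile_width, int tile_height)
  {
    OIIO::ImageSpec spec(width, height, num_channels, OIIO::TypeDesc::FLOAT);
    /* Non-zero tile dimensions switch the output to tiled IO. */
    spec.tile_width = tile_width;
    spec.tile_height = tile_height;
    return spec;
  }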
-void TileManager::set_tiles()
+/* --------------------------------------------------------------------
+ * Tile Manager.
+ */
+
+TileManager::TileManager()
{
- int resolution = state.resolution_divider;
- int image_w = max(1, params.width / resolution);
- int image_h = max(1, params.height / resolution);
+ /* Use the process ID to separate different processes.
+ * To ensure uniqueness from within a process, use a combination of the object address and an
+ * instance index. This solves the problem of the same address being re-used by another object,
+ * and the possible conflict when the counter overflows while there are still active instances
+ * of the class. */
+ const int tile_manager_id = g_instance_index.fetch_add(1, std::memory_order_relaxed);
+ tile_file_unique_part_ = to_string(system_self_process_id()) + "-" +
+ to_string(reinterpret_cast<uintptr_t>(this)) + "-" +
+ to_string(tile_manager_id);
+}
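A standalone sketch of the uniqueness scheme described in the comment above. system_self_process_id() is a Cycles utility; getpid() stands in for it here as a POSIX-only assumption:

  #include <atomic>
  #include <cstdint>
  #include <string>
  #include <unistd.h>

  static std::atomic<uint64_t> g_sketch_instance_index{0};

  static std::string sketch_unique_part(const void *owner)
  {
    const uint64_t index = g_sketch_instance_index.fetch_add(1, std::memory_order_relaxed);
    /* e.g. "12345-140265318349120-0": process ID, object address, instance counter. */
    return std::to_string(getpid()) + "-" +
           std::to_string(reinterpret_cast<uintptr_t>(owner)) + "-" + std::to_string(index);
  }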
- state.num_tiles = gen_tiles(!background);
+TileManager::~TileManager()
+{
+}
+
+void TileManager::reset_scheduling(const BufferParams &params, int2 tile_size)
+{
+ VLOG(3) << "Using tile size of " << tile_size;
+
+ close_tile_output();
+
+ tile_size_ = tile_size;
+
+ tile_state_.num_tiles_x = divide_up(params.width, tile_size_.x);
+ tile_state_.num_tiles_y = divide_up(params.height, tile_size_.y);
+ tile_state_.num_tiles = tile_state_.num_tiles_x * tile_state_.num_tiles_y;
+
+ tile_state_.next_tile_index = 0;
+
+ tile_state_.current_tile = Tile();
+}
+
+void TileManager::update(const BufferParams &params, const Scene *scene)
+{
+ DCHECK_NE(params.pass_stride, -1);
+
+ buffer_params_ = params;
- state.buffer.width = image_w;
- state.buffer.height = image_h;
+ /* TODO(sergey): Proper error handling, so that if configuration has failed we don't attempt to
+ * write to a partially configured file. */
+ configure_image_spec_from_buffer(&write_state_.image_spec, buffer_params_, tile_size_);
- state.buffer.full_x = params.full_x / resolution;
- state.buffer.full_y = params.full_y / resolution;
- state.buffer.full_width = max(1, params.full_width / resolution);
- state.buffer.full_height = max(1, params.full_height / resolution);
+ const DenoiseParams denoise_params = scene->integrator->get_denoise_params();
+ node_to_image_spec_atttributes(
+ &write_state_.image_spec, &denoise_params, ATTR_DENOISE_SOCKET_PREFIX);
}
-int TileManager::get_neighbor_index(int index, int neighbor)
+bool TileManager::done()
{
- /* Neighbor indices:
- * 0 1 2
- * 3 4 5
- * 6 7 8
- */
- static const int dx[] = {-1, 0, 1, -1, 0, 1, -1, 0, 1};
- static const int dy[] = {-1, -1, -1, 0, 0, 0, 1, 1, 1};
-
- int resolution = state.resolution_divider;
- int image_w = max(1, params.width / resolution);
- int image_h = max(1, params.height / resolution);
-
- int num = min(image_h, num_devices);
- int slice_num = !background ? num : 1;
- int slice_h = image_h / slice_num;
-
- int tile_w = (tile_size.x >= image_w) ? 1 : divide_up(image_w, tile_size.x);
- int tile_h = (tile_size.y >= slice_h) ? 1 : divide_up(slice_h, tile_size.y);
-
- /* Tiles in the state tile list are always indexed from left to right, top to bottom. */
- int nx = (index % tile_w) + dx[neighbor];
- int ny = (index / tile_w) + dy[neighbor];
- if (nx < 0 || ny < 0 || nx >= tile_w || ny >= tile_h * slice_num)
- return -1;
-
- return ny * state.tile_stride + nx;
+ return tile_state_.next_tile_index == tile_state_.num_tiles;
}
-/* Checks whether all neighbors of a tile (as well as the tile itself) are at least at state
- * min_state. */
-bool TileManager::check_neighbor_state(int index, Tile::State min_state)
+bool TileManager::next()
{
- if (index < 0 || state.tiles[index].state < min_state) {
+ if (done()) {
return false;
}
- for (int neighbor = 0; neighbor < 9; neighbor++) {
- int nindex = get_neighbor_index(index, neighbor);
- /* Out-of-bounds tiles don't matter. */
- if (nindex >= 0 && state.tiles[nindex].state < min_state) {
- return false;
- }
- }
+
+ tile_state_.current_tile = get_tile_for_index(tile_state_.next_tile_index);
+
+ ++tile_state_.next_tile_index;
return true;
}
-/* Returns whether the tile should be written (and freed if no denoising is used) instead of
- * updating. */
-bool TileManager::finish_tile(const int index, const bool need_denoise, bool &delete_tile)
+Tile TileManager::get_tile_for_index(int index) const
{
- delete_tile = false;
-
- switch (state.tiles[index].state) {
- case Tile::RENDER: {
- if (!(schedule_denoising && need_denoise)) {
- state.tiles[index].state = Tile::DONE;
- delete_tile = !progressive;
- return true;
- }
- state.tiles[index].state = Tile::RENDERED;
- /* For each neighbor and the tile itself, check whether all of its neighbors have been
- * rendered. If yes, it can be denoised. */
- for (int neighbor = 0; neighbor < 9; neighbor++) {
- int nindex = get_neighbor_index(index, neighbor);
- if (check_neighbor_state(nindex, Tile::RENDERED)) {
- state.tiles[nindex].state = Tile::DENOISE;
- state.denoising_tiles[state.tiles[nindex].device].push_back(nindex);
- }
- }
- return false;
- }
- case Tile::DENOISE: {
- state.tiles[index].state = Tile::DENOISED;
- /* For each neighbor and the tile itself, check whether all of its neighbors have been
- * denoised. If yes, it can be freed. */
- for (int neighbor = 0; neighbor < 9; neighbor++) {
- int nindex = get_neighbor_index(index, neighbor);
- if (check_neighbor_state(nindex, Tile::DENOISED)) {
- state.tiles[nindex].state = Tile::DONE;
- /* Do not delete finished tiles in progressive mode. */
- if (!progressive) {
- /* It can happen that the tile just finished denoising and already can be freed here.
- * However, in that case it still has to be written before deleting, so we can't delete
- * it yet. */
- if (neighbor == 4) {
- delete_tile = true;
- }
- else {
- delete state.tiles[nindex].buffers;
- state.tiles[nindex].buffers = NULL;
- }
- }
- }
- }
- return true;
- }
- default:
- assert(false);
- return true;
+ /* TODO(sergey): Consider using a Hilbert spiral, or maybe even making this configurable. Not
+ * sure this brings a lot of value since it is only applicable to big tiles. */
+
+ const int tile_y = index / tile_state_.num_tiles_x;
+ const int tile_x = index - tile_y * tile_state_.num_tiles_x;
+
+ Tile tile;
+
+ tile.x = tile_x * tile_size_.x;
+ tile.y = tile_y * tile_size_.y;
+ tile.width = tile_size_.x;
+ tile.height = tile_size_.y;
+
+ tile.width = min(tile.width, buffer_params_.width - tile.x);
+ tile.height = min(tile.height, buffer_params_.height - tile.y);
+
+ return tile;
+}
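A worked example of the scanline layout above, using hypothetical numbers: with a 100x60 buffer and 32x32 tiles, num_tiles_x = divide_up(100, 32) = 4. Index 5 then gives tile_y = 5 / 4 = 1 and tile_x = 5 - 1 * 4 = 1, so the tile origin is (32, 32); after clamping against the buffer edges its size is min(32, 100 - 32) x min(32, 60 - 32) = 32 x 28.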
+
+const Tile &TileManager::get_current_tile() const
+{
+ return tile_state_.current_tile;
+}
+
+bool TileManager::open_tile_output()
+{
+ write_state_.filename = path_temp_get("cycles-tile-buffer-" + tile_file_unique_part_ + "-" +
+ to_string(write_state_.tile_file_index) + ".exr");
+
+ write_state_.tile_out = ImageOutput::create(write_state_.filename);
+ if (!write_state_.tile_out) {
+ LOG(ERROR) << "Error creating image output for " << write_state_.filename;
+ return false;
+ }
+
+ if (!write_state_.tile_out->supports("tiles")) {
+ LOG(ERROR) << "Progress tile file format does not support tiling.";
+ return false;
}
+
+ write_state_.tile_out->open(write_state_.filename, write_state_.image_spec);
+ write_state_.num_tiles_written = 0;
+
+ VLOG(3) << "Opened tile file " << write_state_.filename;
+
+ return true;
}
-bool TileManager::next_tile(Tile *&tile, int device, uint tile_types)
+bool TileManager::close_tile_output()
{
- /* Preserve device if requested, unless this is a separate denoising device that just wants to
- * grab any available tile. */
- const bool preserve_device = preserve_tile_device && device < num_devices;
-
- if (tile_types & RenderTile::DENOISE) {
- int tile_index = -1;
- int logical_device = preserve_device ? device : 0;
-
- while (logical_device < state.denoising_tiles.size()) {
- if (state.denoising_tiles[logical_device].empty()) {
- if (preserve_device) {
- break;
- }
- else {
- logical_device++;
- continue;
- }
- }
+ if (!write_state_.tile_out) {
+ return true;
+ }
- tile_index = state.denoising_tiles[logical_device].front();
- state.denoising_tiles[logical_device].pop_front();
- break;
- }
+ const bool success = write_state_.tile_out->close();
+ write_state_.tile_out = nullptr;
- if (tile_index >= 0) {
- tile = &state.tiles[tile_index];
- return true;
- }
+ if (!success) {
+ LOG(ERROR) << "Error closing tile file.";
+ return false;
}
- if (tile_types & RenderTile::PATH_TRACE) {
- int tile_index = -1;
- int logical_device = preserve_device ? device : 0;
-
- while (logical_device < state.render_tiles.size()) {
- if (state.render_tiles[logical_device].empty()) {
- if (preserve_device) {
- break;
- }
- else {
- logical_device++;
- continue;
- }
- }
+ VLOG(3) << "Tile output is closed.";
- tile_index = state.render_tiles[logical_device].front();
- state.render_tiles[logical_device].pop_front();
- break;
+ return true;
+}
+
+bool TileManager::write_tile(const RenderBuffers &tile_buffers)
+{
+ if (!write_state_.tile_out) {
+ if (!open_tile_output()) {
+ return false;
}
+ }
- if (tile_index >= 0) {
- tile = &state.tiles[tile_index];
- return true;
+ DCHECK_EQ(tile_buffers.params.pass_stride, buffer_params_.pass_stride);
+
+ const BufferParams &tile_params = tile_buffers.params;
+
+ vector<float> pixel_storage;
+ const float *pixels = tile_buffers.buffer.data();
+
+ /* Tiled writing expects pixels to contain data for an entire tile. Pad the render buffers with
+ * empty pixels for tiles which are on the image boundary. */
+ if (tile_params.width != tile_size_.x || tile_params.height != tile_size_.y) {
+ const int64_t pass_stride = tile_params.pass_stride;
+ const int64_t src_row_stride = tile_params.width * pass_stride;
+
+ const int64_t dst_row_stride = tile_size_.x * pass_stride;
+ pixel_storage.resize(dst_row_stride * tile_size_.y);
+
+ const float *src = tile_buffers.buffer.data();
+ float *dst = pixel_storage.data();
+ pixels = dst;
+
+ for (int y = 0; y < tile_params.height; ++y, src += src_row_stride, dst += dst_row_stride) {
+ memcpy(dst, src, src_row_stride * sizeof(float));
}
}
- return false;
-}
+ const int tile_x = tile_params.full_x - buffer_params_.full_x;
+ const int tile_y = tile_params.full_y - buffer_params_.full_y;
-bool TileManager::done()
-{
- int end_sample = (range_num_samples == -1) ? num_samples :
- range_start_sample + range_num_samples;
- return (state.resolution_divider == pixel_size) &&
- (state.sample + state.num_samples >= end_sample);
+ VLOG(3) << "Write tile at " << tile_x << ", " << tile_y;
+ if (!write_state_.tile_out->write_tile(tile_x, tile_y, 0, TypeDesc::FLOAT, pixels)) {
+ LOG(ERROR) << "Error writing tile " << write_state_.tile_out->geterror();
+ }
+
+ ++write_state_.num_tiles_written;
+
+ return true;
}
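The boundary-tile padding above can be summarized by the following standalone sketch: each source row is copied into a full-size, zero-initialized tile buffer, with strides expressed in floats to match the render buffer layout.

  #include <cstring>
  #include <vector>

  static std::vector<float> sketch_pad_tile(const float *src,
                                            int src_width, int src_height,
                                            int tile_width, int tile_height,
                                            int pass_stride)
  {
    const size_t src_row_stride = size_t(src_width) * pass_stride;
    const size_t dst_row_stride = size_t(tile_width) * pass_stride;
    std::vector<float> padded(dst_row_stride * tile_height, 0.0f);
    for (int y = 0; y < src_height; ++y) {
      std::memcpy(padded.data() + y * dst_row_stride, src + y * src_row_stride,
                  src_row_stride * sizeof(float));
    }
    return padded;
  }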
-bool TileManager::has_tiles()
+void TileManager::finish_write_tiles()
{
- foreach (Tile &tile, state.tiles) {
- if (tile.state != Tile::DONE) {
- return true;
+ if (!write_state_.tile_out) {
+ /* None of the tiles were written, hence the file was not created.
+ * Avoid creating a fully empty file since it is redundant. */
+ return;
+ }
+
+ /* EXR expects all tiles to be present in the file, so explicitly write missing tiles as all-zero. */
+ if (write_state_.num_tiles_written < tile_state_.num_tiles) {
+ vector<float> pixel_storage(tile_size_.x * tile_size_.y * buffer_params_.pass_stride);
+
+ for (int tile_index = write_state_.num_tiles_written; tile_index < tile_state_.num_tiles;
+ ++tile_index) {
+ const Tile tile = get_tile_for_index(tile_index);
+
+ VLOG(3) << "Write dummy tile at " << tile.x << ", " << tile.y;
+
+ write_state_.tile_out->write_tile(tile.x, tile.y, 0, TypeDesc::FLOAT, pixel_storage.data());
}
}
- return false;
+
+ close_tile_output();
+
+ if (full_buffer_written_cb) {
+ full_buffer_written_cb(write_state_.filename);
+ }
+
+ /* Advance the counter upon explicit finish of the file.
+ * Makes it possible to re-use the tile manager for another scene, and avoids unnecessary
+ * increments of the tile-file-within-session index. */
+ ++write_state_.tile_file_index;
+
+ write_state_.filename = "";
}
-bool TileManager::next()
+bool TileManager::read_full_buffer_from_disk(const string_view filename,
+ RenderBuffers *buffers,
+ DenoiseParams *denoise_params)
{
- if (done())
+ unique_ptr<ImageInput> in(ImageInput::open(filename));
+ if (!in) {
+ LOG(ERROR) << "Error opening tile file " << filename;
return false;
+ }
+
+ const ImageSpec &image_spec = in->spec();
- if (progressive && state.resolution_divider > pixel_size) {
- state.sample = 0;
- state.resolution_divider = max(state.resolution_divider / 2, pixel_size);
- state.num_samples = 1;
- set_tiles();
+ BufferParams buffer_params;
+ if (!buffer_params_from_image_spec_atttributes(&buffer_params, image_spec)) {
+ return false;
}
- else {
- state.sample++;
+ buffers->reset(buffer_params);
- if (progressive)
- state.num_samples = 1;
- else if (range_num_samples == -1)
- state.num_samples = num_samples;
- else
- state.num_samples = range_num_samples;
+ if (!node_from_image_spec_atttributes(denoise_params, image_spec, ATTR_DENOISE_SOCKET_PREFIX)) {
+ return false;
+ }
- state.resolution_divider = pixel_size;
+ if (!in->read_image(TypeDesc::FLOAT, buffers->buffer.data())) {
+ LOG(ERROR) << "Error reading pixels from the tile file " << in->geterror();
+ return false;
+ }
- if (state.sample == range_start_sample) {
- set_tiles();
- }
- else {
- gen_render_tiles();
- }
+ if (!in->close()) {
+ LOG(ERROR) << "Error closing tile file " << in->geterror();
+ return false;
}
return true;
}
-int TileManager::get_num_effective_samples()
-{
- return (range_num_samples == -1) ? num_samples : range_num_samples;
-}
-
CCL_NAMESPACE_END
diff --git a/intern/cycles/render/tile.h b/intern/cycles/render/tile.h
index 790a56f9445..71b9e966278 100644
--- a/intern/cycles/render/tile.h
+++ b/intern/cycles/render/tile.h
@@ -14,159 +14,151 @@
* limitations under the License.
*/
-#ifndef __TILE_H__
-#define __TILE_H__
-
-#include <limits.h>
+#pragma once
#include "render/buffers.h"
-#include "util/util_list.h"
+#include "util/util_image.h"
+#include "util/util_string.h"
+#include "util/util_unique_ptr.h"
CCL_NAMESPACE_BEGIN
-/* Tile */
+class DenoiseParams;
+class Scene;
+
+/* --------------------------------------------------------------------
+ * Tile.
+ */
class Tile {
public:
- int index;
- int x, y, w, h;
- int device;
- /* RENDER: The tile has to be rendered.
- * RENDERED: The tile has been rendered, but can't be denoised yet (waiting for neighbors).
- * DENOISE: The tile can be denoised now.
- * DENOISED: The tile has been denoised, but can't be freed yet (waiting for neighbors).
- * DONE: The tile is finished and has been freed. */
- typedef enum { RENDER = 0, RENDERED, DENOISE, DENOISED, DONE } State;
- State state;
- RenderBuffers *buffers;
+ int x = 0, y = 0;
+ int width = 0, height = 0;
Tile()
{
}
-
- Tile(int index_, int x_, int y_, int w_, int h_, int device_, State state_ = RENDER)
- : index(index_), x(x_), y(y_), w(w_), h(h_), device(device_), state(state_), buffers(NULL)
- {
- }
};
-/* Tile order */
-
-/* Note: this should match enum_tile_order in properties.py */
-enum TileOrder {
- TILE_CENTER = 0,
- TILE_RIGHT_TO_LEFT = 1,
- TILE_LEFT_TO_RIGHT = 2,
- TILE_TOP_TO_BOTTOM = 3,
- TILE_BOTTOM_TO_TOP = 4,
- TILE_HILBERT_SPIRAL = 5,
-};
-
-/* Tile Manager */
+/* --------------------------------------------------------------------
+ * Tile Manager.
+ */
class TileManager {
public:
- BufferParams params;
-
- struct State {
- vector<Tile> tiles;
- int tile_stride;
- BufferParams buffer;
- int sample;
- int num_samples;
- int resolution_divider;
- int num_tiles;
-
- /* Total samples over all pixels: Generally num_samples*num_pixels,
- * but can be higher due to the initial resolution division for previews. */
- uint64_t total_pixel_samples;
-
- /* These lists contain the indices of the tiles to be rendered/denoised and are used
- * when acquiring a new tile for the device.
- * Each list in each vector is for one logical device. */
- vector<list<int>> render_tiles;
- vector<list<int>> denoising_tiles;
- } state;
-
- int num_samples;
- int slice_overlap;
-
- TileManager(bool progressive,
- int num_samples,
- int2 tile_size,
- int start_resolution,
- bool preserve_tile_device,
- bool background,
- TileOrder tile_order,
- int num_devices = 1,
- int pixel_size = 1);
+ /* This callback is invoked whenever the on-disk tile storage file is closed after writing. */
+ function<void(string_view)> full_buffer_written_cb;
+
+ TileManager();
~TileManager();
- void device_free();
- void reset(BufferParams &params, int num_samples);
- void set_samples(int num_samples);
+ TileManager(const TileManager &other) = delete;
+ TileManager(TileManager &&other) noexcept = delete;
+ TileManager &operator=(const TileManager &other) = delete;
+ TileManager &operator=(TileManager &&other) = delete;
+
+ /* Reset the current progress and start a new render of the full frame in tiles of the given
+ * size.
+ * Only touches the scheduling-related state of the tile manager. */
+ /* TODO(sergey): Consider using the tile area instead of the exact size to help deal with
+ * extreme cases of stretched renders. */
+ void reset_scheduling(const BufferParams &params, int2 tile_size);
+
+ /* Update for the known buffer passes and scene parameters.
+ * Will store all parameters needed for buffer access outside of the scene graph. */
+ void update(const BufferParams &params, const Scene *scene);
+
+ inline int get_num_tiles() const
+ {
+ return tile_state_.num_tiles;
+ }
+
+ inline bool has_multiple_tiles() const
+ {
+ return tile_state_.num_tiles > 1;
+ }
+
bool next();
- bool next_tile(Tile *&tile, int device, uint tile_types);
- bool finish_tile(const int index, const bool need_denoise, bool &delete_tile);
bool done();
- bool has_tiles();
- void set_tile_order(TileOrder tile_order_)
+ const Tile &get_current_tile() const;
+
+ /* Write the render buffer of a tile to a file on disk.
+ *
+ * Opens the file for writing when the first tile is written.
+ *
+ * Returns true on success. */
+ bool write_tile(const RenderBuffers &tile_buffers);
+
+ /* Inform the tile manager that no more tiles will be written to disk.
+ * The file will be considered final, and all handles to it will be closed. */
+ void finish_write_tiles();
+
+ /* Check whether any tile has been written to disk. */
+ inline bool has_written_tiles() const
{
- tile_order = tile_order_;
+ return write_state_.num_tiles_written != 0;
}
- int get_neighbor_index(int index, int neighbor);
- bool check_neighbor_state(int index, Tile::State state);
+ /* Read full frame render buffer from tiles file on disk.
+ *
+ * Returns true on success. */
+ bool read_full_buffer_from_disk(string_view filename,
+ RenderBuffers *buffers,
+ DenoiseParams *denoise_params);
- /* ** Sample range rendering. ** */
+ protected:
+ /* Get the tile configuration for the given index.
+ * The tile index must be within [0, tile_state_.num_tiles). */
+ Tile get_tile_for_index(int index) const;
- /* Start sample in the range. */
- int range_start_sample;
+ bool open_tile_output();
+ bool close_tile_output();
- /* Number to samples in the rendering range. */
- int range_num_samples;
+ /* Part of an on-disk tile file name which avoids conflicts between several Cycles instances or
+ * several sessions. */
+ string tile_file_unique_part_;
- /* Get number of actual samples to render. */
- int get_num_effective_samples();
+ int2 tile_size_ = make_int2(0, 0);
- /* Schedule tiles for denoising after they've been rendered. */
- bool schedule_denoising;
+ BufferParams buffer_params_;
- protected:
- void set_tiles();
-
- bool progressive;
- int2 tile_size;
- TileOrder tile_order;
- int start_resolution;
- int pixel_size;
- int num_devices;
-
- /* in some cases it is important that the same tile will be returned for the same
- * device it was originally generated for (i.e. viewport rendering when buffer is
- * allocating once for tile and then always used by it)
- *
- * in other cases any tile could be handled by any device (i.e. final rendering
- * without progressive refine)
- */
- bool preserve_tile_device;
-
- /* for background render tiles should exactly match render parts generated from
- * blender side, which means image first gets split into tiles and then tiles are
- * assigning to render devices
- *
- * however viewport rendering expects tiles to be allocated in a special way,
- * meaning image is being sliced horizontally first and every device handles
- * its own slice
- */
- bool background;
-
- /* Generate tile list, return number of tiles. */
- int gen_tiles(bool sliced);
- void gen_render_tiles();
+ /* Tile scheduling state. */
+ struct {
+ int num_tiles_x = 0;
+ int num_tiles_y = 0;
+ int num_tiles = 0;
+
+ int next_tile_index;
+
+ Tile current_tile;
+ } tile_state_;
+
+ /* State of tiles writing to a file on disk. */
+ struct {
+ /* Index of the tile file used during the current session.
+ * This number is used for file name construction, making it possible to render several
+ * scenes throughout the duration of the session and keep all results available for later
+ * read access. */
+ int tile_file_index = 0;
+
+ string filename;
+
+ /* Specification of the tile image which corresponds to the buffer parameters.
+ * Contains channels configured according to the pass configuration of the path tracer.
+ *
+ * Output images are saved using this specification, and input images are expected to have a
+ * matching specification. */
+ ImageSpec image_spec;
+
+ /* Output handle for the tile file.
+ *
+ * This file can not be closed until all tiles have been provided, so the handle is stored in
+ * the state and is created whenever writing is requested. */
+ unique_ptr<ImageOutput> tile_out;
+
+ int num_tiles_written = 0;
+ } write_state_;
};
CCL_NAMESPACE_END
-
-#endif /* __TILE_H__ */
diff --git a/intern/cycles/test/CMakeLists.txt b/intern/cycles/test/CMakeLists.txt
index 65a692acd03..0f6b435813f 100644
--- a/intern/cycles/test/CMakeLists.txt
+++ b/intern/cycles/test/CMakeLists.txt
@@ -32,6 +32,7 @@ set(INC
set(ALL_CYCLES_LIBRARIES
cycles_device
cycles_kernel
+ cycles_integrator
cycles_render
cycles_bvh
cycles_graph
@@ -45,8 +46,12 @@ include_directories(${INC})
cycles_link_directories()
set(SRC
+ integrator_adaptive_sampling_test.cpp
+ integrator_render_scheduler_test.cpp
+ integrator_tile_test.cpp
render_graph_finalize_test.cpp
util_aligned_malloc_test.cpp
+ util_math_test.cpp
util_path_test.cpp
util_string_test.cpp
util_task_test.cpp
diff --git a/intern/cycles/test/integrator_adaptive_sampling_test.cpp b/intern/cycles/test/integrator_adaptive_sampling_test.cpp
new file mode 100644
index 00000000000..3ed6a23125d
--- /dev/null
+++ b/intern/cycles/test/integrator_adaptive_sampling_test.cpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "testing/testing.h"
+
+#include "integrator/adaptive_sampling.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+TEST(AdaptiveSampling, schedule_samples)
+{
+ AdaptiveSampling adaptive_sampling;
+ adaptive_sampling.use = true;
+ adaptive_sampling.min_samples = 0;
+ adaptive_sampling.adaptive_step = 4;
+
+ for (int sample = 2; sample < 32; ++sample) {
+ for (int num_samples = 8; num_samples < 32; ++num_samples) {
+ const int num_samples_aligned = adaptive_sampling.align_samples(sample, num_samples);
+ /* NOTE: `sample + num_samples_aligned` is the number of samples after rendering, so we need
+ * to convert this to the 0-based index of the last sample. */
+ EXPECT_TRUE(adaptive_sampling.need_filter(sample + num_samples_aligned - 1));
+ }
+ }
+}
+
+TEST(AdaptiveSampling, align_samples)
+{
+ AdaptiveSampling adaptive_sampling;
+ adaptive_sampling.use = true;
+ adaptive_sampling.min_samples = 11 /* rounded sqrt(128) */;
+ adaptive_sampling.adaptive_step = 4;
+
+ /* Filtering will happen at the following samples:
+ * 15, 19, 23, 27, 31, 35, 39, 43 */
+
+ /* Requested sample and number of samples will result in number of samples lower than
+ * `min_samples`. */
+ EXPECT_EQ(adaptive_sampling.align_samples(0, 4), 4);
+ EXPECT_EQ(adaptive_sampling.align_samples(0, 7), 7);
+
+ /* Request a number of samples higher than the minimum number of samples before filtering, but
+ * prior to the first sample at which filtering will happen. */
+ EXPECT_EQ(adaptive_sampling.align_samples(0, 15), 15);
+
+ /* When rendering many samples from the very beginning, limit the number of samples by the
+ * first sample at which filtering is to happen. */
+ EXPECT_EQ(adaptive_sampling.align_samples(0, 16), 16);
+ EXPECT_EQ(adaptive_sampling.align_samples(0, 17), 16);
+ EXPECT_EQ(adaptive_sampling.align_samples(0, 20), 16);
+ EXPECT_EQ(adaptive_sampling.align_samples(0, 60), 16);
+
+ /* Similar to above, but start sample is not 0. */
+ EXPECT_EQ(adaptive_sampling.align_samples(9, 8), 7);
+ EXPECT_EQ(adaptive_sampling.align_samples(9, 20), 7);
+ EXPECT_EQ(adaptive_sampling.align_samples(9, 60), 7);
+
+ /* Start sample is past the minimum required samples, but prior to the first filter sample. */
+ EXPECT_EQ(adaptive_sampling.align_samples(12, 6), 4);
+ EXPECT_EQ(adaptive_sampling.align_samples(12, 20), 4);
+ EXPECT_EQ(adaptive_sampling.align_samples(12, 60), 4);
+
+ /* Start sample is the sample which is to be filtered. */
+ EXPECT_EQ(adaptive_sampling.align_samples(15, 4), 1);
+ EXPECT_EQ(adaptive_sampling.align_samples(15, 6), 1);
+ EXPECT_EQ(adaptive_sampling.align_samples(15, 10), 1);
+ EXPECT_EQ(adaptive_sampling.align_samples(58, 2), 2);
+
+ /* Start sample is past the sample which is to be filtered. */
+ EXPECT_EQ(adaptive_sampling.align_samples(16, 3), 3);
+ EXPECT_EQ(adaptive_sampling.align_samples(16, 4), 4);
+ EXPECT_EQ(adaptive_sampling.align_samples(16, 5), 4);
+ EXPECT_EQ(adaptive_sampling.align_samples(16, 10), 4);
+
+ /* Should never exceed requested number of samples. */
+ EXPECT_EQ(adaptive_sampling.align_samples(15, 2), 1);
+ EXPECT_EQ(adaptive_sampling.align_samples(16, 2), 2);
+ EXPECT_EQ(adaptive_sampling.align_samples(17, 2), 2);
+ EXPECT_EQ(adaptive_sampling.align_samples(18, 2), 2);
+}
+
+TEST(AdaptiveSampling, need_filter)
+{
+ AdaptiveSampling adaptive_sampling;
+ adaptive_sampling.use = true;
+ adaptive_sampling.min_samples = 11 /* rounded sqrt(128) */;
+ adaptive_sampling.adaptive_step = 4;
+
+ const vector<int> expected_samples_to_filter = {
+ {15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59}};
+
+ vector<int> actual_samples_to_filter;
+ for (int sample = 0; sample < 60; ++sample) {
+ if (adaptive_sampling.need_filter(sample)) {
+ actual_samples_to_filter.push_back(sample);
+ }
+ }
+
+ EXPECT_EQ(actual_samples_to_filter, expected_samples_to_filter);
+}
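The expected sequence above (15, 19, 23, ...) is consistent with a predicate of the following form. This is only an illustration inferred from the test data, not necessarily the actual AdaptiveSampling implementation:

  /* Illustrative only: filter once every adaptive_step samples, but never at or below
   * min_samples. With min_samples = 11 and adaptive_step = 4 this yields 15, 19, 23, 27, ... */
  static bool sketch_need_filter(int sample, int min_samples, int adaptive_step)
  {
    return sample > min_samples && (sample + 1) % adaptive_step == 0;
  }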
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/test/integrator_render_scheduler_test.cpp b/intern/cycles/test/integrator_render_scheduler_test.cpp
new file mode 100644
index 00000000000..b4efbc2d1a7
--- /dev/null
+++ b/intern/cycles/test/integrator_render_scheduler_test.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "testing/testing.h"
+
+#include "integrator/render_scheduler.h"
+
+CCL_NAMESPACE_BEGIN
+
+TEST(IntegratorRenderScheduler, calculate_resolution_divider_for_resolution)
+{
+ EXPECT_EQ(calculate_resolution_divider_for_resolution(1920, 1080, 1920), 1);
+ EXPECT_EQ(calculate_resolution_divider_for_resolution(1920, 1080, 960), 2);
+ EXPECT_EQ(calculate_resolution_divider_for_resolution(1920, 1080, 480), 4);
+}
+
+TEST(IntegratorRenderScheduler, calculate_resolution_for_divider)
+{
+ EXPECT_EQ(calculate_resolution_for_divider(1920, 1080, 1), 1440);
+ EXPECT_EQ(calculate_resolution_for_divider(1920, 1080, 2), 720);
+ EXPECT_EQ(calculate_resolution_for_divider(1920, 1080, 4), 360);
+}
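The expected values above are consistent with treating sqrt(width * height) as the reference resolution: for 1920x1080, sqrt(1920 * 1080) = 1440, and dividing by 1, 2 and 4 gives 1440, 720 and 360. Correspondingly, the smallest divider whose result does not exceed the requested resolution is 1 for 1920, 2 for 960 and 4 for 480. This is an observation about the test data only, not a statement about the scheduler's implementation.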
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/test/integrator_tile_test.cpp b/intern/cycles/test/integrator_tile_test.cpp
new file mode 100644
index 00000000000..5bb57b48c3c
--- /dev/null
+++ b/intern/cycles/test/integrator_tile_test.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "testing/testing.h"
+
+#include "integrator/tile.h"
+#include "util/util_math.h"
+
+CCL_NAMESPACE_BEGIN
+
+TEST(tile_calculate_best_size, Basic)
+{
+ /* Make sure the CPU-like case is handled properly. */
+ EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 1, 1), TileSize(1, 1, 1));
+ EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 100, 1), TileSize(1, 1, 1));
+
+ /* Enough path states to fit an entire image with all samples. */
+ EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 1, 1920 * 1080),
+ TileSize(1920, 1080, 1));
+ EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 100, 1920 * 1080 * 100),
+ TileSize(1920, 1080, 100));
+}
+
+TEST(tile_calculate_best_size, Extreme)
+{
+ EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 262144, 131072), TileSize(1, 1, 512));
+ EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 1048576, 131072), TileSize(1, 1, 1024));
+ EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 10485760, 131072), TileSize(1, 1, 4096));
+
+ EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 8192 * 8192 * 2, 1024),
+ TileSize(1, 1, 1024));
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/test/render_graph_finalize_test.cpp b/intern/cycles/test/render_graph_finalize_test.cpp
index da9b29314a7..19c211fe5f7 100644
--- a/intern/cycles/test/render_graph_finalize_test.cpp
+++ b/intern/cycles/test/render_graph_finalize_test.cpp
@@ -181,7 +181,7 @@ class RenderGraph : public testing::Test {
util_logging_start();
util_logging_verbosity_set(1);
- device_cpu = Device::create(device_info, stats, profiler, true);
+ device_cpu = Device::create(device_info, stats, profiler);
scene = new Scene(scene_params, device_cpu);
}
diff --git a/intern/cycles/test/util_math_test.cpp b/intern/cycles/test/util_math_test.cpp
new file mode 100644
index 00000000000..b6ce3ef0cf3
--- /dev/null
+++ b/intern/cycles/test/util_math_test.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "testing/testing.h"
+
+#include "util/util_math.h"
+
+CCL_NAMESPACE_BEGIN
+
+TEST(math, next_power_of_two)
+{
+ EXPECT_EQ(next_power_of_two(0), 1);
+ EXPECT_EQ(next_power_of_two(1), 2);
+ EXPECT_EQ(next_power_of_two(2), 4);
+ EXPECT_EQ(next_power_of_two(3), 4);
+ EXPECT_EQ(next_power_of_two(4), 8);
+}
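According to the expectations above, next_power_of_two() returns the smallest power of two strictly greater than its argument. A minimal sketch consistent with these tests follows; the real implementation lives in util_math.h and may differ:

  static inline unsigned int sketch_next_power_of_two(unsigned int x)
  {
    unsigned int result = 1;
    while (result <= x) {
      result <<= 1; /* Strictly greater than x on exit. */
    }
    return result;
  }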
+
+TEST(math, prev_power_of_two)
+{
+ EXPECT_EQ(prev_power_of_two(0), 0);
+
+ EXPECT_EQ(prev_power_of_two(1), 1);
+ EXPECT_EQ(prev_power_of_two(2), 1);
+
+ EXPECT_EQ(prev_power_of_two(3), 2);
+ EXPECT_EQ(prev_power_of_two(4), 2);
+
+ EXPECT_EQ(prev_power_of_two(5), 4);
+ EXPECT_EQ(prev_power_of_two(6), 4);
+ EXPECT_EQ(prev_power_of_two(7), 4);
+ EXPECT_EQ(prev_power_of_two(8), 4);
+}
+
+TEST(math, reverse_integer_bits)
+{
+ EXPECT_EQ(reverse_integer_bits(0xFFFFFFFF), 0xFFFFFFFF);
+ EXPECT_EQ(reverse_integer_bits(0x00000000), 0x00000000);
+ EXPECT_EQ(reverse_integer_bits(0x1), 0x80000000);
+ EXPECT_EQ(reverse_integer_bits(0x80000000), 0x1);
+ EXPECT_EQ(reverse_integer_bits(0xFFFF0000), 0x0000FFFF);
+ EXPECT_EQ(reverse_integer_bits(0x0000FFFF), 0xFFFF0000);
+ EXPECT_EQ(reverse_integer_bits(0x00FF0000), 0x0000FF00);
+ EXPECT_EQ(reverse_integer_bits(0x0000FF00), 0x00FF0000);
+ EXPECT_EQ(reverse_integer_bits(0xAAAAAAAA), 0x55555555);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/test/util_string_test.cpp b/intern/cycles/test/util_string_test.cpp
index 97f8daa65de..c9022d1b132 100644
--- a/intern/cycles/test/util_string_test.cpp
+++ b/intern/cycles/test/util_string_test.cpp
@@ -281,4 +281,40 @@ TEST(util_string_remove_trademark, r_space_middle)
EXPECT_EQ(str, "foo bar baz");
}
+/* ******** Tests for string_startswith() ******** */
+
+TEST(string_startswith, basic)
+{
+ EXPECT_TRUE(string_startswith("", ""));
+
+ EXPECT_FALSE(string_startswith("", "World"));
+ EXPECT_TRUE(string_startswith("Hello", ""));
+
+ EXPECT_FALSE(string_startswith("Hello", "World"));
+
+ EXPECT_TRUE(string_startswith("Hello", "Hello"));
+ EXPECT_TRUE(string_startswith("Hello", "He"));
+ EXPECT_TRUE(string_startswith("Hello", "H"));
+
+ EXPECT_FALSE(string_startswith("Hello", "e"));
+ EXPECT_FALSE(string_startswith("Hello", "HelloWorld"));
+}
+
+TEST(string_endswith, basic)
+{
+ EXPECT_TRUE(string_endswith("", ""));
+
+ EXPECT_FALSE(string_endswith("", "World"));
+ EXPECT_TRUE(string_endswith("Hello", ""));
+
+ EXPECT_FALSE(string_endswith("Hello", "World"));
+
+ EXPECT_TRUE(string_endswith("Hello", "Hello"));
+ EXPECT_TRUE(string_endswith("Hello", "lo"));
+ EXPECT_TRUE(string_endswith("Hello", "o"));
+
+ EXPECT_FALSE(string_endswith("Hello", "e"));
+ EXPECT_FALSE(string_endswith("Hello", "WorldHello"));
+}
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h
index 13d177d2b25..de17efafcf2 100644
--- a/intern/cycles/util/util_atomic.h
+++ b/intern/cycles/util/util_atomic.h
@@ -34,56 +34,6 @@
#else /* __KERNEL_GPU__ */
-# ifdef __KERNEL_OPENCL__
-
-/* Float atomics implementation credits:
- * http://suhorukov.blogspot.in/2011/12/opencl-11-atomic-operations-on-floating.html
- */
-ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *source,
- const float operand)
-{
- union {
- unsigned int int_value;
- float float_value;
- } new_value;
- union {
- unsigned int int_value;
- float float_value;
- } prev_value;
- do {
- prev_value.float_value = *source;
- new_value.float_value = prev_value.float_value + operand;
- } while (atomic_cmpxchg((volatile ccl_global unsigned int *)source,
- prev_value.int_value,
- new_value.int_value) != prev_value.int_value);
- return new_value.float_value;
-}
-
-ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float *dest,
- const float old_val,
- const float new_val)
-{
- union {
- unsigned int int_value;
- float float_value;
- } new_value, prev_value, result;
- prev_value.float_value = old_val;
- new_value.float_value = new_val;
- result.int_value = atomic_cmpxchg(
- (volatile ccl_global unsigned int *)dest, prev_value.int_value, new_value.int_value);
- return result.float_value;
-}
-
-# define atomic_fetch_and_add_uint32(p, x) atomic_add((p), (x))
-# define atomic_fetch_and_inc_uint32(p) atomic_inc((p))
-# define atomic_fetch_and_dec_uint32(p) atomic_dec((p))
-# define atomic_fetch_and_or_uint32(p, x) atomic_or((p), (x))
-
-# define CCL_LOCAL_MEM_FENCE CLK_LOCAL_MEM_FENCE
-# define ccl_barrier(flags) barrier(flags)
-
-# endif /* __KERNEL_OPENCL__ */
-
# ifdef __KERNEL_CUDA__
# define atomic_add_and_fetch_float(p, x) (atomicAdd((float *)(p), (float)(x)) + (float)(x))
diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp
index 74ecefa1917..1d598725c84 100644
--- a/intern/cycles/util/util_debug.cpp
+++ b/intern/cycles/util/util_debug.cpp
@@ -26,13 +26,7 @@
CCL_NAMESPACE_BEGIN
DebugFlags::CPU::CPU()
- : avx2(true),
- avx(true),
- sse41(true),
- sse3(true),
- sse2(true),
- bvh_layout(BVH_LAYOUT_AUTO),
- split_kernel(false)
+ : avx2(true), avx(true), sse41(true), sse3(true), sse2(true), bvh_layout(BVH_LAYOUT_AUTO)
{
reset();
}
@@ -58,11 +52,9 @@ void DebugFlags::CPU::reset()
#undef CHECK_CPU_FLAGS
bvh_layout = BVH_LAYOUT_AUTO;
-
- split_kernel = false;
}
-DebugFlags::CUDA::CUDA() : adaptive_compile(false), split_kernel(false)
+DebugFlags::CUDA::CUDA() : adaptive_compile(false)
{
reset();
}
@@ -71,8 +63,6 @@ void DebugFlags::CUDA::reset()
{
if (getenv("CYCLES_CUDA_ADAPTIVE_COMPILE") != NULL)
adaptive_compile = true;
-
- split_kernel = false;
}
DebugFlags::OptiX::OptiX()
@@ -82,42 +72,7 @@ DebugFlags::OptiX::OptiX()
void DebugFlags::OptiX::reset()
{
- cuda_streams = 1;
- curves_api = false;
-}
-
-DebugFlags::OpenCL::OpenCL() : device_type(DebugFlags::OpenCL::DEVICE_ALL), debug(false)
-{
- reset();
-}
-
-void DebugFlags::OpenCL::reset()
-{
- /* Initialize device type from environment variables. */
- device_type = DebugFlags::OpenCL::DEVICE_ALL;
- char *device = getenv("CYCLES_OPENCL_TEST");
- if (device) {
- if (strcmp(device, "NONE") == 0) {
- device_type = DebugFlags::OpenCL::DEVICE_NONE;
- }
- else if (strcmp(device, "ALL") == 0) {
- device_type = DebugFlags::OpenCL::DEVICE_ALL;
- }
- else if (strcmp(device, "DEFAULT") == 0) {
- device_type = DebugFlags::OpenCL::DEVICE_DEFAULT;
- }
- else if (strcmp(device, "CPU") == 0) {
- device_type = DebugFlags::OpenCL::DEVICE_CPU;
- }
- else if (strcmp(device, "GPU") == 0) {
- device_type = DebugFlags::OpenCL::DEVICE_GPU;
- }
- else if (strcmp(device, "ACCELERATOR") == 0) {
- device_type = DebugFlags::OpenCL::DEVICE_ACCELERATOR;
- }
- }
- /* Initialize other flags from environment variables. */
- debug = (getenv("CYCLES_OPENCL_DEBUG") != NULL);
+ use_debug = false;
}
DebugFlags::DebugFlags() : viewport_static_bvh(false), running_inside_blender(false)
@@ -131,7 +86,6 @@ void DebugFlags::reset()
cpu.reset();
cuda.reset();
optix.reset();
- opencl.reset();
}
std::ostream &operator<<(std::ostream &os, DebugFlagsConstRef debug_flags)
@@ -142,40 +96,13 @@ std::ostream &operator<<(std::ostream &os, DebugFlagsConstRef debug_flags)
<< " SSE4.1 : " << string_from_bool(debug_flags.cpu.sse41) << "\n"
<< " SSE3 : " << string_from_bool(debug_flags.cpu.sse3) << "\n"
<< " SSE2 : " << string_from_bool(debug_flags.cpu.sse2) << "\n"
- << " BVH layout : " << bvh_layout_name(debug_flags.cpu.bvh_layout) << "\n"
- << " Split : " << string_from_bool(debug_flags.cpu.split_kernel) << "\n";
+ << " BVH layout : " << bvh_layout_name(debug_flags.cpu.bvh_layout) << "\n";
os << "CUDA flags:\n"
<< " Adaptive Compile : " << string_from_bool(debug_flags.cuda.adaptive_compile) << "\n";
os << "OptiX flags:\n"
- << " CUDA streams : " << debug_flags.optix.cuda_streams << "\n";
-
- const char *opencl_device_type;
- switch (debug_flags.opencl.device_type) {
- case DebugFlags::OpenCL::DEVICE_NONE:
- opencl_device_type = "NONE";
- break;
- case DebugFlags::OpenCL::DEVICE_ALL:
- opencl_device_type = "ALL";
- break;
- case DebugFlags::OpenCL::DEVICE_DEFAULT:
- opencl_device_type = "DEFAULT";
- break;
- case DebugFlags::OpenCL::DEVICE_CPU:
- opencl_device_type = "CPU";
- break;
- case DebugFlags::OpenCL::DEVICE_GPU:
- opencl_device_type = "GPU";
- break;
- case DebugFlags::OpenCL::DEVICE_ACCELERATOR:
- opencl_device_type = "ACCELERATOR";
- break;
- }
- os << "OpenCL flags:\n"
- << " Device type : " << opencl_device_type << "\n"
- << " Debug : " << string_from_bool(debug_flags.opencl.debug) << "\n"
- << " Memory limit : " << string_human_readable_size(debug_flags.opencl.mem_limit) << "\n";
+ << " Debug : " << string_from_bool(debug_flags.optix.use_debug) << "\n";
return os;
}
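
For reference, the surviving CUDA flag keeps the same environment-variable-driven reset shown above. A minimal standalone sketch of that pattern, assuming only the standard library (this is not the actual DebugFlags class):

#include <cstdio>
#include <cstdlib>

/* Stand-in for DebugFlags::CUDA: the flag is re-read from the environment on
 * every reset(), as the code above does with getenv(). */
struct CudaDebugFlags {
  bool adaptive_compile = false;

  void reset()
  {
    /* std::getenv() returns NULL when the variable is not set. */
    adaptive_compile = (std::getenv("CYCLES_CUDA_ADAPTIVE_COMPILE") != nullptr);
  }
};

int main()
{
  CudaDebugFlags cuda;
  cuda.reset();
  std::printf("adaptive_compile: %s\n", cuda.adaptive_compile ? "true" : "false");
  return 0;
}
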
diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h
index f7e53f90f74..99e2723180c 100644
--- a/intern/cycles/util/util_debug.h
+++ b/intern/cycles/util/util_debug.h
@@ -79,9 +79,6 @@ class DebugFlags {
* CPUs and GPUs can be selected here instead.
*/
BVHLayout bvh_layout;
-
- /* Whether split kernel is used */
- bool split_kernel;
};
/* Descriptor of CUDA feature-set to be used. */
@@ -94,9 +91,6 @@ class DebugFlags {
/* Whether adaptive feature based runtime compile is enabled or not.
* Requires the CUDA Toolkit and only works on Linux atm. */
bool adaptive_compile;
-
- /* Whether split kernel is used */
- bool split_kernel;
};
/* Descriptor of OptiX feature-set to be used. */
@@ -106,61 +100,9 @@ class DebugFlags {
/* Reset flags to their defaults. */
void reset();
- /* Number of CUDA streams to launch kernels concurrently from. */
- int cuda_streams;
-
- /* Use OptiX curves API for hair instead of custom implementation. */
- bool curves_api;
- };
-
- /* Descriptor of OpenCL feature-set to be used. */
- struct OpenCL {
- OpenCL();
-
- /* Reset flags to their defaults. */
- void reset();
-
- /* Available device types.
- * Only gives a hint which devices to let user to choose from, does not
- * try to use any sort of optimal device or so.
- */
- enum DeviceType {
- /* None of OpenCL devices will be used. */
- DEVICE_NONE,
- /* All OpenCL devices will be used. */
- DEVICE_ALL,
- /* Default system OpenCL device will be used. */
- DEVICE_DEFAULT,
- /* Host processor will be used. */
- DEVICE_CPU,
- /* GPU devices will be used. */
- DEVICE_GPU,
- /* Dedicated OpenCL accelerator device will be used. */
- DEVICE_ACCELERATOR,
- };
-
- /* Available kernel types. */
- enum KernelType {
- /* Do automated guess which kernel to use, based on the officially
- * supported GPUs and such.
- */
- KERNEL_DEFAULT,
- /* Force mega kernel to be used. */
- KERNEL_MEGA,
- /* Force split kernel to be used. */
- KERNEL_SPLIT,
- };
-
- /* Requested device type. */
- DeviceType device_type;
-
- /* Use debug version of the kernel. */
- bool debug;
-
- /* TODO(mai): Currently this is only for OpenCL, but we should have it implemented for all
- * devices. */
- /* Artificial memory limit in bytes (0 if disabled). */
- size_t mem_limit;
+ /* Load OptiX module with debug capabilities. Will lower logging verbosity level, enable
+ * validations, and lower optimization level. */
+ bool use_debug;
};
/* Get instance of debug flags registry. */
@@ -182,9 +124,6 @@ class DebugFlags {
/* Requested OptiX flags. */
OptiX optix;
- /* Requested OpenCL flags. */
- OpenCL opencl;
-
private:
DebugFlags();
diff --git a/intern/cycles/util/util_defines.h b/intern/cycles/util/util_defines.h
index 0a239a944a5..9b1698d461a 100644
--- a/intern/cycles/util/util_defines.h
+++ b/intern/cycles/util/util_defines.h
@@ -43,9 +43,9 @@
# define ccl_local_param
# define ccl_private
# define ccl_restrict __restrict
-# define ccl_ref &
# define ccl_optional_struct_init
# define ccl_loop_no_unroll
+# define ccl_attr_maybe_unused [[maybe_unused]]
# define __KERNEL_WITH_SSE_ALIGN__
# if defined(_WIN32) && !defined(FREE_WINDOWS)
@@ -62,7 +62,6 @@
# define ccl_may_alias
# define ccl_always_inline __forceinline
# define ccl_never_inline __declspec(noinline)
-# define ccl_maybe_unused
# else /* _WIN32 && !FREE_WINDOWS */
# define ccl_device_inline static inline __attribute__((always_inline))
# define ccl_device_forceinline static inline __attribute__((always_inline))
@@ -74,7 +73,6 @@
# define ccl_may_alias __attribute__((__may_alias__))
# define ccl_always_inline __attribute__((always_inline))
# define ccl_never_inline __attribute__((noinline))
-# define ccl_maybe_unused __attribute__((used))
# endif /* _WIN32 && !FREE_WINDOWS */
/* Use to suppress '-Wimplicit-fallthrough' (in place of 'break'). */
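
The new ccl_attr_maybe_unused macro maps directly to the C++17 [[maybe_unused]] attribute. A small self-contained sketch of how it is applied to otherwise-unused globals and locals (requires C++17; the TrueTy struct mirrors the util_simd.h usage further below):

#include <cstdio>

/* C++17 stand-in for the old compiler-specific ccl_maybe_unused macro. */
#define ccl_attr_maybe_unused [[maybe_unused]]

/* Same shape as the util_simd.h globals updated later in this patch: the
 * attribute silences unused-variable warnings in configurations that never
 * reference the object. */
static struct TrueTy {
  operator bool() const
  {
    return true;
  }
} True ccl_attr_maybe_unused;

int main()
{
  ccl_attr_maybe_unused int only_used_in_some_builds = 42;
  std::printf("%d\n", True ? 1 : 0);
  return 0;
}
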
diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h
index a8d4ee75e20..d9edfec5da3 100644
--- a/intern/cycles/util/util_half.h
+++ b/intern/cycles/util/util_half.h
@@ -28,14 +28,8 @@ CCL_NAMESPACE_BEGIN
/* Half Floats */
-#ifdef __KERNEL_OPENCL__
-
-# define float4_store_half(h, f, scale) vstore_half4(f *(scale), 0, h);
-
-#else
-
/* CUDA has its own half data type, no need to define then */
-# ifndef __KERNEL_CUDA__
+#ifndef __KERNEL_CUDA__
/* Implementing this as a class rather than a typedef so that the compiler can tell it apart from
* unsigned shorts. */
class half {
@@ -59,27 +53,27 @@ class half {
private:
unsigned short v;
};
-# endif
+#endif
struct half4 {
half x, y, z, w;
};
-# ifdef __KERNEL_CUDA__
+#ifdef __KERNEL_CUDA__
-ccl_device_inline void float4_store_half(half *h, float4 f, float scale)
+ccl_device_inline void float4_store_half(half *h, float4 f)
{
- h[0] = __float2half(f.x * scale);
- h[1] = __float2half(f.y * scale);
- h[2] = __float2half(f.z * scale);
- h[3] = __float2half(f.w * scale);
+ h[0] = __float2half(f.x);
+ h[1] = __float2half(f.y);
+ h[2] = __float2half(f.z);
+ h[3] = __float2half(f.w);
}
-# else
+#else
-ccl_device_inline void float4_store_half(half *h, float4 f, float scale)
+ccl_device_inline void float4_store_half(half *h, float4 f)
{
-# ifndef __KERNEL_SSE2__
+# ifndef __KERNEL_SSE2__
for (int i = 0; i < 4; i++) {
/* optimized float to half for pixels:
* assumes no negative, no nan, no inf, and sets denormal to 0 */
@@ -87,8 +81,7 @@ ccl_device_inline void float4_store_half(half *h, float4 f, float scale)
uint i;
float f;
} in;
- float fscale = f[i] * scale;
- in.f = (fscale > 0.0f) ? ((fscale < 65504.0f) ? fscale : 65504.0f) : 0.0f;
+ in.f = (f[i] > 0.0f) ? ((f[i] < 65504.0f) ? f[i] : 65504.0f) : 0.0f;
int x = in.i;
int absolute = x & 0x7FFFFFFF;
@@ -98,23 +91,22 @@ ccl_device_inline void float4_store_half(half *h, float4 f, float scale)
h[i] = (rshift & 0x7FFF);
}
-# else
+# else
/* same as above with SSE */
- ssef fscale = load4f(f) * scale;
- ssef x = min(max(fscale, 0.0f), 65504.0f);
+ ssef x = min(max(load4f(f), 0.0f), 65504.0f);
-# ifdef __KERNEL_AVX2__
+# ifdef __KERNEL_AVX2__
ssei rpack = _mm_cvtps_ph(x, 0);
-# else
+# else
ssei absolute = cast(x) & 0x7FFFFFFF;
ssei Z = absolute + 0xC8000000;
ssei result = andnot(absolute < 0x38800000, Z);
ssei rshift = (result >> 13) & 0x7FFF;
ssei rpack = _mm_packs_epi32(rshift, rshift);
-# endif
+# endif
_mm_storel_pi((__m64 *)h, _mm_castsi128_ps(rpack));
-# endif
+# endif
}
ccl_device_inline float half_to_float(half h)
@@ -160,8 +152,6 @@ ccl_device_inline half float_to_half(float f)
return (value_bits | sign_bit);
}
-# endif
-
#endif
CCL_NAMESPACE_END
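
The scalar fallback in float4_store_half() now clamps the raw value rather than a pre-scaled one. A standalone sketch of that float-to-half conversion, reconstructed from the bit manipulation above (assumes non-negative, finite input, as the surrounding comment states; this is not the header itself):

#include <cstdint>
#include <cstdio>
#include <cstring>

/* Standalone sketch of the scalar float->half path used by float4_store_half():
 * flushes denormals to zero and clamps to the maximum half value 65504. */
static uint16_t float_to_half_pixel(float f)
{
  /* Clamp to the representable half range for pixel data. */
  f = (f > 0.0f) ? ((f < 65504.0f) ? f : 65504.0f) : 0.0f;

  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits)); /* safe type-punning */

  const uint32_t absolute = bits & 0x7FFFFFFFu;
  /* Adding 0xC8000000 rebiases the exponent from float (127) to half (15)
   * by subtracting 112 << 23 from the exponent field. */
  const uint32_t Z = absolute + 0xC8000000u;
  /* Inputs below 2^-14 (0x38800000) underflow to zero. */
  const uint32_t result = (absolute < 0x38800000u) ? 0 : Z;
  return (uint16_t)((result >> 13) & 0x7FFF);
}

int main()
{
  std::printf("1.0f    -> 0x%04X\n", float_to_half_pixel(1.0f));     /* 0x3C00 */
  std::printf("0.5f    -> 0x%04X\n", float_to_half_pixel(0.5f));     /* 0x3800 */
  std::printf("65504.0 -> 0x%04X\n", float_to_half_pixel(65504.0f)); /* 0x7BFF */
  return 0;
}
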
diff --git a/intern/cycles/util/util_logging.h b/intern/cycles/util/util_logging.h
index c161299acd0..35c2d436d09 100644
--- a/intern/cycles/util/util_logging.h
+++ b/intern/cycles/util/util_logging.h
@@ -49,6 +49,7 @@ class LogMessageVoidify {
# define LOG(severity) LOG_SUPPRESS()
# define VLOG(severity) LOG_SUPPRESS()
# define VLOG_IF(severity, condition) LOG_SUPPRESS()
+# define VLOG_IS_ON(severity) false
# define CHECK(expression) LOG_SUPPRESS()
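
VLOG_IS_ON() is stubbed out to false when Glog is unavailable, so callers can guard work that only feeds verbose logging. A toy sketch with simplified stand-in macros (the real fallback definitions route through LOG_SUPPRESS(); scene_stats_string() is hypothetical):

#include <iostream>
#include <string>

/* Simplified stand-ins for the logging macros shown above. */
#define VLOG_IS_ON(severity) false
#define VLOG(severity) \
  if (false) \
  std::cerr

static std::string scene_stats_string()
{
  /* Imagine an expensive scene traversal here. */
  return "objects: 1234, triangles: 5.6M";
}

int main()
{
  if (VLOG_IS_ON(3)) {
    /* The stats string is only built when verbose logging is active. */
    VLOG(3) << scene_stats_string() << "\n";
  }
  return 0;
}
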
diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index c5996ebfcb6..6d728dde679 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -26,11 +26,9 @@
# include <cmath>
#endif
-#ifndef __KERNEL_OPENCL__
-# include <float.h>
-# include <math.h>
-# include <stdio.h>
-#endif /* __KERNEL_OPENCL__ */
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
#include "util/util_types.h"
@@ -86,7 +84,6 @@ CCL_NAMESPACE_BEGIN
/* Scalar */
#ifdef _WIN32
-# ifndef __KERNEL_OPENCL__
ccl_device_inline float fmaxf(float a, float b)
{
return (a > b) ? a : b;
@@ -96,8 +93,7 @@ ccl_device_inline float fminf(float a, float b)
{
return (a < b) ? a : b;
}
-# endif /* !__KERNEL_OPENCL__ */
-#endif /* _WIN32 */
+#endif /* _WIN32 */
#ifndef __KERNEL_GPU__
using std::isfinite;
@@ -119,6 +115,11 @@ ccl_device_inline int min(int a, int b)
return (a < b) ? a : b;
}
+ccl_device_inline uint min(uint a, uint b)
+{
+ return (a < b) ? a : b;
+}
+
ccl_device_inline float max(float a, float b)
{
return (a > b) ? a : b;
@@ -166,7 +167,6 @@ ccl_device_inline float max4(float a, float b, float c, float d)
return max(max(a, b), max(c, d));
}
-#ifndef __KERNEL_OPENCL__
/* Int/Float conversion */
ccl_device_inline int as_int(uint i)
@@ -241,24 +241,23 @@ ccl_device_inline float __uint_as_float(uint i)
ccl_device_inline int4 __float4_as_int4(float4 f)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return int4(_mm_castps_si128(f.m128));
-# else
+#else
return make_int4(
__float_as_int(f.x), __float_as_int(f.y), __float_as_int(f.z), __float_as_int(f.w));
-# endif
+#endif
}
ccl_device_inline float4 __int4_as_float4(int4 i)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float4(_mm_castsi128_ps(i.m128));
-# else
+#else
return make_float4(
__int_as_float(i.x), __int_as_float(i.y), __int_as_float(i.z), __int_as_float(i.w));
-# endif
+#endif
}
-#endif /* __KERNEL_OPENCL__ */
/* Versions of functions which are safe for fast math. */
ccl_device_inline bool isnan_safe(float f)
@@ -279,7 +278,6 @@ ccl_device_inline float ensure_finite(float v)
return isfinite_safe(v) ? v : 0.0f;
}
-#ifndef __KERNEL_OPENCL__
ccl_device_inline int clamp(int a, int mn, int mx)
{
return min(max(a, mn), mx);
@@ -309,8 +307,6 @@ ccl_device_inline float smoothstep(float edge0, float edge1, float x)
return result;
}
-#endif /* __KERNEL_OPENCL__ */
-
#ifndef __KERNEL_CUDA__
ccl_device_inline float saturate(float a)
{
@@ -451,7 +447,6 @@ CCL_NAMESPACE_END
CCL_NAMESPACE_BEGIN
-#ifndef __KERNEL_OPENCL__
/* Interpolation */
template<class A, class B> A lerp(const A &a, const A &b, const B &t)
@@ -459,15 +454,9 @@ template<class A, class B> A lerp(const A &a, const A &b, const B &t)
return (A)(a * ((B)1 - t) + b * t);
}
-#endif /* __KERNEL_OPENCL__ */
-
/* Triangle */
-#ifndef __KERNEL_OPENCL__
ccl_device_inline float triangle_area(const float3 &v1, const float3 &v2, const float3 &v3)
-#else
-ccl_device_inline float triangle_area(const float3 v1, const float3 v2, const float3 v3)
-#endif
{
return len(cross(v3 - v2, v1 - v2)) * 0.5f;
}
@@ -665,11 +654,7 @@ ccl_device_inline float pow22(float a)
ccl_device_inline float beta(float x, float y)
{
-#ifndef __KERNEL_OPENCL__
return expf(lgammaf(x) + lgammaf(y) - lgammaf(x + y));
-#else
- return expf(lgamma(x) + lgamma(y) - lgamma(x + y));
-#endif
}
ccl_device_inline float xor_signmask(float x, int y)
@@ -686,8 +671,6 @@ ccl_device_inline uint count_leading_zeros(uint x)
{
#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__)
return __clz(x);
-#elif defined(__KERNEL_OPENCL__)
- return clz(x);
#else
assert(x != 0);
# ifdef _MSC_VER
@@ -704,8 +687,6 @@ ccl_device_inline uint count_trailing_zeros(uint x)
{
#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__)
return (__ffs(x) - 1);
-#elif defined(__KERNEL_OPENCL__)
- return (31 - count_leading_zeros(x & -x));
#else
assert(x != 0);
# ifdef _MSC_VER
@@ -722,8 +703,6 @@ ccl_device_inline uint find_first_set(uint x)
{
#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__)
return __ffs(x);
-#elif defined(__KERNEL_OPENCL__)
- return (x != 0) ? (32 - count_leading_zeros(x & (-x))) : 0;
#else
# ifdef _MSC_VER
return (x != 0) ? (32 - count_leading_zeros(x & (-x))) : 0;
@@ -797,6 +776,52 @@ ccl_device_inline float precise_angle(float3 a, float3 b)
return 2.0f * atan2f(len(a - b), len(a + b));
}
+/* Return value which is greater than the given one and is a power of two. */
+ccl_device_inline uint next_power_of_two(uint x)
+{
+ return x == 0 ? 1 : 1 << (32 - count_leading_zeros(x));
+}
+
+/* Return value which is lower than the given one and is a power of two. */
+ccl_device_inline uint prev_power_of_two(uint x)
+{
+ return x < 2 ? x : 1 << (31 - count_leading_zeros(x - 1));
+}
+
+#ifndef __has_builtin
+# define __has_builtin(v) 0
+#endif
+
+/* Reverses the bits of a 32 bit integer. */
+ccl_device_inline uint32_t reverse_integer_bits(uint32_t x)
+{
+ /* Use a native instruction if it exists. */
+#if defined(__arm__) || defined(__aarch64__)
+ __asm__("rbit %w0, %w1" : "=r"(x) : "r"(x));
+ return x;
+#elif defined(__KERNEL_CUDA__)
+ return __brev(x);
+#elif __has_builtin(__builtin_bitreverse32)
+ return __builtin_bitreverse32(x);
+#else
+ /* Flip pairwise. */
+ x = ((x & 0x55555555) << 1) | ((x & 0xAAAAAAAA) >> 1);
+ /* Flip pairs. */
+ x = ((x & 0x33333333) << 2) | ((x & 0xCCCCCCCC) >> 2);
+ /* Flip nibbles. */
+ x = ((x & 0x0F0F0F0F) << 4) | ((x & 0xF0F0F0F0) >> 4);
+ /* Flip bytes. CPUs have an instruction for that, pretty fast one. */
+# ifdef _MSC_VER
+ return _byteswap_ulong(x);
+# elif defined(__INTEL_COMPILER)
+ return (uint32_t)_bswap((int)x);
+# else
+ /* Assuming gcc or clang. */
+ return __builtin_bswap32(x);
+# endif
+#endif
+}
+
CCL_NAMESPACE_END
#endif /* __UTIL_MATH_H__ */
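
The new power-of-two and bit-reversal helpers can be exercised in isolation. A portable sketch using only the generic fallbacks (the real helpers prefer __clz, rbit, __builtin_bitreverse32 or a byte-swap instruction where available):

#include <cassert>
#include <cstdint>
#include <cstdio>

/* Portable stand-in for the count_leading_zeros() used by the helpers. */
static uint32_t count_leading_zeros(uint32_t x)
{
  assert(x != 0);
  uint32_t n = 0;
  while ((x & 0x80000000u) == 0) {
    x <<= 1;
    ++n;
  }
  return n;
}

/* Smallest power of two strictly greater than x. */
static uint32_t next_power_of_two(uint32_t x)
{
  return x == 0 ? 1 : 1u << (32 - count_leading_zeros(x));
}

/* Largest power of two strictly lower than x (0 and 1 are returned unchanged). */
static uint32_t prev_power_of_two(uint32_t x)
{
  return x < 2 ? x : 1u << (31 - count_leading_zeros(x - 1));
}

/* Bit reversal via the same pairwise/pairs/nibble swaps as the CPU fallback,
 * with the final byte swap written out explicitly. */
static uint32_t reverse_integer_bits(uint32_t x)
{
  x = ((x & 0x55555555u) << 1) | ((x & 0xAAAAAAAAu) >> 1);
  x = ((x & 0x33333333u) << 2) | ((x & 0xCCCCCCCCu) >> 2);
  x = ((x & 0x0F0F0F0Fu) << 4) | ((x & 0xF0F0F0F0u) >> 4);
  x = ((x & 0x00FF00FFu) << 8) | ((x & 0xFF00FF00u) >> 8);
  return (x << 16) | (x >> 16);
}

int main()
{
  assert(next_power_of_two(5) == 8);
  assert(next_power_of_two(8) == 16); /* strictly greater, so 8 -> 16 */
  assert(prev_power_of_two(8) == 4);  /* strictly lower, so 8 -> 4 */
  assert(reverse_integer_bits(0x80000000u) == 1u);
  std::printf("ok\n");
  return 0;
}
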
diff --git a/intern/cycles/util/util_math_float2.h b/intern/cycles/util/util_math_float2.h
index 17f6f3c9382..70b80c33544 100644
--- a/intern/cycles/util/util_math_float2.h
+++ b/intern/cycles/util/util_math_float2.h
@@ -27,7 +27,6 @@ CCL_NAMESPACE_BEGIN
* Declaration.
*/
-#ifndef __KERNEL_OPENCL__
ccl_device_inline float2 operator-(const float2 &a);
ccl_device_inline float2 operator*(const float2 &a, const float2 &b);
ccl_device_inline float2 operator*(const float2 &a, float f);
@@ -64,7 +63,6 @@ ccl_device_inline float2 fabs(const float2 &a);
ccl_device_inline float2 as_float2(const float4 &a);
ccl_device_inline float2 interp(const float2 &a, const float2 &b, float t);
ccl_device_inline float2 floor(const float2 &a);
-#endif /* !__KERNEL_OPENCL__ */
ccl_device_inline float2 safe_divide_float2_float(const float2 a, const float b);
@@ -82,7 +80,6 @@ ccl_device_inline float2 one_float2()
return make_float2(1.0f, 1.0f);
}
-#ifndef __KERNEL_OPENCL__
ccl_device_inline float2 operator-(const float2 &a)
{
return make_float2(-a.x, -a.y);
@@ -262,8 +259,6 @@ ccl_device_inline float2 floor(const float2 &a)
return make_float2(floorf(a.x), floorf(a.y));
}
-#endif /* !__KERNEL_OPENCL__ */
-
ccl_device_inline float2 safe_divide_float2_float(const float2 a, const float b)
{
return (b != 0.0f) ? a / b : zero_float2();
diff --git a/intern/cycles/util/util_math_float3.h b/intern/cycles/util/util_math_float3.h
index 9673c043189..30a1b4c3f77 100644
--- a/intern/cycles/util/util_math_float3.h
+++ b/intern/cycles/util/util_math_float3.h
@@ -27,7 +27,6 @@ CCL_NAMESPACE_BEGIN
* Declaration.
*/
-#ifndef __KERNEL_OPENCL__
ccl_device_inline float3 operator-(const float3 &a);
ccl_device_inline float3 operator*(const float3 &a, const float3 &b);
ccl_device_inline float3 operator*(const float3 &a, const float f);
@@ -63,7 +62,6 @@ ccl_device_inline float3 rcp(const float3 &a);
ccl_device_inline float3 sqrt(const float3 &a);
ccl_device_inline float3 floor(const float3 &a);
ccl_device_inline float3 ceil(const float3 &a);
-#endif /* !__KERNEL_OPENCL__ */
ccl_device_inline float min3(float3 a);
ccl_device_inline float max3(float3 a);
@@ -105,50 +103,49 @@ ccl_device_inline float3 one_float3()
return make_float3(1.0f, 1.0f, 1.0f);
}
-#ifndef __KERNEL_OPENCL__
ccl_device_inline float3 operator-(const float3 &a)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float3(_mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
-# else
+#else
return make_float3(-a.x, -a.y, -a.z);
-# endif
+#endif
}
ccl_device_inline float3 operator*(const float3 &a, const float3 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float3(_mm_mul_ps(a.m128, b.m128));
-# else
+#else
return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
-# endif
+#endif
}
ccl_device_inline float3 operator*(const float3 &a, const float f)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float3(_mm_mul_ps(a.m128, _mm_set1_ps(f)));
-# else
+#else
return make_float3(a.x * f, a.y * f, a.z * f);
-# endif
+#endif
}
ccl_device_inline float3 operator*(const float f, const float3 &a)
{
-# if defined(__KERNEL_SSE__)
+#if defined(__KERNEL_SSE__)
return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128));
-# else
+#else
return make_float3(a.x * f, a.y * f, a.z * f);
-# endif
+#endif
}
ccl_device_inline float3 operator/(const float f, const float3 &a)
{
-# if defined(__KERNEL_SSE__)
+#if defined(__KERNEL_SSE__)
return float3(_mm_div_ps(_mm_set1_ps(f), a.m128));
-# else
+#else
return make_float3(f / a.x, f / a.y, f / a.z);
-# endif
+#endif
}
ccl_device_inline float3 operator/(const float3 &a, const float f)
@@ -159,11 +156,11 @@ ccl_device_inline float3 operator/(const float3 &a, const float f)
ccl_device_inline float3 operator/(const float3 &a, const float3 &b)
{
-# if defined(__KERNEL_SSE__)
+#if defined(__KERNEL_SSE__)
return float3(_mm_div_ps(a.m128, b.m128));
-# else
+#else
return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
-# endif
+#endif
}
ccl_device_inline float3 operator+(const float3 &a, const float f)
@@ -173,11 +170,11 @@ ccl_device_inline float3 operator+(const float3 &a, const float f)
ccl_device_inline float3 operator+(const float3 &a, const float3 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float3(_mm_add_ps(a.m128, b.m128));
-# else
+#else
return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
-# endif
+#endif
}
ccl_device_inline float3 operator-(const float3 &a, const float f)
@@ -187,11 +184,11 @@ ccl_device_inline float3 operator-(const float3 &a, const float f)
ccl_device_inline float3 operator-(const float3 &a, const float3 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float3(_mm_sub_ps(a.m128, b.m128));
-# else
+#else
return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
-# endif
+#endif
}
ccl_device_inline float3 operator+=(float3 &a, const float3 &b)
@@ -227,11 +224,11 @@ ccl_device_inline float3 operator/=(float3 &a, float f)
ccl_device_inline bool operator==(const float3 &a, const float3 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 7) == 7;
-# else
+#else
return (a.x == b.x && a.y == b.y && a.z == b.z);
-# endif
+#endif
}
ccl_device_inline bool operator!=(const float3 &a, const float3 &b)
@@ -246,20 +243,20 @@ ccl_device_inline float distance(const float3 &a, const float3 &b)
ccl_device_inline float dot(const float3 &a, const float3 &b)
{
-# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F));
-# else
+#else
return a.x * b.x + a.y * b.y + a.z * b.z;
-# endif
+#endif
}
ccl_device_inline float dot_xy(const float3 &a, const float3 &b)
{
-# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a, b), b));
-# else
+#else
return a.x * b.x + a.y * b.y;
-# endif
+#endif
}
ccl_device_inline float3 cross(const float3 &a, const float3 &b)
@@ -270,30 +267,30 @@ ccl_device_inline float3 cross(const float3 &a, const float3 &b)
ccl_device_inline float3 normalize(const float3 &a)
{
-# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
__m128 norm = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F));
return float3(_mm_div_ps(a.m128, norm));
-# else
+#else
return a / len(a);
-# endif
+#endif
}
ccl_device_inline float3 min(const float3 &a, const float3 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float3(_mm_min_ps(a.m128, b.m128));
-# else
+#else
return make_float3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
-# endif
+#endif
}
ccl_device_inline float3 max(const float3 &a, const float3 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float3(_mm_max_ps(a.m128, b.m128));
-# else
+#else
return make_float3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
-# endif
+#endif
}
ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 &mx)
@@ -303,43 +300,43 @@ ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 &
ccl_device_inline float3 fabs(const float3 &a)
{
-# ifdef __KERNEL_SSE__
-# ifdef __KERNEL_NEON__
+#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_NEON__
return float3(vabsq_f32(a.m128));
-# else
+# else
__m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
return float3(_mm_and_ps(a.m128, mask));
-# endif
-# else
- return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z));
# endif
+#else
+ return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z));
+#endif
}
ccl_device_inline float3 sqrt(const float3 &a)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float3(_mm_sqrt_ps(a));
-# else
+#else
return make_float3(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z));
-# endif
+#endif
}
ccl_device_inline float3 floor(const float3 &a)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float3(_mm_floor_ps(a));
-# else
+#else
return make_float3(floorf(a.x), floorf(a.y), floorf(a.z));
-# endif
+#endif
}
ccl_device_inline float3 ceil(const float3 &a)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float3(_mm_ceil_ps(a));
-# else
+#else
return make_float3(ceilf(a.x), ceilf(a.y), ceilf(a.z));
-# endif
+#endif
}
ccl_device_inline float3 mix(const float3 &a, const float3 &b, float t)
@@ -349,14 +346,13 @@ ccl_device_inline float3 mix(const float3 &a, const float3 &b, float t)
ccl_device_inline float3 rcp(const float3 &a)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
/* Don't use _mm_rcp_ps due to poor precision. */
return float3(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
-# else
+#else
return make_float3(1.0f / a.x, 1.0f / a.y, 1.0f / a.z);
-# endif
+#endif
}
-#endif /* !__KERNEL_OPENCL__ */
ccl_device_inline float min3(float3 a)
{
@@ -483,11 +479,7 @@ ccl_device_inline float average(const float3 a)
ccl_device_inline bool isequal_float3(const float3 a, const float3 b)
{
-#ifdef __KERNEL_OPENCL__
- return all(a == b);
-#else
return a == b;
-#endif
}
ccl_device_inline float3 pow3(float3 v, float e)
diff --git a/intern/cycles/util/util_math_float4.h b/intern/cycles/util/util_math_float4.h
index 0ba2bafa2f0..19af5c8c638 100644
--- a/intern/cycles/util/util_math_float4.h
+++ b/intern/cycles/util/util_math_float4.h
@@ -27,7 +27,6 @@ CCL_NAMESPACE_BEGIN
* Declaration.
*/
-#ifndef __KERNEL_OPENCL__
ccl_device_inline float4 operator-(const float4 &a);
ccl_device_inline float4 operator*(const float4 &a, const float4 &b);
ccl_device_inline float4 operator*(const float4 &a, float f);
@@ -66,7 +65,6 @@ ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &
ccl_device_inline float4 fabs(const float4 &a);
ccl_device_inline float4 floor(const float4 &a);
ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t);
-#endif /* !__KERNEL_OPENCL__*/
ccl_device_inline float4 safe_divide_float4_float(const float4 a, const float b);
@@ -112,33 +110,32 @@ ccl_device_inline float4 one_float4()
return make_float4(1.0f, 1.0f, 1.0f, 1.0f);
}
-#ifndef __KERNEL_OPENCL__
ccl_device_inline float4 operator-(const float4 &a)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
__m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
return float4(_mm_xor_ps(a.m128, mask));
-# else
+#else
return make_float4(-a.x, -a.y, -a.z, -a.w);
-# endif
+#endif
}
ccl_device_inline float4 operator*(const float4 &a, const float4 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float4(_mm_mul_ps(a.m128, b.m128));
-# else
+#else
return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
-# endif
+#endif
}
ccl_device_inline float4 operator*(const float4 &a, float f)
{
-# if defined(__KERNEL_SSE__)
+#if defined(__KERNEL_SSE__)
return a * make_float4(f);
-# else
+#else
return make_float4(a.x * f, a.y * f, a.z * f, a.w * f);
-# endif
+#endif
}
ccl_device_inline float4 operator*(float f, const float4 &a)
@@ -153,11 +150,11 @@ ccl_device_inline float4 operator/(const float4 &a, float f)
ccl_device_inline float4 operator/(const float4 &a, const float4 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float4(_mm_div_ps(a.m128, b.m128));
-# else
+#else
return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
-# endif
+#endif
}
ccl_device_inline float4 operator+(const float4 &a, const float f)
@@ -167,11 +164,11 @@ ccl_device_inline float4 operator+(const float4 &a, const float f)
ccl_device_inline float4 operator+(const float4 &a, const float4 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float4(_mm_add_ps(a.m128, b.m128));
-# else
+#else
return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
-# endif
+#endif
}
ccl_device_inline float4 operator-(const float4 &a, const float f)
@@ -181,11 +178,11 @@ ccl_device_inline float4 operator-(const float4 &a, const float f)
ccl_device_inline float4 operator-(const float4 &a, const float4 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float4(_mm_sub_ps(a.m128, b.m128));
-# else
+#else
return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
-# endif
+#endif
}
ccl_device_inline float4 operator+=(float4 &a, const float4 &b)
@@ -215,38 +212,38 @@ ccl_device_inline float4 operator/=(float4 &a, float f)
ccl_device_inline int4 operator<(const float4 &a, const float4 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return int4(_mm_castps_si128(_mm_cmplt_ps(a.m128, b.m128)));
-# else
+#else
return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w);
-# endif
+#endif
}
ccl_device_inline int4 operator>=(const float4 &a, const float4 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return int4(_mm_castps_si128(_mm_cmpge_ps(a.m128, b.m128)));
-# else
+#else
return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w);
-# endif
+#endif
}
ccl_device_inline int4 operator<=(const float4 &a, const float4 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return int4(_mm_castps_si128(_mm_cmple_ps(a.m128, b.m128)));
-# else
+#else
return make_int4(a.x <= b.x, a.y <= b.y, a.z <= b.z, a.w <= b.w);
-# endif
+#endif
}
ccl_device_inline bool operator==(const float4 &a, const float4 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15;
-# else
+#else
return (a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w);
-# endif
+#endif
}
ccl_device_inline float distance(const float4 &a, const float4 &b)
@@ -256,16 +253,16 @@ ccl_device_inline float distance(const float4 &a, const float4 &b)
ccl_device_inline float dot(const float4 &a, const float4 &b)
{
-# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
-# if defined(__KERNEL_NEON__)
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+# if defined(__KERNEL_NEON__)
__m128 t = vmulq_f32(a, b);
return vaddvq_f32(t);
-# else
- return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
-# endif
# else
- return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w);
+ return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
# endif
+#else
+ return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w);
+#endif
}
ccl_device_inline float len_squared(const float4 &a)
@@ -275,21 +272,21 @@ ccl_device_inline float len_squared(const float4 &a)
ccl_device_inline float4 rcp(const float4 &a)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
/* Don't use _mm_rcp_ps due to poor precision. */
return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
-# else
+#else
return make_float4(1.0f / a.x, 1.0f / a.y, 1.0f / a.z, 1.0f / a.w);
-# endif
+#endif
}
ccl_device_inline float4 sqrt(const float4 &a)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float4(_mm_sqrt_ps(a.m128));
-# else
+#else
return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));
-# endif
+#endif
}
ccl_device_inline float4 sqr(const float4 &a)
@@ -299,39 +296,39 @@ ccl_device_inline float4 sqr(const float4 &a)
ccl_device_inline float4 cross(const float4 &a, const float4 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return (shuffle<1, 2, 0, 0>(a) * shuffle<2, 0, 1, 0>(b)) -
(shuffle<2, 0, 1, 0>(a) * shuffle<1, 2, 0, 0>(b));
-# else
+#else
return make_float4(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x, 0.0f);
-# endif
+#endif
}
ccl_device_inline bool is_zero(const float4 &a)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return a == make_float4(0.0f);
-# else
+#else
return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f);
-# endif
+#endif
}
ccl_device_inline float4 reduce_add(const float4 &a)
{
-# if defined(__KERNEL_SSE__)
-# if defined(__KERNEL_NEON__)
+#if defined(__KERNEL_SSE__)
+# if defined(__KERNEL_NEON__)
return float4(vdupq_n_f32(vaddvq_f32(a)));
-# elif defined(__KERNEL_SSE3__)
+# elif defined(__KERNEL_SSE3__)
float4 h(_mm_hadd_ps(a.m128, a.m128));
return float4(_mm_hadd_ps(h.m128, h.m128));
-# else
+# else
float4 h(shuffle<1, 0, 3, 2>(a) + a);
return shuffle<2, 3, 0, 1>(h) + h;
-# endif
-# else
+# endif
+#else
float sum = (a.x + a.y) + (a.z + a.w);
return make_float4(sum, sum, sum, sum);
-# endif
+#endif
}
ccl_device_inline float average(const float4 &a)
@@ -357,20 +354,20 @@ ccl_device_inline float4 safe_normalize(const float4 &a)
ccl_device_inline float4 min(const float4 &a, const float4 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float4(_mm_min_ps(a.m128, b.m128));
-# else
+#else
return make_float4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
-# endif
+#endif
}
ccl_device_inline float4 max(const float4 &a, const float4 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float4(_mm_max_ps(a.m128, b.m128));
-# else
+#else
return make_float4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
-# endif
+#endif
}
ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &mx)
@@ -380,24 +377,24 @@ ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &
ccl_device_inline float4 fabs(const float4 &a)
{
-# if defined(__KERNEL_SSE__)
-# if defined(__KERNEL_NEON__)
+#if defined(__KERNEL_SSE__)
+# if defined(__KERNEL_NEON__)
return float4(vabsq_f32(a));
-# else
- return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
-# endif
# else
- return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
+ return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
# endif
+#else
+ return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
+#endif
}
ccl_device_inline float4 floor(const float4 &a)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float4(_mm_floor_ps(a));
-# else
+#else
return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
-# endif
+#endif
}
ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t)
@@ -405,8 +402,6 @@ ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t)
return a + t * (b - a);
}
-#endif /* !__KERNEL_OPENCL__*/
-
#ifdef __KERNEL_SSE__
template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
__forceinline const float4 shuffle(const float4 &b)
diff --git a/intern/cycles/util/util_math_int2.h b/intern/cycles/util/util_math_int2.h
index 0295cd51f7e..5782b878801 100644
--- a/intern/cycles/util/util_math_int2.h
+++ b/intern/cycles/util/util_math_int2.h
@@ -27,20 +27,17 @@ CCL_NAMESPACE_BEGIN
* Declaration.
*/
-#ifndef __KERNEL_OPENCL__
ccl_device_inline bool operator==(const int2 a, const int2 b);
ccl_device_inline int2 operator+(const int2 &a, const int2 &b);
ccl_device_inline int2 operator+=(int2 &a, const int2 &b);
ccl_device_inline int2 operator-(const int2 &a, const int2 &b);
ccl_device_inline int2 operator*(const int2 &a, const int2 &b);
ccl_device_inline int2 operator/(const int2 &a, const int2 &b);
-#endif /* !__KERNEL_OPENCL__ */
/*******************************************************************************
* Definition.
*/
-#ifndef __KERNEL_OPENCL__
ccl_device_inline bool operator==(const int2 a, const int2 b)
{
return (a.x == b.x && a.y == b.y);
@@ -70,7 +67,6 @@ ccl_device_inline int2 operator/(const int2 &a, const int2 &b)
{
return make_int2(a.x / b.x, a.y / b.y);
}
-#endif /* !__KERNEL_OPENCL__ */
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_math_int3.h b/intern/cycles/util/util_math_int3.h
index d92ed895dc2..e0dfae7c015 100644
--- a/intern/cycles/util/util_math_int3.h
+++ b/intern/cycles/util/util_math_int3.h
@@ -27,52 +27,49 @@ CCL_NAMESPACE_BEGIN
* Declaration.
*/
-#ifndef __KERNEL_OPENCL__
ccl_device_inline int3 min(int3 a, int3 b);
ccl_device_inline int3 max(int3 a, int3 b);
ccl_device_inline int3 clamp(const int3 &a, int mn, int mx);
ccl_device_inline int3 clamp(const int3 &a, int3 &mn, int mx);
-#endif /* !__KERNEL_OPENCL__ */
/*******************************************************************************
* Definition.
*/
-#ifndef __KERNEL_OPENCL__
ccl_device_inline int3 min(int3 a, int3 b)
{
-# if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
+#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
return int3(_mm_min_epi32(a.m128, b.m128));
-# else
+#else
return make_int3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
-# endif
+#endif
}
ccl_device_inline int3 max(int3 a, int3 b)
{
-# if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
+#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
return int3(_mm_max_epi32(a.m128, b.m128));
-# else
+#else
return make_int3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
-# endif
+#endif
}
ccl_device_inline int3 clamp(const int3 &a, int mn, int mx)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return min(max(a, make_int3(mn)), make_int3(mx));
-# else
+#else
return make_int3(clamp(a.x, mn, mx), clamp(a.y, mn, mx), clamp(a.z, mn, mx));
-# endif
+#endif
}
ccl_device_inline int3 clamp(const int3 &a, int3 &mn, int mx)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return min(max(a, mn), make_int3(mx));
-# else
+#else
return make_int3(clamp(a.x, mn.x, mx), clamp(a.y, mn.y, mx), clamp(a.z, mn.z, mx));
-# endif
+#endif
}
ccl_device_inline bool operator==(const int3 &a, const int3 &b)
@@ -92,22 +89,21 @@ ccl_device_inline bool operator<(const int3 &a, const int3 &b)
ccl_device_inline int3 operator+(const int3 &a, const int3 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return int3(_mm_add_epi32(a.m128, b.m128));
-# else
+#else
return make_int3(a.x + b.x, a.y + b.y, a.z + b.z);
-# endif
+#endif
}
ccl_device_inline int3 operator-(const int3 &a, const int3 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return int3(_mm_sub_epi32(a.m128, b.m128));
-# else
+#else
return make_int3(a.x - b.x, a.y - b.y, a.z - b.z);
-# endif
+#endif
}
-#endif /* !__KERNEL_OPENCL__ */
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_path.cpp b/intern/cycles/util/util_path.cpp
index 8905c8bc7f0..c78f4615013 100644
--- a/intern/cycles/util/util_path.cpp
+++ b/intern/cycles/util/util_path.cpp
@@ -66,6 +66,7 @@ typedef struct stat path_stat_t;
static string cached_path = "";
static string cached_user_path = "";
+static string cached_temp_path = "";
static string cached_xdg_cache_path = "";
namespace {
@@ -335,10 +336,11 @@ static string path_xdg_cache_get()
}
#endif
-void path_init(const string &path, const string &user_path)
+void path_init(const string &path, const string &user_path, const string &temp_path)
{
cached_path = path;
cached_user_path = user_path;
+ cached_temp_path = temp_path;
#ifdef _MSC_VER
// workaround for https://svn.boost.org/trac/boost/ticket/6320
@@ -382,6 +384,15 @@ string path_cache_get(const string &sub)
#endif
}
+string path_temp_get(const string &sub)
+{
+ if (cached_temp_path == "") {
+ cached_temp_path = Filesystem::temp_directory_path();
+ }
+
+ return path_join(cached_temp_path, sub);
+}
+
#if defined(__linux__) || defined(__APPLE__)
string path_xdg_home_get(const string &sub = "");
#endif
@@ -739,177 +750,6 @@ bool path_remove(const string &path)
return remove(path.c_str()) == 0;
}
-struct SourceReplaceState {
- typedef map<string, string> ProcessedMapping;
- /* Base director for all relative include headers. */
- string base;
- /* Result of processed files. */
- ProcessedMapping processed_files;
- /* Set of files which are considered "precompiled" and which are replaced
- * with and empty string on a subsequent occurrence in include statement.
- */
- set<string> precompiled_headers;
-};
-
-static string path_source_replace_includes_recursive(const string &source,
- const string &source_filepath,
- SourceReplaceState *state);
-
-static string line_directive(const SourceReplaceState &state, const string &path, const int line)
-{
- string unescaped_path = path;
- /* First we make path relative. */
- if (string_startswith(unescaped_path, state.base.c_str())) {
- const string base_file = path_filename(state.base);
- const size_t base_len = state.base.length();
- unescaped_path = base_file +
- unescaped_path.substr(base_len, unescaped_path.length() - base_len);
- }
- /* Second, we replace all unsafe characters. */
- const size_t length = unescaped_path.length();
- string escaped_path = "";
- for (size_t i = 0; i < length; ++i) {
- const char ch = unescaped_path[i];
- if (strchr("\"\'\?\\", ch) != NULL) {
- escaped_path += "\\";
- }
- escaped_path += ch;
- }
- /* TODO(sergey): Check whether using std::to_string combined with several
- * concatenation operations is any faster.
- */
- return string_printf("#line %d \"%s\"", line, escaped_path.c_str());
-}
-
-static string path_source_handle_preprocessor(const string &preprocessor_line,
- const string &source_filepath,
- const size_t line_number,
- SourceReplaceState *state)
-{
- string result = preprocessor_line;
- string token = string_strip(preprocessor_line.substr(1, preprocessor_line.size() - 1));
- if (string_startswith(token, "include")) {
- token = string_strip(token.substr(7, token.size() - 7));
- if (token[0] == '"') {
- const size_t n_start = 1;
- const size_t n_end = token.find("\"", n_start);
- const string filename = token.substr(n_start, n_end - n_start);
- const bool is_precompiled = string_endswith(token, "// PRECOMPILED");
- string filepath = path_join(state->base, filename);
- if (!path_exists(filepath)) {
- filepath = path_join(path_dirname(source_filepath), filename);
- }
- if (is_precompiled) {
- state->precompiled_headers.insert(filepath);
- }
- string text;
- if (path_read_text(filepath, text)) {
- text = path_source_replace_includes_recursive(text, filepath, state);
- /* Use line directives for better error messages. */
- result = line_directive(*state, filepath, 1) + "\n" + text + "\n" +
- line_directive(*state, source_filepath, line_number + 1);
- }
- }
- }
- return result;
-}
-
-/* Our own little c preprocessor that replaces #includes with the file
- * contents, to work around issue of OpenCL drivers not supporting
- * include paths with spaces in them.
- */
-static string path_source_replace_includes_recursive(const string &source,
- const string &source_filepath,
- SourceReplaceState *state)
-{
- /* Try to re-use processed file without spending time on replacing all
- * include directives again.
- */
- SourceReplaceState::ProcessedMapping::iterator replaced_file = state->processed_files.find(
- source_filepath);
- if (replaced_file != state->processed_files.end()) {
- if (state->precompiled_headers.find(source_filepath) != state->precompiled_headers.end()) {
- return "";
- }
- return replaced_file->second;
- }
- /* Perform full file processing. */
- string result = "";
- const size_t source_length = source.length();
- size_t index = 0;
- /* Information about where we are in the source. */
- size_t line_number = 0, column_number = 1;
- /* Currently gathered non-preprocessor token.
- * Store as start/length rather than token itself to avoid overhead of
- * memory re-allocations on each character concatenation.
- */
- size_t token_start = 0, token_length = 0;
- /* Denotes whether we're inside of preprocessor line, together with
- * preprocessor line itself.
- *
- * TODO(sergey): Investigate whether using token start/end position
- * gives measurable speedup.
- */
- bool inside_preprocessor = false;
- string preprocessor_line = "";
- /* Actual loop over the whole source. */
- while (index < source_length) {
- const char ch = source[index];
- if (ch == '\n') {
- if (inside_preprocessor) {
- result += path_source_handle_preprocessor(
- preprocessor_line, source_filepath, line_number, state);
- /* Start gathering net part of the token. */
- token_start = index;
- token_length = 0;
- }
- inside_preprocessor = false;
- preprocessor_line = "";
- column_number = 0;
- ++line_number;
- }
- else if (ch == '#' && column_number == 1 && !inside_preprocessor) {
- /* Append all possible non-preprocessor token to the result. */
- if (token_length != 0) {
- result.append(source, token_start, token_length);
- token_start = index;
- token_length = 0;
- }
- inside_preprocessor = true;
- }
- if (inside_preprocessor) {
- preprocessor_line += ch;
- }
- else {
- ++token_length;
- }
- ++index;
- ++column_number;
- }
- /* Append possible tokens which happened before special events handled
- * above.
- */
- if (token_length != 0) {
- result.append(source, token_start, token_length);
- }
- if (inside_preprocessor) {
- result += path_source_handle_preprocessor(
- preprocessor_line, source_filepath, line_number, state);
- }
- /* Store result for further reuse. */
- state->processed_files[source_filepath] = result;
- return result;
-}
-
-string path_source_replace_includes(const string &source,
- const string &path,
- const string &source_filename)
-{
- SourceReplaceState state;
- state.base = path;
- return path_source_replace_includes_recursive(source, path_join(path, source_filename), &state);
-}
-
FILE *path_fopen(const string &path, const string &mode)
{
#ifdef _WIN32
diff --git a/intern/cycles/util/util_path.h b/intern/cycles/util/util_path.h
index 7a83c2135a4..f899bc2e01c 100644
--- a/intern/cycles/util/util_path.h
+++ b/intern/cycles/util/util_path.h
@@ -32,9 +32,10 @@
CCL_NAMESPACE_BEGIN
/* program paths */
-void path_init(const string &path = "", const string &user_path = "");
+void path_init(const string &path = "", const string &user_path = "", const string &tmp_path = "");
string path_get(const string &sub = "");
string path_user_get(const string &sub = "");
+string path_temp_get(const string &sub = "");
string path_cache_get(const string &sub = "");
/* path string manipulation */
@@ -65,11 +66,6 @@ bool path_read_text(const string &path, string &text);
/* File manipulation. */
bool path_remove(const string &path);
-/* source code utility */
-string path_source_replace_includes(const string &source,
- const string &path,
- const string &source_filename = "");
-
/* cache utility */
void path_cache_clear_except(const string &name, const set<string> &except);
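
path_init() now carries an optional temporary directory, and path_temp_get() joins sub-paths onto it, falling back to the system temp directory when none was supplied. A simplified standalone sketch (single-argument init, with "/tmp" standing in for OIIO's Filesystem::temp_directory_path()):

#include <cstdio>
#include <string>

static std::string cached_temp_path;

static std::string path_join(const std::string &dir, const std::string &file)
{
  if (dir.empty()) {
    return file;
  }
  return dir + "/" + file;
}

/* The real path_init() also takes the program and user paths. */
static void path_init(const std::string &temp_path = "")
{
  cached_temp_path = temp_path;
}

static std::string path_temp_get(const std::string &sub)
{
  if (cached_temp_path.empty()) {
    cached_temp_path = "/tmp"; /* stand-in for the system temp directory lookup */
  }
  return path_join(cached_temp_path, sub);
}

int main()
{
  path_init("/var/tmp/cycles");
  /* Prints /var/tmp/cycles/cache_abc.kernel */
  std::printf("%s\n", path_temp_get("cache_abc.kernel").c_str());
  return 0;
}
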
diff --git a/intern/cycles/util/util_profiling.cpp b/intern/cycles/util/util_profiling.cpp
index 073b09f719f..5343f076e22 100644
--- a/intern/cycles/util/util_profiling.cpp
+++ b/intern/cycles/util/util_profiling.cpp
@@ -48,13 +48,7 @@ void Profiler::run()
}
if (cur_shader >= 0 && cur_shader < shader_samples.size()) {
- /* Only consider the active shader during events whose runtime significantly depends on it.
- */
- if (((cur_event >= PROFILING_SHADER_EVAL) && (cur_event <= PROFILING_SUBSURFACE)) ||
- ((cur_event >= PROFILING_CLOSURE_EVAL) &&
- (cur_event <= PROFILING_CLOSURE_VOLUME_SAMPLE))) {
- shader_samples[cur_shader]++;
- }
+ shader_samples[cur_shader]++;
}
if (cur_object >= 0 && cur_object < object_samples.size()) {
diff --git a/intern/cycles/util/util_profiling.h b/intern/cycles/util/util_profiling.h
index ceec08ed894..96bb682c50e 100644
--- a/intern/cycles/util/util_profiling.h
+++ b/intern/cycles/util/util_profiling.h
@@ -28,38 +28,30 @@ CCL_NAMESPACE_BEGIN
enum ProfilingEvent : uint32_t {
PROFILING_UNKNOWN,
PROFILING_RAY_SETUP,
- PROFILING_PATH_INTEGRATE,
- PROFILING_SCENE_INTERSECT,
- PROFILING_INDIRECT_EMISSION,
- PROFILING_VOLUME,
- PROFILING_SHADER_SETUP,
- PROFILING_SHADER_EVAL,
- PROFILING_SHADER_APPLY,
- PROFILING_AO,
- PROFILING_SUBSURFACE,
- PROFILING_CONNECT_LIGHT,
- PROFILING_SURFACE_BOUNCE,
- PROFILING_WRITE_RESULT,
-
- PROFILING_INTERSECT,
- PROFILING_INTERSECT_LOCAL,
- PROFILING_INTERSECT_SHADOW_ALL,
- PROFILING_INTERSECT_VOLUME,
- PROFILING_INTERSECT_VOLUME_ALL,
-
- PROFILING_CLOSURE_EVAL,
- PROFILING_CLOSURE_SAMPLE,
- PROFILING_CLOSURE_VOLUME_EVAL,
- PROFILING_CLOSURE_VOLUME_SAMPLE,
-
- PROFILING_DENOISING,
- PROFILING_DENOISING_CONSTRUCT_TRANSFORM,
- PROFILING_DENOISING_RECONSTRUCT,
- PROFILING_DENOISING_DIVIDE_SHADOW,
- PROFILING_DENOISING_NON_LOCAL_MEANS,
- PROFILING_DENOISING_COMBINE_HALVES,
- PROFILING_DENOISING_GET_FEATURE,
- PROFILING_DENOISING_DETECT_OUTLIERS,
+
+ PROFILING_INTERSECT_CLOSEST,
+ PROFILING_INTERSECT_SUBSURFACE,
+ PROFILING_INTERSECT_SHADOW,
+ PROFILING_INTERSECT_VOLUME_STACK,
+
+ PROFILING_SHADE_SURFACE_SETUP,
+ PROFILING_SHADE_SURFACE_EVAL,
+ PROFILING_SHADE_SURFACE_DIRECT_LIGHT,
+ PROFILING_SHADE_SURFACE_INDIRECT_LIGHT,
+ PROFILING_SHADE_SURFACE_AO,
+ PROFILING_SHADE_SURFACE_PASSES,
+
+ PROFILING_SHADE_VOLUME_SETUP,
+ PROFILING_SHADE_VOLUME_INTEGRATE,
+ PROFILING_SHADE_VOLUME_DIRECT_LIGHT,
+ PROFILING_SHADE_VOLUME_INDIRECT_LIGHT,
+
+ PROFILING_SHADE_SHADOW_SETUP,
+ PROFILING_SHADE_SHADOW_SURFACE,
+ PROFILING_SHADE_SHADOW_VOLUME,
+
+ PROFILING_SHADE_LIGHT_SETUP,
+ PROFILING_SHADE_LIGHT_EVAL,
PROFILING_NUM_EVENTS,
};
@@ -136,37 +128,51 @@ class ProfilingHelper {
state->event = event;
}
+ ~ProfilingHelper()
+ {
+ state->event = previous_event;
+ }
+
inline void set_event(ProfilingEvent event)
{
state->event = event;
}
- inline void set_shader(int shader)
+ protected:
+ ProfilingState *state;
+ uint32_t previous_event;
+};
+
+class ProfilingWithShaderHelper : public ProfilingHelper {
+ public:
+ ProfilingWithShaderHelper(ProfilingState *state, ProfilingEvent event)
+ : ProfilingHelper(state, event)
{
- state->shader = shader;
- if (state->active) {
- assert(shader < state->shader_hits.size());
- state->shader_hits[shader]++;
- }
}
- inline void set_object(int object)
+ ~ProfilingWithShaderHelper()
{
- state->object = object;
- if (state->active) {
- assert(object < state->object_hits.size());
- state->object_hits[object]++;
- }
+ state->object = -1;
+ state->shader = -1;
}
- ~ProfilingHelper()
+ inline void set_shader(int object, int shader)
{
- state->event = previous_event;
+ if (state->active) {
+ state->shader = shader;
+ state->object = object;
+
+ if (shader >= 0) {
+ assert(shader < state->shader_hits.size());
+ state->shader_hits[shader]++;
+ }
+
+ if (object >= 0) {
+ assert(object < state->object_hits.size());
+ state->object_hits[object]++;
+ }
+ }
}
-
- private:
- ProfilingState *state;
- uint32_t previous_event;
};
CCL_NAMESPACE_END
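
The restructured helpers are scoped RAII objects: ProfilingHelper restores the previous event on destruction, and ProfilingWithShaderHelper additionally records object/shader hits and clears them when it goes out of scope. A cut-down sketch with a minimal stand-in ProfilingState:

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

enum ProfilingEvent : uint32_t { PROFILING_UNKNOWN, PROFILING_SHADE_SURFACE_EVAL };

/* Minimal stand-in for the real ProfilingState. */
struct ProfilingState {
  uint32_t event = PROFILING_UNKNOWN;
  int shader = -1;
  int object = -1;
  bool active = true;
  std::vector<uint32_t> shader_hits = std::vector<uint32_t>(8, 0);
  std::vector<uint32_t> object_hits = std::vector<uint32_t>(8, 0);
};

class ProfilingHelper {
 public:
  ProfilingHelper(ProfilingState *state, ProfilingEvent event) : state(state)
  {
    previous_event = state->event;
    state->event = event;
  }
  ~ProfilingHelper()
  {
    state->event = previous_event; /* restore on scope exit */
  }

 protected:
  ProfilingState *state;
  uint32_t previous_event;
};

class ProfilingWithShaderHelper : public ProfilingHelper {
 public:
  using ProfilingHelper::ProfilingHelper;
  ~ProfilingWithShaderHelper()
  {
    state->object = -1;
    state->shader = -1;
  }
  void set_shader(int object, int shader)
  {
    if (state->active) {
      state->shader = shader;
      state->object = object;
      if (shader >= 0) {
        state->shader_hits[shader]++;
      }
      if (object >= 0) {
        state->object_hits[object]++;
      }
    }
  }
};

int main()
{
  ProfilingState state;
  {
    ProfilingWithShaderHelper profiling(&state, PROFILING_SHADE_SURFACE_EVAL);
    profiling.set_shader(/*object*/ 2, /*shader*/ 3);
    assert(state.event == PROFILING_SHADE_SURFACE_EVAL);
  }
  /* Destructors restored the previous event and cleared object/shader. */
  assert(state.event == PROFILING_UNKNOWN);
  assert(state.shader == -1 && state.object == -1);
  std::printf("shader 3 hits: %u\n", (unsigned)state.shader_hits[3]);
  return 0;
}
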
diff --git a/intern/cycles/util/util_progress.h b/intern/cycles/util/util_progress.h
index 26534a29dfe..dca8d3d0ab5 100644
--- a/intern/cycles/util/util_progress.h
+++ b/intern/cycles/util/util_progress.h
@@ -46,7 +46,6 @@ class Progress {
substatus = "";
sync_status = "";
sync_substatus = "";
- kernel_status = "";
update_cb = function_null;
cancel = false;
cancel_message = "";
@@ -87,7 +86,6 @@ class Progress {
substatus = "";
sync_status = "";
sync_substatus = "";
- kernel_status = "";
cancel = false;
cancel_message = "";
error = false;
@@ -316,24 +314,6 @@ class Progress {
}
}
- /* kernel status */
-
- void set_kernel_status(const string &kernel_status_)
- {
- {
- thread_scoped_lock lock(progress_mutex);
- kernel_status = kernel_status_;
- }
-
- set_update();
- }
-
- void get_kernel_status(string &kernel_status_)
- {
- thread_scoped_lock lock(progress_mutex);
- kernel_status_ = kernel_status;
- }
-
/* callback */
void set_update()
@@ -378,8 +358,6 @@ class Progress {
string sync_status;
string sync_substatus;
- string kernel_status;
-
volatile bool cancel;
string cancel_message;
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index 8e8caa98a1b..b4a153c329f 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -61,14 +61,14 @@ static struct TrueTy {
{
return true;
}
-} True ccl_maybe_unused;
+} True ccl_attr_maybe_unused;
static struct FalseTy {
__forceinline operator bool() const
{
return false;
}
-} False ccl_maybe_unused;
+} False ccl_attr_maybe_unused;
static struct ZeroTy {
__forceinline operator float() const
@@ -79,7 +79,7 @@ static struct ZeroTy {
{
return 0;
}
-} zero ccl_maybe_unused;
+} zero ccl_attr_maybe_unused;
static struct OneTy {
__forceinline operator float() const
@@ -90,7 +90,7 @@ static struct OneTy {
{
return 1;
}
-} one ccl_maybe_unused;
+} one ccl_attr_maybe_unused;
static struct NegInfTy {
__forceinline operator float() const
@@ -101,7 +101,7 @@ static struct NegInfTy {
{
return std::numeric_limits<int>::min();
}
-} neg_inf ccl_maybe_unused;
+} neg_inf ccl_attr_maybe_unused;
static struct PosInfTy {
__forceinline operator float() const
@@ -112,10 +112,10 @@ static struct PosInfTy {
{
return std::numeric_limits<int>::max();
}
-} inf ccl_maybe_unused, pos_inf ccl_maybe_unused;
+} inf ccl_attr_maybe_unused, pos_inf ccl_attr_maybe_unused;
static struct StepTy {
-} step ccl_maybe_unused;
+} step ccl_attr_maybe_unused;
#endif
diff --git a/intern/cycles/util/util_static_assert.h b/intern/cycles/util/util_static_assert.h
index d809f2e06d7..7df52d462b7 100644
--- a/intern/cycles/util/util_static_assert.h
+++ b/intern/cycles/util/util_static_assert.h
@@ -24,9 +24,9 @@
CCL_NAMESPACE_BEGIN
-#if defined(__KERNEL_OPENCL__) || defined(CYCLES_CUBIN_CC)
+#if defined(CYCLES_CUBIN_CC)
# define static_assert(statement, message)
-#endif /* __KERNEL_OPENCL__ */
+#endif
#define static_assert_align(st, align) \
static_assert((sizeof(st) % (align) == 0), "Structure must be strictly aligned") // NOLINT
diff --git a/intern/cycles/util/util_string.cpp b/intern/cycles/util/util_string.cpp
index 4dfebf14923..9c0b2ca50bb 100644
--- a/intern/cycles/util/util_string.cpp
+++ b/intern/cycles/util/util_string.cpp
@@ -17,6 +17,9 @@
#include <stdarg.h>
#include <stdio.h>
+#include <algorithm>
+#include <cctype>
+
#include "util/util_foreach.h"
#include "util/util_string.h"
#include "util/util_windows.h"
@@ -107,24 +110,26 @@ void string_split(vector<string> &tokens,
}
}
-bool string_startswith(const string &s, const char *start)
+bool string_startswith(const string_view s, const string_view start)
{
- size_t len = strlen(start);
+ const size_t len = start.size();
- if (len > s.size())
- return 0;
- else
- return strncmp(s.c_str(), start, len) == 0;
+ if (len > s.size()) {
+ return false;
+ }
+
+ return strncmp(s.c_str(), start.data(), len) == 0;
}
-bool string_endswith(const string &s, const string &end)
+bool string_endswith(const string_view s, const string_view end)
{
- size_t len = end.length();
+ const size_t len = end.size();
- if (len > s.size())
- return 0;
- else
- return s.compare(s.length() - len, len, end) == 0;
+ if (len > s.size()) {
+ return false;
+ }
+
+ return strncmp(s.c_str() + s.size() - len, end.data(), len) == 0;
}
string string_strip(const string &s)
@@ -172,6 +177,13 @@ string to_string(const char *str)
return string(str);
}
+string string_to_lower(const string &s)
+{
+ string r = s;
+ std::transform(r.begin(), r.end(), r.begin(), [](char c) { return std::tolower(c); });
+ return r;
+}
+
/* Wide char strings helpers for Windows. */
#ifdef _WIN32
diff --git a/intern/cycles/util/util_string.h b/intern/cycles/util/util_string.h
index f2272819b2f..55462cfd8b8 100644
--- a/intern/cycles/util/util_string.h
+++ b/intern/cycles/util/util_string.h
@@ -21,6 +21,11 @@
#include <string.h>
#include <string>
+/* Use string view implementation from OIIO.
+ * Ideally, need to switch to `std::string_view`, but this first requires getting rid of using
+ * namespace OIIO as it causes symbol collision. */
+#include <OpenImageIO/string_view.h>
+
#include "util/util_vector.h"
CCL_NAMESPACE_BEGIN
@@ -31,6 +36,8 @@ using std::string;
using std::stringstream;
using std::to_string;
+using OIIO::string_view;
+
#ifdef __GNUC__
# define PRINTF_ATTRIBUTE __attribute__((format(printf, 1, 2)))
#else
@@ -45,12 +52,13 @@ void string_split(vector<string> &tokens,
const string &separators = "\t ",
bool skip_empty_tokens = true);
void string_replace(string &haystack, const string &needle, const string &other);
-bool string_startswith(const string &s, const char *start);
-bool string_endswith(const string &s, const string &end);
+bool string_startswith(string_view s, string_view start);
+bool string_endswith(string_view s, string_view end);
string string_strip(const string &s);
string string_remove_trademark(const string &s);
string string_from_bool(const bool var);
string to_string(const char *str);
+string string_to_lower(const string &s);
/* Wide char strings are only used on Windows to deal with non-ASCII
* characters in file names and such. No reason to use such strings
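
string_startswith()/string_endswith() now take views, and string_to_lower() is new. A self-contained sketch of the same logic using std::string_view instead of OIIO::string_view (memcmp replaces strncmp because a view need not be null-terminated); requires C++17:

#include <algorithm>
#include <cassert>
#include <cctype>
#include <cstring>
#include <string>
#include <string_view>

static bool string_startswith(std::string_view s, std::string_view start)
{
  const size_t len = start.size();
  if (len > s.size()) {
    return false;
  }
  return std::memcmp(s.data(), start.data(), len) == 0;
}

static bool string_endswith(std::string_view s, std::string_view end)
{
  const size_t len = end.size();
  if (len > s.size()) {
    return false;
  }
  return std::memcmp(s.data() + s.size() - len, end.data(), len) == 0;
}

static std::string string_to_lower(const std::string &s)
{
  std::string r = s;
  std::transform(
      r.begin(), r.end(), r.begin(), [](unsigned char c) { return (char)std::tolower(c); });
  return r;
}

int main()
{
  assert(string_startswith("kernel_shader_eval.h", "kernel_"));
  assert(string_endswith("kernel_shader_eval.h", ".h"));
  assert(string_to_lower("OptiX") == "optix");
  return 0;
}
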
diff --git a/intern/cycles/util/util_system.cpp b/intern/cycles/util/util_system.cpp
index b010881058b..be8c2fb505a 100644
--- a/intern/cycles/util/util_system.cpp
+++ b/intern/cycles/util/util_system.cpp
@@ -403,4 +403,13 @@ size_t system_physical_ram()
#endif
}
+uint64_t system_self_process_id()
+{
+#ifdef _WIN32
+ return GetCurrentProcessId();
+#else
+ return getpid();
+#endif
+}
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_system.h b/intern/cycles/util/util_system.h
index c4db8b74339..a1797e6ca44 100644
--- a/intern/cycles/util/util_system.h
+++ b/intern/cycles/util/util_system.h
@@ -65,6 +65,9 @@ size_t system_physical_ram();
/* Start a new process of the current application with the given arguments. */
bool system_call_self(const vector<string> &args);
+/* Get identifier of the currently running process. */
+uint64_t system_self_process_id();
+
CCL_NAMESPACE_END
#endif /* __UTIL_SYSTEM_H__ */
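
system_self_process_id() wraps GetCurrentProcessId()/getpid(). A minimal sketch plus a typical use, building a per-process file name (the file name itself is illustrative, not taken from the patch):

#include <cstdint>
#include <cstdio>
#include <string>

#ifdef _WIN32
#  include <windows.h>
#else
#  include <unistd.h>
#endif

static uint64_t system_self_process_id()
{
#ifdef _WIN32
  return GetCurrentProcessId();
#else
  return (uint64_t)getpid();
#endif
}

int main()
{
  /* A per-process name avoids collisions between concurrent instances. */
  const std::string name = "cycles_" + std::to_string(system_self_process_id()) + ".lock";
  std::printf("%s\n", name.c_str());
  return 0;
}
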
diff --git a/intern/cycles/util/util_tbb.h b/intern/cycles/util/util_tbb.h
index 73e0f92d19c..8f84377ac8c 100644
--- a/intern/cycles/util/util_tbb.h
+++ b/intern/cycles/util/util_tbb.h
@@ -23,6 +23,7 @@
#include <tbb/enumerable_thread_specific.h>
#include <tbb/parallel_for.h>
+#include <tbb/parallel_for_each.h>
#include <tbb/task_arena.h>
#include <tbb/task_group.h>
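
The extra include pulls in tbb::parallel_for_each, which applies a callable to every element of a range on the TBB worker pool. A minimal usage sketch (requires linking against TBB):

#include <tbb/parallel_for_each.h>

#include <atomic>
#include <cstdio>
#include <vector>

int main()
{
  std::vector<int> items = {1, 2, 3, 4, 5, 6, 7, 8};
  std::atomic<int> sum(0);

  tbb::parallel_for_each(items.begin(), items.end(), [&sum](int value) {
    sum += value; /* each element is processed exactly once, possibly in parallel */
  });

  std::printf("sum: %d\n", sum.load());
  return 0;
}
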
diff --git a/intern/cycles/util/util_texture.h b/intern/cycles/util/util_texture.h
index 71bf9c65911..4de66bf5f46 100644
--- a/intern/cycles/util/util_texture.h
+++ b/intern/cycles/util/util_texture.h
@@ -85,8 +85,6 @@ typedef struct TextureInfo {
uint64_t data;
/* Data Type */
uint data_type;
- /* Buffer number for OpenCL. */
- uint cl_buffer;
/* Interpolation and extension type. */
uint interpolation, extension;
/* Dimensions. */
diff --git a/intern/cycles/util/util_transform.h b/intern/cycles/util/util_transform.h
index f79eac4cbcf..e9cd3b0b483 100644
--- a/intern/cycles/util/util_transform.h
+++ b/intern/cycles/util/util_transform.h
@@ -498,36 +498,12 @@ Transform transform_from_viewplane(BoundBox2D &viewplane);
#endif
-/* TODO(sergey): This is only for until we've got OpenCL 2.0
- * on all devices we consider supported. It'll be replaced with
- * generic address space.
- */
+/* TODO: This can be removed when we know if no devices will require explicit
+ * address space qualifiers for this case. */
-#ifdef __KERNEL_OPENCL__
-
-# define OPENCL_TRANSFORM_ADDRSPACE_GLUE(a, b) a##b
-# define OPENCL_TRANSFORM_ADDRSPACE_DECLARE(function) \
- ccl_device_inline float3 OPENCL_TRANSFORM_ADDRSPACE_GLUE(function, _addrspace)( \
- ccl_addr_space const Transform *t, const float3 a) \
- { \
- Transform private_tfm = *t; \
- return function(&private_tfm, a); \
- }
-
-OPENCL_TRANSFORM_ADDRSPACE_DECLARE(transform_point)
-OPENCL_TRANSFORM_ADDRSPACE_DECLARE(transform_direction)
-OPENCL_TRANSFORM_ADDRSPACE_DECLARE(transform_direction_transposed)
-
-# undef OPENCL_TRANSFORM_ADDRSPACE_DECLARE
-# undef OPENCL_TRANSFORM_ADDRSPACE_GLUE
-# define transform_point_auto transform_point_addrspace
-# define transform_direction_auto transform_direction_addrspace
-# define transform_direction_transposed_auto transform_direction_transposed_addrspace
-#else
-# define transform_point_auto transform_point
-# define transform_direction_auto transform_direction
-# define transform_direction_transposed_auto transform_direction_transposed
-#endif
+#define transform_point_auto transform_point
+#define transform_direction_auto transform_direction
+#define transform_direction_transposed_auto transform_direction_transposed
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index 87358877e3c..442c32b3a3d 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -17,9 +17,7 @@
#ifndef __UTIL_TYPES_H__
#define __UTIL_TYPES_H__
-#ifndef __KERNEL_OPENCL__
-# include <stdlib.h>
-#endif
+#include <stdlib.h>
/* Standard Integer Types */
@@ -44,18 +42,12 @@ CCL_NAMESPACE_BEGIN
/* Shorter Unsigned Names */
-#ifndef __KERNEL_OPENCL__
typedef unsigned char uchar;
typedef unsigned int uint;
typedef unsigned short ushort;
-#endif
/* Fixed Bits Types */
-#ifdef __KERNEL_OPENCL__
-typedef unsigned long uint64_t;
-#endif
-
#ifndef __KERNEL_GPU__
/* Generic Memory Pointer */
diff --git a/intern/cycles/util/util_unique_ptr.h b/intern/cycles/util/util_unique_ptr.h
index 3aaaf083eff..3181eafd43d 100644
--- a/intern/cycles/util/util_unique_ptr.h
+++ b/intern/cycles/util/util_unique_ptr.h
@@ -21,6 +21,7 @@
CCL_NAMESPACE_BEGIN
+using std::make_unique;
using std::unique_ptr;
CCL_NAMESPACE_END
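
Pulling std::make_unique into the namespace alongside unique_ptr lets owning pointers be created without a bare new. A trivial sketch, requiring C++14 (ExampleParams is a placeholder type, not an actual Cycles class):

#include <cstdio>
#include <memory>

struct ExampleParams {
  bool use_denoising = true;
};

int main()
{
  /* make_unique value-initializes the object and returns sole ownership. */
  std::unique_ptr<ExampleParams> params = std::make_unique<ExampleParams>();
  std::printf("use_denoising: %d\n", params->use_denoising ? 1 : 0);
  return 0;
}
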